# Importing libraries

In [1]:
import os
import sys
import glob
# Configure the process environment in a single call.
# Both PySpark interpreter variables are pointed at the interpreter running this kernel.
# NOTE(review): PYARROW_IGNORE_TIMEZONE is presumably set for pyspark.pandas — confirm.
os.environ.update(
    {
        "PYARROW_IGNORE_TIMEZONE": "1",
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
from pyspark.sql import SparkSession,functions, Window
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pyspark.pandas as ps


# Starting a Spark Session Named PizzaRestaurant

In [3]:
# Build (or reuse, via getOrCreate) the notebook's SparkSession on a single local core.
# NOTE(review): "spark.some.config.option" looks like a documentation placeholder —
# confirm whether it (and Hive support) is actually needed.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("PizzaRestaurant")
    .config("spark.some.config.option", "config-value")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Bare expression as the cell's last line, so the notebook renders the session object.
spark

# Browsing the Pizza Place Sales .csv files

In [5]:
# Folder holding the Pizza Place Sales CSV exports.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
path = r"D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales"

# Report whether the data folder is reachable before listing its contents.
if os.path.exists(path):
    print(f'The file {path} exists')
else:
    print(f"The file {path} does not exist ")

# os.path.join supplies the separator, so the pattern no longer depends on the
# invalid "\*" escape sequence that a plain (non-raw) '\*.csv' literal contains.
filenames = glob.glob(os.path.join(path, '*.csv'))

# List every CSV found in the folder (nothing is printed if the folder is missing).
for file in filenames:
    print("\nFile names:", file)


The file D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales exists

File names: D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales\orders.csv

File names: D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales\order_details.csv

File names: D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales\pizzas.csv

File names: D:\My commercial projects\PySpark\Project 4 PySpark\Pizza Place Sales\pizza_types.csv


# Reading csv files into Spark DataFrames

In [6]:
# Build every CSV location from the data folder `path` defined in the browsing cell,
# so the folder is written only once and no backslash sits inside a non-raw string
# literal (sequences such as "\M", "\P" or "\o" are invalid escape sequences).
PathOrders = os.path.join(path, 'orders.csv')
PathOrderDetails = os.path.join(path, 'order_details.csv')
PathPizzas = os.path.join(path, 'pizzas.csv')
PathPizzaTypes = os.path.join(path, 'pizza_types.csv')

# header=True takes the column names from the first line of each file;
# inferSchema=True lets Spark derive the column types from the data.
# NOTE(review): the orders preview shows `time` inferred as a timestamp carrying a
# 2023-06-05 date that differs from `date` — consider an explicit schema if that matters.
dfPizzaOrders = spark.read.csv(PathOrders, header=True, inferSchema=True)
dfPizzaOrderDetails = spark.read.csv(PathOrderDetails, header=True, inferSchema=True)
dfPizzas = spark.read.csv(PathPizzas, header=True, inferSchema=True)
dfPizzaTypes = spark.read.csv(PathPizzaTypes, header=True, inferSchema=True)


# Data Exploration

In [7]:
# Preview the orders table; truncate=False prints cell values in full.
dfPizzaOrders.show(truncate=False)

+--------+----------+-------------------+
|order_id|date      |time               |
+--------+----------+-------------------+
|1       |2015-01-01|2023-06-05 11:38:36|
|2       |2015-01-01|2023-06-05 11:57:40|
|3       |2015-01-01|2023-06-05 12:12:28|
|4       |2015-01-01|2023-06-05 12:16:31|
|5       |2015-01-01|2023-06-05 12:21:30|
|6       |2015-01-01|2023-06-05 12:29:36|
|7       |2015-01-01|2023-06-05 12:50:37|
|8       |2015-01-01|2023-06-05 12:51:37|
|9       |2015-01-01|2023-06-05 12:52:01|
|10      |2015-01-01|2023-06-05 13:00:15|
|11      |2015-01-01|2023-06-05 13:02:59|
|12      |2015-01-01|2023-06-05 13:04:41|
|13      |2015-01-01|2023-06-05 13:11:55|
|14      |2015-01-01|2023-06-05 13:14:19|
|15      |2015-01-01|2023-06-05 13:33:00|
|16      |2015-01-01|2023-06-05 13:34:07|
|17      |2015-01-01|2023-06-05 13:53:00|
|18      |2015-01-01|2023-06-05 13:57:08|
|19      |2015-01-01|2023-06-05 13:59:09|
|20      |2015-01-01|2023-06-05 14:03:08|
+--------+----------+-------------

In [8]:
# Preview the order line items (one row per pizza_id within an order).
dfPizzaOrderDetails.show(truncate=False)

+----------------+--------+--------------+--------+
|order_details_id|order_id|pizza_id      |quantity|
+----------------+--------+--------------+--------+
|1               |1       |hawaiian_m    |1       |
|2               |2       |classic_dlx_m |1       |
|3               |2       |five_cheese_l |1       |
|4               |2       |ital_supr_l   |1       |
|5               |2       |mexicana_m    |1       |
|6               |2       |thai_ckn_l    |1       |
|7               |3       |ital_supr_m   |1       |
|8               |3       |prsc_argla_l  |1       |
|9               |4       |ital_supr_m   |1       |
|10              |5       |ital_supr_m   |1       |
|11              |6       |bbq_ckn_s     |1       |
|12              |6       |the_greek_s   |1       |
|13              |7       |spinach_supr_s|1       |
|14              |8       |spinach_supr_s|1       |
|15              |9       |classic_dlx_s |1       |
|16              |9       |green_garden_s|1       |
|17         

In [9]:
# Preview the pizza catalogue: pizza_id, its pizza_type_id, size and price.
dfPizzas.show(truncate=False)

+-------------+-------------+----+-----+
|pizza_id     |pizza_type_id|size|price|
+-------------+-------------+----+-----+
|bbq_ckn_s    |bbq_ckn      |S   |12.75|
|bbq_ckn_m    |bbq_ckn      |M   |16.75|
|bbq_ckn_l    |bbq_ckn      |L   |20.75|
|cali_ckn_s   |cali_ckn     |S   |12.75|
|cali_ckn_m   |cali_ckn     |M   |16.75|
|cali_ckn_l   |cali_ckn     |L   |20.75|
|ckn_alfredo_s|ckn_alfredo  |S   |12.75|
|ckn_alfredo_m|ckn_alfredo  |M   |16.75|
|ckn_alfredo_l|ckn_alfredo  |L   |20.75|
|ckn_pesto_s  |ckn_pesto    |S   |12.75|
|ckn_pesto_m  |ckn_pesto    |M   |16.75|
|ckn_pesto_l  |ckn_pesto    |L   |20.75|
|southw_ckn_s |southw_ckn   |S   |12.75|
|southw_ckn_m |southw_ckn   |M   |16.75|
|southw_ckn_l |southw_ckn   |L   |20.75|
|thai_ckn_s   |thai_ckn     |S   |12.75|
|thai_ckn_m   |thai_ckn     |M   |16.75|
|thai_ckn_l   |thai_ckn     |L   |20.75|
|big_meat_s   |big_meat     |S   |12.0 |
|big_meat_m   |big_meat     |M   |16.0 |
+-------------+-------------+----+-----+
only showing top

In [10]:
# Preview the pizza types; truncate=False keeps the long ingredient lists readable.
dfPizzaTypes.show(truncate=False)

+-------------+------------------------------------------+--------+--------------------------------------------------------------------------------------------+
|pizza_type_id|name                                      |category|ingredients                                                                                 |
+-------------+------------------------------------------+--------+--------------------------------------------------------------------------------------------+
|bbq_ckn      |The Barbecue Chicken Pizza                |Chicken |Barbecued Chicken, Red Peppers, Green Peppers, Tomatoes, Red Onions, Barbecue Sauce         |
|cali_ckn     |The California Chicken Pizza              |Chicken |Chicken, Artichoke, Spinach, Garlic, Jalapeno Peppers, Fontina Cheese, Gouda Cheese         |
|ckn_alfredo  |The Chicken Alfredo Pizza                 |Chicken |Chicken, Red Onions, Red Peppers, Mushrooms, Asiago Cheese, Alfredo Sauce                   |
|ckn_pesto    |The Chicken Pesto P

# Combining DataFrames

In [11]:
# Inner-join each order header to its line items on order_id.
# Joining on an expression keeps the order_id column from both sides in the result.
dfPizzaOrdersAll = dfPizzaOrders.join(
    dfPizzaOrderDetails,
    on=dfPizzaOrders["order_id"] == dfPizzaOrderDetails["order_id"],
    how="inner",
)
dfPizzaOrdersAll.show(truncate=False)

+--------+----------+-------------------+----------------+--------+--------------+--------+
|order_id|date      |time               |order_details_id|order_id|pizza_id      |quantity|
+--------+----------+-------------------+----------------+--------+--------------+--------+
|1       |2015-01-01|2023-06-05 11:38:36|1               |1       |hawaiian_m    |1       |
|2       |2015-01-01|2023-06-05 11:57:40|2               |2       |classic_dlx_m |1       |
|2       |2015-01-01|2023-06-05 11:57:40|3               |2       |five_cheese_l |1       |
|2       |2015-01-01|2023-06-05 11:57:40|4               |2       |ital_supr_l   |1       |
|2       |2015-01-01|2023-06-05 11:57:40|5               |2       |mexicana_m    |1       |
|2       |2015-01-01|2023-06-05 11:57:40|6               |2       |thai_ckn_l    |1       |
|3       |2015-01-01|2023-06-05 12:12:28|7               |3       |ital_supr_m   |1       |
|3       |2015-01-01|2023-06-05 12:12:28|8               |3       |prsc_argla_l 

In [12]:
# Keep only the columns needed downstream; both order_id columns and
# order_details_id are dropped here.
dfPizzaOrdersAll = dfPizzaOrdersAll.select(
    ["pizza_id", "quantity", "date", "time"]
)
dfPizzaOrdersAll.show()

+--------------+--------+----------+-------------------+
|      pizza_id|quantity|      date|               time|
+--------------+--------+----------+-------------------+
|    hawaiian_m|       1|2015-01-01|2023-06-05 11:38:36|
| classic_dlx_m|       1|2015-01-01|2023-06-05 11:57:40|
| five_cheese_l|       1|2015-01-01|2023-06-05 11:57:40|
|   ital_supr_l|       1|2015-01-01|2023-06-05 11:57:40|
|    mexicana_m|       1|2015-01-01|2023-06-05 11:57:40|
|    thai_ckn_l|       1|2015-01-01|2023-06-05 11:57:40|
|   ital_supr_m|       1|2015-01-01|2023-06-05 12:12:28|
|  prsc_argla_l|       1|2015-01-01|2023-06-05 12:12:28|
|   ital_supr_m|       1|2015-01-01|2023-06-05 12:16:31|
|   ital_supr_m|       1|2015-01-01|2023-06-05 12:21:30|
|     bbq_ckn_s|       1|2015-01-01|2023-06-05 12:29:36|
|   the_greek_s|       1|2015-01-01|2023-06-05 12:29:36|
|spinach_supr_s|       1|2015-01-01|2023-06-05 12:50:37|
|spinach_supr_s|       1|2015-01-01|2023-06-05 12:51:37|
| classic_dlx_s|       1|2015-0

In [13]:
# Attach each pizza's type details (name, category, ingredients) to its size/price row,
# then keep only the descriptive columns.
dfPizzaAll = (
    dfPizzas
    .join(
        dfPizzaTypes,
        on=dfPizzas["pizza_type_id"] == dfPizzaTypes["pizza_type_id"],
        how="inner",
    )
    .select("pizza_id", "size", "price", "name", "category", "ingredients")
)
dfPizzaAll.show()


+-------------+----+-----+--------------------+--------+--------------------+
|     pizza_id|size|price|                name|category|         ingredients|
+-------------+----+-----+--------------------+--------+--------------------+
|    bbq_ckn_l|   L|20.75|The Barbecue Chic...| Chicken|Barbecued Chicken...|
|    bbq_ckn_m|   M|16.75|The Barbecue Chic...| Chicken|Barbecued Chicken...|
|    bbq_ckn_s|   S|12.75|The Barbecue Chic...| Chicken|Barbecued Chicken...|
|   cali_ckn_l|   L|20.75|The California Ch...| Chicken|Chicken, Artichok...|
|   cali_ckn_m|   M|16.75|The California Ch...| Chicken|Chicken, Artichok...|
|   cali_ckn_s|   S|12.75|The California Ch...| Chicken|Chicken, Artichok...|
|ckn_alfredo_l|   L|20.75|The Chicken Alfre...| Chicken|Chicken, Red Onio...|
|ckn_alfredo_m|   M|16.75|The Chicken Alfre...| Chicken|Chicken, Red Onio...|
|ckn_alfredo_s|   S|12.75|The Chicken Alfre...| Chicken|Chicken, Red Onio...|
|  ckn_pesto_l|   L|20.75|The Chicken Pesto...| Chicken|Chicken,

In [14]:
# Left-join the pizza catalogue to the order lines so that catalogue rows with no
# matching order line are kept (their order columns come back as null), then
# expose the columns under capitalised names.
dfPizza = (
    dfPizzaAll
    .join(
        dfPizzaOrdersAll,
        on=dfPizzaAll["pizza_id"] == dfPizzaOrdersAll["pizza_id"],
        how="left",
    )
    .select(
        *[
            col(source_name).alias(display_name)
            for source_name, display_name in [
                ("name", "Name"),
                ("category", "Category"),
                ("ingredients", "Ingredients"),
                ("size", "Size"),
                ("price", "Price"),
                ("quantity", "Quantity"),
                ("date", "Date"),
                ("time", "Time"),
            ]
        ]
    )
)

dfPizza.show(truncate=False)


+--------------------------+--------+-----------------------------------------------------------------------------------+----+-----+--------+----------+-------------------+
|Name                      |Category|Ingredients                                                                        |Size|Price|Quantity|Date      |Time               |
+--------------------------+--------+-----------------------------------------------------------------------------------+----+-----+--------+----------+-------------------+
|The Barbecue Chicken Pizza|Chicken |Barbecued Chicken, Red Peppers, Green Peppers, Tomatoes, Red Onions, Barbecue Sauce|L   |20.75|1       |2015-01-01|2023-06-05 13:02:59|
|The Barbecue Chicken Pizza|Chicken |Barbecued Chicken, Red Peppers, Green Peppers, Tomatoes, Red Onions, Barbecue Sauce|L   |20.75|1       |2015-01-01|2023-06-05 13:53:00|
|The Barbecue Chicken Pizza|Chicken |Barbecued Chicken, Red Peppers, Green Peppers, Tomatoes, Red Onions, Barbecue Sauce|L   |20.75|1  

# Data Cleaning

### Looking for null values

In [15]:
# Count missing values per column for the descriptive and numeric columns.
dfPizzaNull = dfPizza.select("Name", "Category", "Ingredients", "Size", "Price", "Quantity")

# A value counts as missing when it is null; NaN is checked only on float/double
# columns. Applying isnan to string or integer columns depends on an implicit cast
# to double.
# NOTE(review): that implicit cast may be rejected under stricter (ANSI) SQL
# settings — confirm for the Spark version in use.
dfPizzaNull.select(
    [
        count(
            when(
                (col(column_name).isNull() | isnan(col(column_name)))
                if column_type in ("float", "double")
                else col(column_name).isNull(),
                column_name,
            )
        ).alias(column_name)
        for column_name, column_type in dfPizzaNull.dtypes
    ]
).show()

+----+--------+-----------+----+-----+--------+
|Name|Category|Ingredients|Size|Price|Quantity|
+----+--------+-----------+----+-----+--------+
|   0|       0|          0|   0|    0|       5|
+----+--------+-----------+----+-----+--------+



##### There are a few null values in Quantity; they mean that these pizzas were never ordered

In [16]:
# Show the rows whose Quantity is null.
dfPizza.where(col("Quantity").isNull()).show()

+--------------------+--------+--------------------+----+-----+--------+----+----+
|                Name|Category|         Ingredients|Size|Price|Quantity|Date|Time|
+--------------------+--------+--------------------+----+-----+--------+----+----+
|The Four Cheese P...|  Veggie|Ricotta Cheese, G...|   S|11.75|    null|null|null|
|  The Big Meat Pizza| Classic|Bacon, Pepperoni,...|   L| 20.5|    null|null|null|
|The Five Cheese P...|  Veggie|Mozzarella Cheese...|   S| 12.5|    null|null|null|
|  The Big Meat Pizza| Classic|Bacon, Pepperoni,...|   M| 16.0|    null|null|null|
|The Five Cheese P...|  Veggie|Mozzarella Cheese...|   M| 15.5|    null|null|null|
+--------------------+--------+--------------------+----+-----+--------+----+----+



### Dealing with duplicate values

In [17]:
# Total number of rows in the combined frame (baseline for the duplicate checks).
print("Counting rows {}".format(dfPizza.count()))

Counting rows 48625


In [18]:
# Group on every column and keep only the groups that occur more than once,
# i.e. rows that are exact duplicates of each other.
dfPizzaDuplicates = (
    dfPizza
    .groupBy(*dfPizza.columns)
    .count()
    .where(col("count") > 1)
)
dfPizzaDuplicates.show()

+----+--------+-----------+----+-----+--------+----+----+-----+
|Name|Category|Ingredients|Size|Price|Quantity|Date|Time|count|
+----+--------+-----------+----+-----+--------+----+----+-----+
+----+--------+-----------+----+-----+--------+----+----+-----+



### Browsing for distinct values in each column

In [19]:
# Number of distinct rows; if it equals the total row count, no row is an exact duplicate.
print("Counting distinct rows {}".format(dfPizza.distinct().count()))

Counting distinct rows 48625


In [20]:
# One aggregate per column: the number of distinct values it holds.
dfPizza.agg(
    *[
        countDistinct(col(column_name)).alias(column_name)
        for column_name in dfPizza.columns
    ]
).show()

+----+--------+-----------+----+-----+--------+----+-----+
|Name|Category|Ingredients|Size|Price|Quantity|Date| Time|
+----+--------+-----------+----+-----+--------+----+-----+
|  32|       4|         32|   5|   27|       4| 358|16382|
+----+--------+-----------+----+-----+--------+----+-----+



### Columns

In [21]:
# Report how many columns the combined frame has and what they are called.
print(
    "There are {} columns in the DataFrame and their names are {}".format(
        len(dfPizza.columns), dfPizza.columns
    )
)

There are 8 columns in the DataFrame and their names are ['Name', 'Category', 'Ingredients', 'Size', 'Price', 'Quantity', 'Date', 'Time']


### Datatypes

In [22]:
# Print each column's name, data type and nullability.
dfPizza.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Ingredients: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)



In [23]:
# Replace Price with a fixed-point decimal (18 digits, 2 after the point).
dfPizza = dfPizza.withColumn(
    "Price",
    col("Price").cast(DecimalType(precision=18, scale=2)),
)

### Summary

In [24]:
# Summary statistics (count, mean, stddev, min, max) for the Price column.
dfPizza.describe("Price").show()

+-------+-----------------+
|summary|            Price|
+-------+-----------------+
|  count|            48625|
|   mean|        16.494004|
| stddev|3.621761197368334|
|    min|             9.75|
|    max|            35.95|
+-------+-----------------+



In [25]:
# List (column, type) pairs to confirm the Price cast to decimal(18,2) took effect.
dfPizza.dtypes

[('Name', 'string'),
 ('Category', 'string'),
 ('Ingredients', 'string'),
 ('Size', 'string'),
 ('Price', 'decimal(18,2)'),
 ('Quantity', 'int'),
 ('Date', 'date'),
 ('Time', 'timestamp')]

In [26]:
# Report the frame's shape as (row count, column count).
print(
    "Shape of DataFrame: {}, {} ".format(
        dfPizza.count(), len(dfPizza.columns)
    )
)

Shape of DataFrame: 48625, 8 


# Data Analysis

In [27]:
# TODO: things to consider in the next project
# - Browse all the materials I have in order to select what to use.
# - Use a UDF (add an example and remove the digit at the beginning) and analytical functions:
#   https://sparkbyexamples.com/pyspark/pyspark-groupby-explained-with-example/
#   https://sparkbyexamples.com/pyspark/pyspark-window-functions/
#   https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
# - Consider which transformations to use next:
#   - max and min of each borough
#   - using pandas / pivot
# - Use the .doc files as inspiration for running this project:
#   https://pub.towardsai.net/exploratory-data-analysis-eda-using-pyspark-b43e71fcec9f
#   https://dev.to/kinyungu_denis/apache-pyspark-for-data-engineering-3phi