In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# /FileStore/tables/food_sales_data.csv
# /FileStore/tables/food_menu_data.csv

In [0]:
# create a spark session so we can work with spark functionality
sales_spark = SparkSession.builder.appName("Analysing Food sales data").getOrCreate()
print(sales_spark)

<pyspark.sql.session.SparkSession object at 0x7f7828ae7d30>


In [0]:
# creating a schema for our dataset
columns = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("customer_id", StringType(), True),
    StructField("order_date", DateType(), True),
    StructField("location", StringType(), True),
    StructField("source_order", StringType(), True)
])

# creating a dataFrame using created schema
sales_df = sales_spark.read.format("csv")\
    .option("inferschema", True)\
    .option("header", True)\
    .schema(columns)\
    .load("/FileStore/tables/food_sales_data.csv")

sales_df.show(10)
sales_df.printSchema()

+----------+-----------+----------+--------+------------+
|product_id|customer_id|order_date|location|source_order|
+----------+-----------+----------+--------+------------+
|         2|          A|2022-01-01|   India|      Swiggy|
|         2|          A|2023-01-07|   India|      Swiggy|
|         3|          A|2023-01-10|   India|  Restaurant|
|         3|          A|2022-01-11|   India|      Swiggy|
|         3|          A|2023-01-11|   India|  Restaurant|
|         2|          B|2022-02-01|   India|      Swiggy|
|         2|          B|2023-01-02|   India|      Swiggy|
|         1|          B|2023-01-04|   India|  Restaurant|
|         1|          B|2023-02-11|   India|      Swiggy|
|         3|          B|2023-01-16|   India|      zomato|
+----------+-----------+----------+--------+------------+
only showing top 10 rows

root
 |-- product_id: integer (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- location: string (nullable

In [0]:
# tranformation - we have to separate month, day, & year from date column
sales_df_T = sales_df.withColumn("order_month", month(col("order_date")))\
    .withColumn("order_year", year(col("order_date")))\
    .withColumn("order_quarter", quarter(col("order_date")))

# display(sales_df_T)
sales_df_T.limit(10).display()
sales_df_T.printSchema()

product_id,customer_id,order_date,location,source_order,order_month,order_year,order_quarter
2,A,2022-01-01,India,Swiggy,1,2022,1
2,A,2023-01-07,India,Swiggy,1,2023,1
3,A,2023-01-10,India,Restaurant,1,2023,1
3,A,2022-01-11,India,Swiggy,1,2022,1
3,A,2023-01-11,India,Restaurant,1,2023,1
2,B,2022-02-01,India,Swiggy,2,2022,1
2,B,2023-01-02,India,Swiggy,1,2023,1
1,B,2023-01-04,India,Restaurant,1,2023,1
1,B,2023-02-11,India,Swiggy,2,2023,1
3,B,2023-01-16,India,zomato,1,2023,1


root
 |-- product_id: integer (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- location: string (nullable = true)
 |-- source_order: string (nullable = true)
 |-- order_month: integer (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_quarter: integer (nullable = true)



In [0]:
#  Now we are creating a dataFrame for menu_data
# creating a schema for our dataset
columns1 = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("price", StringType(), True)
])

# creating a dataFrame using created schema
menu_df = sales_spark.read.format("csv")\
    .option("inferschema", True)\
    .option("header", True)\
    .schema(columns1)\
    .load("/FileStore/tables/food_menu_data.csv")

menu_df.show()
menu_df.printSchema()

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         2|     Chowmin|  150|
|         3|    sandwich|  120|
|         4|        Dosa|  110|
|         5|     Biryani|   80|
|         6|       Pasta|  180|
+----------+------------+-----+

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: string (nullable = true)



# Getting insights from above sales and menu datasets
### we have dataframe for sales -> sales_df_T
### for menu -> menu_df

In [0]:
# before staring with SparkSQL we need to create a temp view
sales_df_T.createOrReplaceTempView("sales_data")
menu_df.createOrReplaceTempView("menu_data")

In [0]:
%sql select * from sales_data limit 5;

product_id,customer_id,order_date,location,source_order,order_month,order_year,order_quarter
2,A,2022-01-01,India,Swiggy,1,2022,1
2,A,2023-01-07,India,Swiggy,1,2023,1
3,A,2023-01-10,India,Restaurant,1,2023,1
3,A,2022-01-11,India,Swiggy,1,2022,1
3,A,2023-01-11,India,Restaurant,1,2023,1


In [0]:
%sql select * from menu_data;

product_id,product_name,price
2,Chowmin,150
3,sandwich,120
4,Dosa,110
5,Biryani,80
6,Pasta,180


##Total amount spent by each customer

In [0]:
%sql    select customer_id as customer, sum(price) as amount_spend from sales_data as s
        join menu_data as m
        where s.product_id == m.product_id
        group by customer
        order by customer;

customer,amount_spend
A,3960.0
B,3240.0
C,1800.0
D,1200.0
E,2040.0


Databricks visualization. Run in Databricks to view.

In [0]:
# with dataframe functionality
total_amount_spent = sales_df_T.join(menu_df, "product_id").groupBy("customer_id").agg({"price":"sum"})\
                        .orderBy("customer_id")
display(total_amount_spent)

customer_id,sum(price)
A,3960.0
B,3240.0
C,1800.0
D,1200.0
E,2040.0


##Total amount spent on each food category

In [0]:
%sql      select product_name as category, sum(price) from sales_data as s
          join menu_data as m
          where s.product_id == m.product_id
          group by category
          order by category desc;

category,sum(price)
sandwich,5760.0
Pasta,1080.0
Dosa,1320.0
Chowmin,3600.0
Biryani,480.0


In [0]:
# with dataframe functionality
total_amount_spent_product = sales_df_T.join(menu_df, "product_id").groupBy("product_name").agg({"price":"sum"})\
                        .orderBy("product_name")
display(total_amount_spent_product)

product_name,sum(price)
Biryani,480.0
Chowmin,3600.0
Dosa,1320.0
Pasta,1080.0
sandwich,5760.0


## Total amount of sale in each month

In [0]:
%sql select count(distinct order_month) from sales_data; 

count(DISTINCT order_month)
7


In [0]:
%sql        select order_month as month, sum(price) from sales_data as s
            join menu_data as m
            where s.product_id == m.product_id
            group by month
            order by month;

month,sum(price)
1,2460.0
2,2430.0
3,810.0
5,2460.0
6,2460.0
7,810.0
11,810.0


In [0]:
total_amount_spent = sales_df_T.join(menu_df, "product_id").groupBy("order_month").agg({"price":"sum"})\
                        .orderBy("order_month")
display(total_amount_spent)

order_month,sum(price)
1,2460.0
2,2430.0
3,810.0
5,2460.0
6,2460.0
7,810.0
11,810.0


## Total amount of sale by year

In [0]:
%sql            select order_year as year, sum(price) as total_sale from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id
                group by order_year
                order by order_year;

year,total_sale
2022,4350.0
2023,7890.0


## Total amount of sale by quarter

In [0]:
%sql            select order_quarter as quarter, sum(price) as total_sale from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id
                group by order_quarter
                order by order_quarter;

quarter,total_sale
1,5700.0
2,4920.0
3,810.0
4,810.0


## How many times each product get purchased

In [0]:
%sql            select m.product_name as product, count(s.product_id) as purchased_count from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id
                group by m.product_name
                order by count(s.product_id) desc;

product,purchased_count
sandwich,48
Chowmin,24
Dosa,12
Pasta,6
Biryani,6


Databricks visualization. Run in Databricks to view.

## Total sales by each country

In [0]:
%sql            select s.location as country, sum(price) as total_sales from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id
                group by country
                order by sum(price) desc;

country,total_sales
UK,6120.0
India,3960.0
USA,2160.0


Databricks visualization. Run in Databricks to view.

## Total sales by each source of order

In [0]:
%sql            select s.source_order as source_order, sum(price) as total_sales from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id
                group by source_order
                order by sum(price) desc;

source_order,total_sales
zomato,4920.0
Swiggy,4830.0
Restaurant,2490.0


Databricks visualization. Run in Databricks to view.

## frequecy of customer visited to store OR order food

In [0]:
%sql            select s.customer_id as customer, count(distinct order_date) as frequecy_count from sales_data as s
                join menu_data as m
                where s.product_id == m.product_id and source_order == "Restaurant"
                group by customer
                order by frequecy_count desc;

customer,frequecy_count
A,6
E,5
C,3
D,1


Databricks visualization. Run in Databricks to view.