In [0]:
from pyspark.sql import SparkSession
file_path = "/FileStore/tables/retail_sales_dataset.csv"
retail_df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(file_path)
retail_df.show()



+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|Transaction ID|      Date|Customer ID|Gender|Age|Product Category|Quantity|Price per Unit|Total Amount|
+--------------+----------+-----------+------+---+----------------+--------+--------------+------------+
|             1|2023-11-24|    CUST001|  Male| 34|          Beauty|       3|            50|         150|
|             2|2023-02-27|    CUST002|Female| 26|        Clothing|       2|           500|        1000|
|             3|2023-01-13|    CUST003|  Male| 50|     Electronics|       1|            30|          30|
|             4|2023-05-21|    CUST004|  Male| 37|        Clothing|       1|           500|         500|
|             5|2023-05-06|    CUST005|  Male| 30|          Beauty|       2|            50|         100|
|             6|2023-04-25|    CUST006|Female| 45|          Beauty|       1|            30|          30|
|             7|2023-03-13|    CUST007|  Male| 46|     

In [0]:
retail_df.createOrReplaceTempView("retail_sales")


In [0]:
%sql
SELECT 'Product Category', SUM('Total Amount') AS Total_Sales
FROM retail_sales
GROUP BY 'Product Category'
ORDER BY Total_Sales DESC;

Product Category,Total_Sales
Electronics,156905
Clothing,155580
Beauty,143515


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 'Customer ID`, SUM(`Total Amount`) AS Total_Spent
FROM retail_sales
GROUP BY `Customer ID`
ORDER BY 'Total_Spent' DESC
LIMIT 5;

Customer ID,Total_Spent
CUST412,2000
CUST072,2000
CUST875,2000
CUST946,2000
CUST093,2000


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 'Gender', AVG('Total Amount`) AS Avg_Sales
FROM retail_sales
GROUP BY 'Gender';

Gender,Avg_Sales
Gender,456.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT CASE 
           WHEN Age < 20 THEN 'Under 20'
                      WHEN Age BETWEEN 20 AND 40 THEN '20-40'
                                 WHEN Age BETWEEN 41 AND 60 THEN '41-60'
                                            ELSE '60+'
                                                   END AS Age_Group,
                                                          AVG(Quantity) AS Avg_Quantity
                                                          FROM retail_sales
                                                          GROUP BY Age_Group;
                                                          

Age_Group,Avg_Quantity
20-40,2.535377358490566
60+,2.419354838709677
Under 20,2.642857142857143
41-60,2.5011337868480727


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 'Gender', SUM('Total Amount`) AS Total_Sales,
(SUM(`Total Amount`) * 100 / (SELECT SUM(`Total Amount`) FROM retail_sales)) AS Contribution_Percentage
FROM retail_sales
GROUP BY 'Gender'
ORDER BY Contribution_Percentage DESC;

Gender,Total_Sales,Contribution_Percentage
Gender,456000,100.0


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 'Product Category`, Quantity
FROM retail_sales
LIMIT 10;


Product Category,Quantity
Beauty,3
Clothing,2
Electronics,1
Clothing,1
Beauty,2
Beauty,1
Clothing,2
Electronics,4
Electronics,2
Clothing,4


Databricks visualization. Run in Databricks to view.

In [0]:
%sql
SELECT 'Product Category`, SUM(Quantity) AS Total_Quantity
FROM retail_sales
WHERE Quantity IS NOT NULL
GROUP BY `Product Category`
ORDER BY Total_Quantity DESC
LIMIT 1;


Product Category,Total_Quantity
Clothing,894


Databricks visualization. Run in Databricks to view.