In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below (getOrCreate hands back
# the existing session if one is already active).
spark = (
    SparkSession.builder
    .appName("ProductSalesAnalysis")
    .getOrCreate()
)

# Echo the session object as the cell's output.
spark

LOAD SALES DATA FROM CSV

In [2]:
# Inline sample data set: one header row followed by eight orders.
# Adjacent string literals are concatenated, so the resulting text is a
# newline-separated CSV with no trailing newline.
sales_data = (
    "OrderID,Product,Category,Quantity,UnitPrice,Region\n"
    "1001,Mobile,Electronics,2,15000,North\n"
    "1002,Laptop,Electronics,1,55000,South\n"
    "1003,T-Shirt,Apparel,3,500,East\n"
    "1004,Jeans,Apparel,2,1200,North\n"
    "1005,TV,Electronics,1,40000,West\n"
    "1006,Shoes,Footwear,4,2000,South\n"
    "1007,Watch,Accessories,2,3000,East\n"
    "1008,Headphones,Electronics,3,2500,North"
)

# Persist the sample to disk so Spark can read it back as a CSV file.
with open("Sales.csv", "w") as csv_out:
    csv_out.write(sales_data)

FIRST 5 ROWS

In [6]:
# Load the CSV written above: the first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.options(header=True, inferSchema=True).csv("Sales.csv")

# Preview the first five rows.
df.show(5)

+-------+-------+-----------+--------+---------+------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|
+-------+-------+-----------+--------+---------+------+
|   1001| Mobile|Electronics|       2|    15000| North|
|   1002| Laptop|Electronics|       1|    55000| South|
|   1003|T-Shirt|    Apparel|       3|      500|  East|
|   1004|  Jeans|    Apparel|       2|     1200| North|
|   1005|     TV|Electronics|       1|    40000|  West|
+-------+-------+-----------+--------+---------+------+
only showing top 5 rows



NEW COLUMN TOTAL PRICE

In [8]:
from pyspark.sql.functions import col

# Order value = number of units * price per unit.
total_price = col('Quantity') * col('UnitPrice')

# Attach the computed value as a new 'Total Price' column and display the result.
df = df.withColumn('Total Price', total_price)
df.show()

+-------+----------+-----------+--------+---------+------+-----------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|Total Price|
+-------+----------+-----------+--------+---------+------+-----------+
|   1001|    Mobile|Electronics|       2|    15000| North|      30000|
|   1002|    Laptop|Electronics|       1|    55000| South|      55000|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|       1500|
|   1004|     Jeans|    Apparel|       2|     1200| North|       2400|
|   1005|        TV|Electronics|       1|    40000|  West|      40000|
|   1006|     Shoes|   Footwear|       4|     2000| South|       8000|
|   1007|     Watch|Accessories|       2|     3000|  East|       6000|
|   1008|Headphones|Electronics|       3|     2500| North|       7500|
+-------+----------+-----------+--------+---------+------+-----------+



TOTAL REVENUE PER REGION

In [12]:
# NOTE: this import rebinds the name `sum` to Spark's column aggregate for the
# rest of the notebook; the category-wise revenue cell relies on it as well.
from pyspark.sql.functions import sum

# Add up 'Total Price' within each region.
revenue_by_region = df.groupBy('Region').agg(
    sum('Total Price').alias('Total Revenue')
)
revenue_by_region.show()

+------+-------------+
|Region|Total Revenue|
+------+-------------+
| South|        63000|
|  East|         7500|
|  West|        40000|
| North|        39900|
+------+-------------+



CATEGORY WISE REVENUE

In [14]:
# Revenue per category, largest first.
(
    df.groupBy('Category')
    .agg(sum('Total Price').alias('Total Revenue'))
    .orderBy(col('Total Revenue').desc())
    .show()
)

+-----------+-------------+
|   Category|Total Revenue|
+-----------+-------------+
|Electronics|       132500|
|   Footwear|         8000|
|Accessories|         6000|
|    Apparel|         3900|
+-----------+-------------+



REGION WITH THE HIGHEST NUMBER OF ORDERS

In [17]:
from pyspark.sql.functions import count

# Count the orders in each region and sort so the busiest region comes first.
orders_by_region = (
    df.groupBy('Region')
    .agg(count('OrderID').alias('Total Orders'))
    .orderBy(col('Total Orders').desc())
)

# Only the top row (the region with the most orders) is displayed.
orders_by_region.show(1)

+------+------------+
|Region|Total Orders|
+------+------------+
| North|           3|
+------+------------+
only showing top 1 row



AVERAGE UNIT PRICE PER CATEGORY

In [20]:
# `mean` has to be imported explicitly: with the import commented out, this
# cell raises NameError on a fresh kernel (Restart Kernel -> Run All).
from pyspark.sql.functions import mean

# Average unit price of the orders within each category.
df.groupBy('Category').agg(mean('UnitPrice').alias('Average')).show()

+-----------+-------+
|   Category|Average|
+-----------+-------+
|    Apparel|  850.0|
|Electronics|28125.0|
|   Footwear| 2000.0|
|Accessories| 3000.0|
+-----------+-------+



ORDERS WITH TOTAL PRICE MORE THAN 30000

In [22]:
# Filter on the order total ('Total Price' = Quantity * UnitPrice) rather than
# the per-unit price, so multi-unit orders whose total exceeds the threshold
# are not missed. The comparison is strict: a total of exactly 30000 is excluded.
df.filter(col('Total Price') > 30000).show()

+-------+-------+-----------+--------+---------+------+-----------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|Total Price|
+-------+-------+-----------+--------+---------+------+-----------+
|   1002| Laptop|Electronics|       1|    55000| South|      55000|
|   1005|     TV|Electronics|       1|    40000|  West|      40000|
+-------+-------+-----------+--------+---------+------+-----------+



NEW COLUMN HIGH VALUE ORDER

In [24]:
from pyspark.sql.functions import when

# Flag an order 'Yes' when its total price is strictly above 20000, else 'No'.
high_value_flag = when(col('Total Price') > 20000, 'Yes').otherwise('No')

df = df.withColumn('HighValueOrder', high_value_flag)
df.show()

+-------+----------+-----------+--------+---------+------+-----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|Total Price|HighValueOrder|
+-------+----------+-----------+--------+---------+------+-----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|      30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|      55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|       1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|       2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|      40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|       8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|       6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|       7500|            No|
+-------+----------+-----------+--------+---------+------+-----------+--------------+

HIGH VALUE ORDERS IN NORTH

In [26]:
# Keep only the rows flagged as high value, then narrow to the North region.
(
    df.where(col('HighValueOrder') == 'Yes')
    .where(col('Region') == 'North')
    .show()
)

+-------+-------+-----------+--------+---------+------+-----------+--------------+
|OrderID|Product|   Category|Quantity|UnitPrice|Region|Total Price|HighValueOrder|
+-------+-------+-----------+--------+---------+------+-----------+--------------+
|   1001| Mobile|Electronics|       2|    15000| North|      30000|           Yes|
+-------+-------+-----------+--------+---------+------+-----------+--------------+



HIGH VALUE ORDERS PER REGION

In [32]:
# Number of high-value orders in each region (regions with none do not appear).
flagged_orders = df.filter(col('HighValueOrder') == 'Yes')
flagged_orders.groupBy('Region').agg(count('OrderID').alias('Orders')).show()

+------+------+
|Region|Orders|
+------+------+
| South|     1|
|  West|     1|
| North|     1|
+------+------+



SAVING

In [39]:
# Display the DataFrame, including the derived 'Total Price' and
# 'HighValueOrder' columns, before exporting.
df.show()

+-------+----------+-----------+--------+---------+------+-----------+--------------+
|OrderID|   Product|   Category|Quantity|UnitPrice|Region|Total Price|HighValueOrder|
+-------+----------+-----------+--------+---------+------+-----------+--------------+
|   1001|    Mobile|Electronics|       2|    15000| North|      30000|           Yes|
|   1002|    Laptop|Electronics|       1|    55000| South|      55000|           Yes|
|   1003|   T-Shirt|    Apparel|       3|      500|  East|       1500|            No|
|   1004|     Jeans|    Apparel|       2|     1200| North|       2400|            No|
|   1005|        TV|Electronics|       1|    40000|  West|      40000|           Yes|
|   1006|     Shoes|   Footwear|       4|     2000| South|       8000|            No|
|   1007|     Watch|Accessories|       2|     3000|  East|       6000|            No|
|   1008|Headphones|Electronics|       3|     2500| North|       7500|            No|
+-------+----------+-----------+--------+---------+------+-----------+--------------+

In [40]:
# Subset of orders flagged as high value; this is the frame that gets exported.
is_high_value = df["HighValueOrder"] == "Yes"
high_value_orders = df.where(is_high_value)

In [41]:
# Collapse to a single partition so the output directory holds one part file,
# then write it as CSV with a header row, replacing any previous output.
single_partition = high_value_orders.coalesce(1)
single_partition.write.csv("high_value_orders", mode="overwrite", header=True)

In [42]:
import glob
import shutil

# The Spark write above produces a directory of part files; locate the CSV
# part file so it can be copied to a single, predictably named file.
part_files = glob.glob("high_value_orders/part-*.csv")
if not part_files:
    # Fail with an actionable message instead of a bare IndexError from [0].
    raise FileNotFoundError(
        "No part-*.csv file found in 'high_value_orders/'; run the write cell first."
    )
output_file = part_files[0]

# shutil.copy returns the destination path, which becomes the cell's output.
shutil.copy(output_file, "high_value_orders.csv")

'high_value_orders.csv'

In [43]:
# Offer the exported CSV as a browser download. The `google.colab` module is
# imported here, so this cell needs a Google Colab runtime to run.
from google.colab import files
files.download("high_value_orders.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>