In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SalesDataAnalysis").getOrCreate()

from pyspark.sql import Row
data = [
    Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
    Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}], Region="Europe", Amount=650.0),
    Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
    Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}], Region="US", Amount=450.0)
]

df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



In [2]:
from pyspark.sql.functions import explode, sum, countDistinct

# Exercise 1: Flatten Items
df_flat = df_sales.withColumn("Item", explode("Items"))
df_flat = df_flat.select("OrderID", "Item.Product", "Item.Qty", "Region")
df_flat.show()

# Exercise 2: Total quantity per product
df_flat.groupBy("Product").agg(sum("Qty").alias("TotalQuantity")).show()

# Exercise 3: Orders per region
df_sales.groupBy("Region").agg(countDistinct("OrderID").alias("OrderCount")).show()

+-------+-------+---+------+
|OrderID|Product|Qty|Region|
+-------+-------+---+------+
|    101| Laptop|  1|  Asia|
|    101|  Mouse|  2|  Asia|
|    102| Tablet|  1|Europe|
|    103|  Phone|  2|  Asia|
|    103|Charger|  1|  Asia|
|    104|   Desk|  1|    US|
+-------+-------+---+------+

+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
| Laptop|          1.0|
|  Mouse|          2.0|
| Tablet|          1.0|
|   Desk|          1.0|
|  Phone|          2.0|
|Charger|          1.0|
+-------+-------------+

+------+----------+
|Region|OrderCount|
+------+----------+
|Europe|         1|
|    US|         1|
|  Asia|         2|
+------+----------+



In [3]:
from pyspark.sql.functions import when

# Exercise 4: HighValueOrder
df_sales = df_sales.withColumn(
    "HighValueOrder",
    when(df_sales["Amount"] > 1000, "Yes").otherwise("No")
)

# Exercise 5: ShippingZone
df_sales = df_sales.withColumn(
    "ShippingZone",
    when(df_sales["Region"] == "Asia", "Zone A")
    .when(df_sales["Region"] == "Europe", "Zone B")
    .when(df_sales["Region"] == "US", "Zone C")
)

df_sales.show()

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|      Zone B|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|      Zone C|
+-------+--------+--------------------+------+------+--------------+------------+



In [4]:
# Exercise 6: Temporary view
df_sales.createOrReplaceTempView("sales_view")

# Exercise 7: Region stats (SQL)
spark.sql("""
  SELECT Region,
         COUNT(*) AS OrderCount,
         AVG(Amount) AS AvgAmount
  FROM sales_view
  GROUP BY Region
""").show()

# Exercise 8: Permanent table
df_sales.write.mode("overwrite").saveAsTable("sales_permanent_table")

# Exercise 9: Filter orders with >1 item
spark.sql("SELECT * FROM sales_view WHERE size(Items) > 1").show()

# Exercise 10: Customers with Amount > 800
spark.sql("SELECT Customer FROM sales_view WHERE Amount > 800").show()

+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|Europe|         1|    650.0|
|  Asia|         2|   1045.0|
|    US|         1|    450.0|
+------+----------+---------+

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
+-------+--------+--------------------+------+------+--------------+------------+

+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



In [7]:
# Exercise 11: Save as Parquet (partitioned)
df_flat.write.mode("overwrite").partitionBy("Region").parquet("/content/drive/MyDrive/sales_products_partitioned")

# Exercise 12: Read back and group by Product
df_parquet = spark.read.parquet("/content/drive/MyDrive/sales_products_partitioned")
df_parquet.groupBy("Product").agg(sum("Qty").alias("TotalQuantity")).show()

+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
|  Phone|          2.0|
| Laptop|          1.0|
|Charger|          1.0|
|  Mouse|          2.0|
|   Desk|          1.0|
| Tablet|          1.0|
+-------+-------------+



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
