In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import explode, col, when
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("SalesDataProcessing")
    .enableHiveSupport()
    .getOrCreate()
)

# Row factory that fixes the field order shared by every sample order.
SalesOrder = Row("OrderID", "Customer", "Items", "Region", "Amount")

# Four sample orders. Items is a list of {"Product", "Qty"} dicts; order 103 is
# the only one with more than one item.
data = [
    SalesOrder(101, "Ali", [{"Product": "Laptop", "Qty": 1}], "Asia", 1200.0),
    SalesOrder(102, "Zara", [{"Product": "Tablet", "Qty": 1}], "Europe", 650.0),
    SalesOrder(
        103,
        "Mohan",
        [{"Product": "Phone", "Qty": 2}, {"Product": "Charger", "Qty": 1}],
        "Asia",
        890.0,
    ),
    SalesOrder(104, "Sara", [{"Product": "Desk", "Qty": 1}], "US", 450.0),
]

# No explicit schema is passed, so column types are inferred from the rows.
# NOTE(review): the rendered output shows each item as a map (`Product -> ...`),
# so Qty presumably ends up with the same value type as Product (string) --
# confirm with df_sales.printSchema().
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)


+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}]                               |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



Working with JSON & Nested Fields

In [0]:
1. Flatten the Items array using explode()

In [0]:
# Flatten Items: explode() produces one output row per element of the Items
# array, so order 103 (two items) becomes two rows. `explode` and `col` are
# already imported in the notebook's first cell, so no import is repeated here.
df_exploded = df_sales.withColumn("Item", explode("Items"))

# Promote the nested fields to top-level columns.
# Qty is cast to int once, here: Items is built from plain dicts without an
# explicit schema, and the later aggregation cells each cast Qty before summing,
# which suggests it otherwise arrives as a string. Typing it at the source gives
# every downstream consumer (including the Parquet export) a numeric column.
# Gotcha: Amount is an order-level value and is repeated on every item row of the
# same order, so do not sum Amount on df_flat.
df_flat = df_exploded.select(
    "OrderID",
    "Customer",
    "Region",
    "Amount",
    col("Item.Product").alias("Product"),
    col("Item.Qty").cast("int").alias("Qty"),
)
df_flat.show()


+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|    101|     Ali|  Asia|1200.0| Laptop|  1|
|    102|    Zara|Europe| 650.0| Tablet|  1|
|    103|   Mohan|  Asia| 890.0|  Phone|  2|
|    103|   Mohan|  Asia| 890.0|Charger|  1|
|    104|    Sara|    US| 450.0|   Desk|  1|
+-------+--------+------+------+-------+---+



2. Count total quantity sold per product.

In [0]:
from pyspark.sql import functions as F

# Qty is cast to double before summing, which is why the totals display as 1.0, 2.0, ...
qty_as_double = F.col("Qty").cast("double")

# One row per product with the summed quantity across all orders.
total_qty_per_product = df_flat.groupBy("Product").agg(
    F.sum(qty_as_double).alias("TotalQty")
)
total_qty_per_product.show()

+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|     1.0|
| Tablet|     1.0|
|  Phone|     2.0|
|Charger|     1.0|
|   Desk|     1.0|
+-------+--------+



3. Count number of orders per region.

In [0]:
# One row per region with the number of orders placed there. This counts on
# df_sales (order level), not on the exploded item-level frame.
orders_per_region = (
    df_sales
    .groupBy("Region")
    .count()
    .withColumnRenamed("count", "OrderCount")
)
orders_per_region.show()


+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+



Using when and otherwise

4. Create a new column HighValueOrder

In [0]:
# "Yes" only when Amount is strictly greater than 1000; every other row gets "No".
high_value_flag = when(col("Amount") > 1000, "Yes").otherwise("No")

# df_sales is rebound to the frame that carries the extra column.
df_sales = df_sales.withColumn("HighValueOrder", high_value_flag)
df_sales.select("OrderID", "Amount", "HighValueOrder").show()


+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+



5. Add a column ShippingZone

In [0]:
# Map each Region to its shipping zone; any region not listed falls back to "Unknown".
region = col("Region")
shipping_zone = (
    when(region == "Asia", "Zone A")
    .when(region == "Europe", "Zone B")
    .when(region == "US", "Zone C")
    .otherwise("Unknown")
)

# df_sales is rebound to the frame that carries the extra column.
df_sales = df_sales.withColumn("ShippingZone", shipping_zone)
df_sales.select("OrderID", "Region", "ShippingZone").show()


+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



Temporary & Permanent Views

6. Register df_sales as a temporary view named sales_view.

In [0]:
# Make df_sales queryable from spark.sql() under the name "sales_view".
# At this point df_sales already carries the HighValueOrder and ShippingZone
# columns added in the earlier cells, so the view exposes them too.
df_sales.createOrReplaceTempView("sales_view")


7. SQL Query: Count & Average by Region

In [0]:
# Per-region summaries over sales_view, displayed in this order:
#   1. number of orders per region
#   2. average order Amount per region
region_queries = (
    "SELECT Region, COUNT(*) as OrderCount FROM sales_view GROUP BY Region",
    "SELECT Region, AVG(Amount) as AvgAmount FROM sales_view GROUP BY Region",
)

for region_query in region_queries:
    spark.sql(region_query).show()


+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+

+------+---------+
|Region|AvgAmount|
+------+---------+
|  Asia|   1045.0|
|Europe|    650.0|
|    US|    450.0|
+------+---------+



8. Create a permanent table using saveAsTable().

In [0]:
# Persist df_sales under the name "sales_table". mode("overwrite") is requested
# so that re-running this cell replaces the previous contents instead of failing.
# NOTE(review): saveAsTable() writes a table, not a view -- where it is stored
# depends on the session's catalog/warehouse configuration; confirm for your cluster.
df_sales.write.mode("overwrite").saveAsTable("sales_table")

In [0]:
# Read the saved table back through SQL to confirm the write. As the displayed
# output shows, rows may come back in a different order than they were created.
sales_table_rows = spark.sql("SELECT * FROM sales_table")
sales_table_rows.show()


+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|      Zone B|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|      Zone C|
+-------+--------+--------------------+------+------+--------------+------------+



SQL Queries via Spark

9. Use SQL to filter all orders with more than 1 item.

In [0]:
# Orders whose Items array holds more than one element, with the element count.
multi_item_orders_sql = """
    SELECT OrderID, Customer, size(Items) as ItemCount
    FROM sales_view
    WHERE size(Items) > 1
"""
spark.sql(multi_item_orders_sql).show()


+-------+--------+---------+
|OrderID|Customer|ItemCount|
+-------+--------+---------+
|    103|   Mohan|        2|
+-------+--------+---------+



10. Use SQL to extract customer names where Amount > 800.

In [0]:
# Customer name for every order whose Amount is strictly greater than 800.
# The query has no DISTINCT, so a customer would appear once per qualifying order.
customers_over_800 = spark.sql("SELECT Customer FROM sales_view WHERE Amount > 800")
customers_over_800.show()


+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



Saving as Parquet and Reading Again

11. Save the exploded product-level DataFrame as Parquet, partitioned by Region

In [0]:
# Write the item-level frame (df_flat) as Parquet under /tmp/sales_products_parquet,
# partitioned by the Region column. mode("overwrite") is requested so re-running
# the cell replaces the previous export instead of failing.
# NOTE(review): the same path literal is repeated in the read cell below; keep
# the two in sync if it changes.
df_flat.write.mode("overwrite").partitionBy("Region").parquet("/tmp/sales_products_parquet")


12. Read the Parquet data back and perform a group-by on Product.

In [0]:
# Load the item-level export written in the previous step.
df_parquet = spark.read.parquet("/tmp/sales_products_parquet")

# Cast Qty to int, total it per product, then rename the auto-generated
# "sum(Qty)" column to TotalQty for display.
qty_per_product = (
    df_parquet
    .withColumn("Qty", df_parquet["Qty"].cast("int"))
    .groupBy("Product")
    .sum("Qty")
    .withColumnRenamed("sum(Qty)", "TotalQty")
)
qty_per_product.show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|       2|
|Charger|       1|
| Tablet|       1|
| Laptop|       1|
|   Desk|       1|
+-------+--------+

