In [0]:
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, when
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType

# Sample orders: one record per order, with a nested list of purchased items.
order_fields = ("OrderID", "Customer", "Items", "Region", "Amount")
order_records = [
    (101, "Ali", [{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], "Asia", 1200.0),
    (102, "Zara", [{"Product":"Tablet", "Qty":1}], "Europe", 650.0),
    (103, "Mohan", [{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], "Asia", 890.0),
    (104, "Sara", [{"Product":"Desk", "Qty":1}], "US", 450.0),
]

# Pair each record with the field names so every Row is built from keyword arguments.
data = [Row(**dict(zip(order_fields, record))) for record in order_records]

# No schema is passed, so Spark infers it from the rows. The recorded
# printSchema() output shows Items inferred as an array of maps with string
# values, which is why Qty is cast to int when the items are exploded later.
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)
df_sales.printSchema()


+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+

root
 |-- OrderID: long (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Items: array (nullable = true)
 |    |-- element: map (containsNull = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |-- Region: string (nullable = true)
 |-- Amount: double (nullable = true)

In [0]:
from pyspark.sql.functions import explode, col

# Flatten the nested Items array: one output row per (order, item) pair.
product_col = col("Item.Product")
qty_col = col("Item.Qty").cast("int")  # item values are strings; make Qty numeric

df_exploded = (
    df_sales
    .withColumn("Item", explode("Items"))  # one row per array element
    .withColumn("Product", product_col)
    .withColumn("Qty", qty_col)
    .drop("Items", "Item")                 # keep only the flattened columns
)

df_exploded.show(truncate=False)
df_exploded.printSchema()  # Optional: check that Qty is reported as integer



+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|101    |Ali     |Asia  |1200.0|Laptop |1  |
|101    |Ali     |Asia  |1200.0|Mouse  |2  |
|102    |Zara    |Europe|650.0 |Tablet |1  |
|103    |Mohan   |Asia  |890.0 |Phone  |2  |
|103    |Mohan   |Asia  |890.0 |Charger|1  |
|104    |Sara    |US    |450.0 |Desk   |1  |
+-------+--------+------+------+-------+---+

root
 |-- OrderID: long (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Product: string (nullable = true)
 |-- Qty: integer (nullable = true)



In [0]:
# Total quantity sold per product.
# The aggregate is aliased directly instead of renaming the auto-generated
# "sum(Qty)" column afterwards: withColumnRenamed silently does nothing when
# the source column name does not match, which would leave the output
# mislabeled without raising an error.
df_exploded.groupBy("Product").agg(F.sum("Qty").alias("TotalQty")).show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|  Phone|       2|
|Charger|       1|
|   Desk|       1|
+-------+--------+



In [0]:
# Number of orders placed in each region (one row of df_sales per order).
orders_per_region = df_sales.groupBy("Region").count()
orders_per_region.withColumnRenamed("count", "OrderCount").show()


+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+



In [0]:
# Flag orders whose Amount is strictly greater than 1000 as high value.
is_high_value = col("Amount") > 1000
high_value_flag = when(is_high_value, "Yes").otherwise("No")

# Note: df_sales is rebound here, so later cells see the extra column.
df_sales = df_sales.withColumn("HighValueOrder", high_value_flag)

preview_columns = ["OrderID", "Customer", "Amount", "HighValueOrder"]
df_sales.select(*preview_columns).show()


+-------+--------+------+--------------+
|OrderID|Customer|Amount|HighValueOrder|
+-------+--------+------+--------------+
|    101|     Ali|1200.0|           Yes|
|    102|    Zara| 650.0|            No|
|    103|   Mohan| 890.0|            No|
|    104|    Sara| 450.0|            No|
+-------+--------+------+--------------+



In [0]:
# Map each region to its shipping zone; regions not listed fall back to "Other".
zone_by_region = {
    "Asia": "Zone A",
    "Europe": "Zone B",
    "US": "Zone C",
}

# Build the when(...).when(...) chain in the mapping's insertion order.
zone_expr = None
for region_name, zone_label in zone_by_region.items():
    matches_region = col("Region") == region_name
    if zone_expr is None:
        zone_expr = when(matches_region, zone_label)
    else:
        zone_expr = zone_expr.when(matches_region, zone_label)

# Note: df_sales is rebound here, so later cells see the extra column.
df_sales = df_sales.withColumn("ShippingZone", zone_expr.otherwise("Other"))
df_sales.select("OrderID", "Region", "ShippingZone").show()


+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



In [0]:
# Expose df_sales to Spark SQL under the name "sales_view".
# The view reflects df_sales as it is bound when this cell runs, so the order
# of the preceding cells determines which columns the SQL queries can see.
df_sales.createOrReplaceTempView(name="sales_view")


In [0]:
# Per-region order count and average order amount, computed through Spark SQL.
region_summary_sql = """
SELECT 
    Region,
    COUNT(*) AS OrderCount,
    AVG(Amount) AS AvgAmount
FROM sales_view
GROUP BY Region
"""
spark.sql(region_summary_sql).show()


+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|  Asia|         2|   1045.0|
|Europe|         1|    650.0|
|    US|         1|    450.0|
+------+----------+---------+



In [0]:
# Persist df_sales as a table named "permanent_sales_view", replacing any
# existing table of that name (original note: this is a managed Hive table
# and requires the warehouse directory to be set up).
df_sales.write.saveAsTable("permanent_sales_view", mode="overwrite")


In [0]:
# Orders whose Items array holds more than one entry.
multi_item_orders_sql = """
SELECT OrderID, Customer, Region, Amount
FROM sales_view
WHERE size(Items) > 1
"""
spark.sql(multi_item_orders_sql).show()


+-------+--------+------+------+
|OrderID|Customer|Region|Amount|
+-------+--------+------+------+
|    101|     Ali|  Asia|1200.0|
|    103|   Mohan|  Asia| 890.0|
+-------+--------+------+------+



In [0]:
# Customers with at least one order whose Amount exceeds 800.
high_amount_customers_sql = """
SELECT Customer
FROM sales_view
WHERE Amount > 800
"""
spark.sql(high_amount_customers_sql).show()


+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



In [0]:
# Write the flattened order items as Parquet, one sub-directory per Region,
# overwriting any previous output at this location.
# The target can be any directory you choose (for example an Azure mount such
# as "/mnt/mycontainer/final_output/").
df_exploded.write.parquet(
    "/mnt/output/sales_by_product",
    mode="overwrite",
    partitionBy="Region",
)


In [0]:
# Read the Parquet output back and recompute the per-product totals from it.
df_parquet = spark.read.parquet("/mnt/output/sales_by_product")

# The aggregate is aliased directly instead of renaming the auto-generated
# "sum(Qty)" column afterwards: withColumnRenamed silently does nothing when
# the source column name does not match, which would leave the output
# mislabeled without raising an error.
df_parquet.groupBy("Product").agg(F.sum("Qty").alias("TotalQty")).show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|       2|
|Charger|       1|
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|   Desk|       1|
+-------+--------+

