# Spark SQL Exercise Set – Product Orders Analytics

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("OrderDataPrep") \
    .getOrCreate()


In [39]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from datetime import date

data = [
    (101, "Alice", "Smartphone", "Electronics", 1, 6000, "2025-07-01"),
    (102, "Bob", "Jeans", "Clothing", 2, 4000, "2025-07-02"),
    (103, "Charlie", "Sofa", "Furniture", 2, 12000, "2025-07-03"),
    (104, "Daisy", "Laptop", "Electronics", 7, 10000, "2025-07-04"),
    (105, "Eve", "T-Shirt", "Clothing", 3, 25000, "2025-07-05"),
    (106, "Frank", "Bookshelf", "Furniture", 15000, 300, "2023-01-06"),
    (107, "Grace", "Novel", "Books", 2, 15000, "2025-07-07"),
    (108, "Heidi", "Tablet", "Electronics", 4, 40000, "2025-07-08"),
    (109, "Ivan", "Blazer", "Clothing", 1, 60000, "2025-07-09"),
    (110, "Judy", "Chair", "Furniture", 4, 150000, "2023-01-10"),
    (111, "Ken", "Textbook", "Books", 1, 80000, "2025-07-11"),
    (112, "Leo", "Smartwatch", "Electronics", 3, 25000, "2023-01-12"),
    (113, "Leo", "Chair", "Furnitures", 3, 25000, "2023-01-12")
]

schema = StructType([
    StructField("OrderID", IntegerType(), True),
    StructField("CustomerName", StringType(), True),
    StructField("Product", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("UnitPrice", IntegerType(), True),
    StructField("OrderDate", StringType(), True)
])

df = spark.createDataFrame(data, schema)
df.show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    101|       Alice|Smartphone|Electronics|       1|     6000|2025-07-01|
|    102|         Bob|     Jeans|   Clothing|       2|     4000|2025-07-02|
|    103|     Charlie|      Sofa|  Furniture|       2|    12000|2025-07-03|
|    104|       Daisy|    Laptop|Electronics|       7|    10000|2025-07-04|
|    105|         Eve|   T-Shirt|   Clothing|       3|    25000|2025-07-05|
|    106|       Frank| Bookshelf|  Furniture|   15000|      300|2023-01-06|
|    107|       Grace|     Novel|      Books|       2|    15000|2025-07-07|
|    108|       Heidi|    Tablet|Electronics|       4|    40000|2025-07-08|
|    109|        Ivan|    Blazer|   Clothing|       1|    60000|2025-07-09|
|    110|        Judy|     Chair|  Furniture|       4|   150000|2023-01-10|
|    111|   

In [40]:
df.createOrReplaceTempView("orders_local")


In [41]:
df.createOrReplaceGlobalTempView("orders_global")


# Part A: Local View – orders_local
1. List all orders placed for "Electronics" with a Quantity of 2 or more.
2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
3. Show the total number of orders per Category .
4. List orders placed in "January 2023" only.
5. Show the average UnitPrice per category.
6. Find the order with the highest total amount.
7. Drop the local view and try querying it again.

In [42]:
#List all orders placed for "Electronics" with a Quantity of 2 or more.
spark.sql('''
    SELECT *
    FROM orders_local
    WHERE Category = 'Electronics' AND Quantity >= 2
''').show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    104|       Daisy|    Laptop|Electronics|       7|    10000|2025-07-04|
|    108|       Heidi|    Tablet|Electronics|       4|    40000|2025-07-08|
|    112|         Leo|Smartwatch|Electronics|       3|    25000|2023-01-12|
+-------+------------+----------+-----------+--------+---------+----------+



In [43]:
# Calculate TotalAmount (Quantity × UnitPrice) for each order.
spark.sql('''
    SELECT *,
           Quantity * UnitPrice AS TotalAmount
    FROM orders_local
''').show()


+-------+------------+----------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+----------+-----------+--------+---------+----------+-----------+
|    101|       Alice|Smartphone|Electronics|       1|     6000|2025-07-01|       6000|
|    102|         Bob|     Jeans|   Clothing|       2|     4000|2025-07-02|       8000|
|    103|     Charlie|      Sofa|  Furniture|       2|    12000|2025-07-03|      24000|
|    104|       Daisy|    Laptop|Electronics|       7|    10000|2025-07-04|      70000|
|    105|         Eve|   T-Shirt|   Clothing|       3|    25000|2025-07-05|      75000|
|    106|       Frank| Bookshelf|  Furniture|   15000|      300|2023-01-06|    4500000|
|    107|       Grace|     Novel|      Books|       2|    15000|2025-07-07|      30000|
|    108|       Heidi|    Tablet|Electronics|       4|    40000|2025-07-08|     160000|
|    109|        Ivan|    Blazer

In [44]:
#Show the total number of orders per Category .
spark.sql('''
    SELECT Category,
           COUNT(*) AS TotalOrders
    FROM orders_local
    GROUP BY Category
''').show()


+-----------+-----------+
|   Category|TotalOrders|
+-----------+-----------+
|Electronics|          4|
|   Clothing|          3|
|  Furniture|          3|
|      Books|          2|
| Furnitures|          1|
+-----------+-----------+



In [45]:
#List orders placed in "January 2023" only.
spark.sql('''
    SELECT *
    FROM orders_local
    WHERE OrderDate LIKE '2023-01-%'
''').show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|    106|       Frank| Bookshelf|  Furniture|   15000|      300|2023-01-06|
|    110|        Judy|     Chair|  Furniture|       4|   150000|2023-01-10|
|    112|         Leo|Smartwatch|Electronics|       3|    25000|2023-01-12|
|    113|         Leo|     Chair| Furnitures|       3|    25000|2023-01-12|
+-------+------------+----------+-----------+--------+---------+----------+



In [46]:
#Show the average UnitPrice per category.
spark.sql('''
    SELECT Category,
           ROUND(AVG(UnitPrice), 2) AS AvgUnitPrice
    FROM orders_local
    GROUP BY Category
''').show()



+-----------+------------+
|   Category|AvgUnitPrice|
+-----------+------------+
|Electronics|     20250.0|
|   Clothing|    29666.67|
|  Furniture|     54100.0|
|      Books|     47500.0|
| Furnitures|     25000.0|
+-----------+------------+



In [47]:
#Find the order with the highest total amount.
spark.sql('''
    SELECT *,
           Quantity * UnitPrice AS TotalAmount
    FROM orders_local
    ORDER BY TotalAmount DESC
    LIMIT 1
''').show()


+-------+------------+---------+---------+--------+---------+----------+-----------+
|OrderID|CustomerName|  Product| Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+---------+---------+--------+---------+----------+-----------+
|    106|       Frank|Bookshelf|Furniture|   15000|      300|2023-01-06|    4500000|
+-------+------------+---------+---------+--------+---------+----------+-----------+



In [48]:
#Drop the local view and try querying it again.
spark.catalog.dropTempView("orders_local")


True

In [49]:
# Display all "Furniture" orders with TotalAmount > 10,000

spark.sql('''
    SELECT *, Quantity * UnitPrice AS TotalAmount
    FROM global_temp.orders_global
    WHERE Category = 'Furniture' AND (Quantity * UnitPrice) > 10000
''').show()

+-------+------------+---------+---------+--------+---------+----------+-----------+
|OrderID|CustomerName|  Product| Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+---------+---------+--------+---------+----------+-----------+
|    103|     Charlie|     Sofa|Furniture|       2|    12000|2025-07-03|      24000|
|    106|       Frank|Bookshelf|Furniture|   15000|      300|2023-01-06|    4500000|
|    110|        Judy|    Chair|Furniture|       4|   150000|2023-01-10|     600000|
+-------+------------+---------+---------+--------+---------+----------+-----------+



In [50]:
# Add DiscountFlag: "Yes" if Quantity > 3 else "No"

spark.sql('''
    SELECT *,
           CASE
               WHEN Quantity > 3 THEN 'Yes'
               ELSE 'No'
           END AS DiscountFlag
    FROM global_temp.orders_global
''').show()


+-------+------------+----------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+----------+-----------+--------+---------+----------+------------+
|    101|       Alice|Smartphone|Electronics|       1|     6000|2025-07-01|          No|
|    102|         Bob|     Jeans|   Clothing|       2|     4000|2025-07-02|          No|
|    103|     Charlie|      Sofa|  Furniture|       2|    12000|2025-07-03|          No|
|    104|       Daisy|    Laptop|Electronics|       7|    10000|2025-07-04|         Yes|
|    105|         Eve|   T-Shirt|   Clothing|       3|    25000|2025-07-05|          No|
|    106|       Frank| Bookshelf|  Furniture|   15000|      300|2023-01-06|         Yes|
|    107|       Grace|     Novel|      Books|       2|    15000|2025-07-07|          No|
|    108|       Heidi|    Tablet|Electronics|       4|    40000|2025-07-08|         Yes|
|    109|        Ivan

In [51]:
#  List customers who ordered more than 1 product type

spark.sql('''
    SELECT CustomerName, COUNT(DISTINCT Product) AS product_types
    FROM global_temp.orders_global
    GROUP BY CustomerName
    HAVING COUNT(DISTINCT Product) > 1
''').show()

+------------+-------------+
|CustomerName|product_types|
+------------+-------------+
|         Leo|            2|
+------------+-------------+



In [32]:
#Count number of orders per month

spark.sql('''
    SELECT
        SUBSTRING(OrderDate, 1, 7) AS Month,
        COUNT(*) AS OrderCount
    FROM global_temp.orders_global
    GROUP BY SUBSTRING(OrderDate, 1, 7)
    ORDER BY Month
''').show()

+-------+----------+
|  Month|OrderCount|
+-------+----------+
|2025-07|        12|
+-------+----------+



In [52]:
# Rank all products by total quantity sold using a window function

spark.sql('''
    SELECT Product, TotalQuantity,
           RANK() OVER (ORDER BY TotalQuantity DESC) AS Rank
    FROM (
        SELECT Product, SUM(Quantity) AS TotalQuantity
        FROM global_temp.orders_global
        GROUP BY Product
    ) AS summary
''').show()


+----------+-------------+----+
|   Product|TotalQuantity|Rank|
+----------+-------------+----+
| Bookshelf|        15000|   1|
|    Laptop|            7|   2|
|     Chair|            7|   2|
|    Tablet|            4|   4|
|   T-Shirt|            3|   5|
|Smartwatch|            3|   5|
|      Sofa|            2|   7|
|     Jeans|            2|   7|
|     Novel|            2|   7|
|Smartphone|            1|  10|
|  Textbook|            1|  10|
|    Blazer|            1|  10|
+----------+-------------+----+



In [53]:
#  Run a query from a NEW SparkSession using global view

from pyspark.sql import SparkSession

new_spark = SparkSession.builder \
    .appName("QueryGlobalView") \
    .getOrCreate()

# Query global view from the new session
new_spark.sql('''
    SELECT Category, COUNT(*) AS TotalOrders
    FROM global_temp.orders_global
    GROUP BY Category
''').show()


+-----------+-----------+
|   Category|TotalOrders|
+-----------+-----------+
|Electronics|          4|
|   Clothing|          3|
|  Furniture|          3|
|      Books|          2|
| Furnitures|          1|
+-----------+-----------+



# Bonus Challenges
1. Save a filtered subset (only "Books" category) as a new global temp view.
2. Find the most purchased product per category.
3. Create a view that excludes all "Clothing" orders and call it
"filtered_orders" .

In [54]:
# Save a filtered subset (only "Books" category) as a new global temp view.
spark.sql('''
    SELECT *
    FROM global_temp.orders_global
    WHERE Category = 'Books'
''').createOrReplaceGlobalTempView("books_orders")


In [55]:
#Find the most purchased product per category.
spark.sql('''
    SELECT *
    FROM (
        SELECT Category, Product, SUM(Quantity) AS TotalQuantity,
               RANK() OVER (PARTITION BY Category ORDER BY SUM(Quantity) DESC) AS rank
        FROM global_temp.orders_global
        GROUP BY Category, Product
    ) ranked
    WHERE rank = 1
''').show()


+-----------+---------+-------------+----+
|   Category|  Product|TotalQuantity|rank|
+-----------+---------+-------------+----+
|      Books|    Novel|            2|   1|
|   Clothing|  T-Shirt|            3|   1|
|Electronics|   Laptop|            7|   1|
|  Furniture|Bookshelf|        15000|   1|
| Furnitures|    Chair|            3|   1|
+-----------+---------+-------------+----+



In [38]:
# Create a view that excludes all "Clothing" orders and call it "filtered_orders" .
spark.sql('''
    SELECT *
    FROM global_temp.orders_global
    WHERE Category != 'Clothing'
''').createOrReplaceGlobalTempView("filtered_orders")
