In [3]:
!pip install pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Product-Order-Example").getOrCreate()



In [4]:
# --- Product catalog -------------------------------------------------------
# Each tuple follows the column order listed in `product_cols`.
product_cols = ["product_id", "name", "category", "price"]
product_data = [
    (101, "Laptop", "Electronics", 55000),
    (102, "Mobile Phone", "Electronics", 25000),
    (103, "Chair", "Furniture", 5000),
    (104, "Book", "Stationery", 300),
    (105, "Headphones", "Electronics", 3000),
]
product_df = spark.createDataFrame(data=product_data, schema=product_cols)

# --- Orders ----------------------------------------------------------------
# Each tuple follows the column order listed in `order_cols`.
# Order 207 deliberately references product 106, which is absent from the
# catalog above, so that the join examples have an unmatched row to show.
order_cols = ["order_id", "product_id", "quantity", "customer"]
order_data = [
    (201, 101, 2, "Rahul Sharma"),
    (202, 102, 1, "Priya Singh"),
    (203, 103, 4, "Aman Kumar"),
    (204, 104, 10, "Sneha Reddy"),
    (205, 101, 1, "Arjun Mehta"),
    (206, 105, 3, "Rahul Sharma"),
    (207, 106, 1, "Ghost Customer"),
]
order_df = spark.createDataFrame(data=order_data, schema=order_cols)

# Display the catalog first, then the orders.
product_df.show()
order_df.show()

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
|       103|       Chair|  Furniture| 5000|
|       104|        Book| Stationery|  300|
|       105|  Headphones|Electronics| 3000|
+----------+------------+-----------+-----+

+--------+----------+--------+--------------+
|order_id|product_id|quantity|      customer|
+--------+----------+--------+--------------+
|     201|       101|       2|  Rahul Sharma|
|     202|       102|       1|   Priya Singh|
|     203|       103|       4|    Aman Kumar|
|     204|       104|      10|   Sneha Reddy|
|     205|       101|       1|   Arjun Mehta|
|     206|       105|       3|  Rahul Sharma|
|     207|       106|       1|Ghost Customer|
+--------+----------+--------+--------------+



**Transformation**

In [8]:
# Projection: keep only the product name and its price.
product_df.select(["name", "price"]).show()

# Row filter: products whose price is > 10,000.
product_df.where(product_df.price > 10000).show()

# Sorting: most expensive product first.
product_df.sort("price", ascending=False).show()

+------------+-----+
|        name|price|
+------------+-----+
|      Laptop|55000|
|Mobile Phone|25000|
|       Chair| 5000|
|        Book|  300|
|  Headphones| 3000|
+------------+-----+

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
+----------+------------+-----------+-----+

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
|       103|       Chair|  Furniture| 5000|
|       105|  Headphones|Electronics| 3000|
|       104|        Book| Stationery|  300|
+----------+------------+-----------+-----+



**Aggregation**

In [9]:
# Units ordered for each product_id (summed over all of its orders).
order_df.groupby("product_id").agg({"quantity": "sum"}).show()

# How many orders each customer placed.
order_df.groupby("customer").count().show()

# Mean product price within each category.
product_df.groupby("category").agg({"price": "avg"}).show()

+----------+-------------+
|product_id|sum(quantity)|
+----------+-------------+
|       103|            4|
|       101|            3|
|       102|            1|
|       104|           10|
|       106|            1|
|       105|            3|
+----------+-------------+

+--------------+-----+
|      customer|count|
+--------------+-----+
|    Aman Kumar|    1|
|  Rahul Sharma|    2|
|   Priya Singh|    1|
|   Arjun Mehta|    1|
|Ghost Customer|    1|
|   Sneha Reddy|    1|
+--------------+-----+

+-----------+------------------+
|   category|        avg(price)|
+-----------+------------------+
|Electronics|27666.666666666668|
| Stationery|             300.0|
|  Furniture|            5000.0|
+-----------+------------------+



**Joins**

In [11]:
# All three joins match rows on equal product_id. Because the condition is an
# explicit column equality, the result carries a product_id column from each side.

# Inner join: only orders whose product exists in the catalog.
order_df.join(
    product_df,
    on=order_df["product_id"] == product_df["product_id"],
    how="inner",
).show()

# Left join: every order is kept; product columns are null when unmatched.
order_df.join(
    product_df,
    on=order_df["product_id"] == product_df["product_id"],
    how="left",
).show()

# Right join: every product is kept; order columns are null when never ordered.
order_df.join(
    product_df,
    on=order_df["product_id"] == product_df["product_id"],
    how="right",
).show()

+--------+----------+--------+------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|    customer|product_id|        name|   category|price|
+--------+----------+--------+------------+----------+------------+-----------+-----+
|     201|       101|       2|Rahul Sharma|       101|      Laptop|Electronics|55000|
|     205|       101|       1| Arjun Mehta|       101|      Laptop|Electronics|55000|
|     202|       102|       1| Priya Singh|       102|Mobile Phone|Electronics|25000|
|     203|       103|       4|  Aman Kumar|       103|       Chair|  Furniture| 5000|
|     204|       104|      10| Sneha Reddy|       104|        Book| Stationery|  300|
|     206|       105|       3|Rahul Sharma|       105|  Headphones|Electronics| 3000|
+--------+----------+--------+------------+----------+------------+-----------+-----+

+--------+----------+--------+--------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|      customer|produc

In [15]:
# Expose both DataFrames to Spark SQL under the names used in the queries below.
product_df.createOrReplaceTempView("products")
order_df.createOrReplaceTempView("orders")

# Revenue per product = sum of (quantity * unit price) over its orders.
# The join is an inner join, so orders for products missing from the catalog
# do not contribute a row.
revenue_per_product_sql = """
SELECT o.product_id, p.name, SUM(o.quantity * p.price) AS total_revenue
FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY o.product_id, p.name
"""

# The two customers with the largest total ordered quantity.
top_customers_sql = """
SELECT customer, SUM(quantity) As total_qty
FROM orders
GROUP BY customer
ORDER BY total_qty DESC
LIMIT 2
"""

spark.sql(revenue_per_product_sql).show()
spark.sql(top_customers_sql).show()

+----------+------------+-------------+
|product_id|        name|total_revenue|
+----------+------------+-------------+
|       101|      Laptop|       165000|
|       102|Mobile Phone|        25000|
|       103|       Chair|        20000|
|       104|        Book|         3000|
|       105|  Headphones|         9000|
+----------+------------+-------------+

+------------+---------+
|    customer|total_qty|
+------------+---------+
| Sneha Reddy|       10|
|Rahul Sharma|        5|
+------------+---------+

