In [0]:
data = [
    (101, "Laptop", "Electronics", 55000, 10),
    (102, "Smartphone", "Electronics", 30000, 25),
    (103, "Chair", "Furniture", 2500, 50),
    (104, "Book", "Stationery", 400, 200),
    (105, "Headphones", "Electronics", 1500, 100),
    (106, "Table", "Furniture", 3200, 40),
    (107, "Pen", "Stationery", 20, 500),
    (108, "Monitor", "Electronics", 12000, 15),
    (109, "Notebook", "Stationery", 60, 300),
    (110, "Sofa", "Furniture", 45000, 5)
]
columns=["product_id", "product_name", "category", "price", "quantity"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       101|      Laptop|Electronics|55000|      10|
|       102|  Smartphone|Electronics|30000|      25|
|       103|       Chair|  Furniture| 2500|      50|
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
|       108|     Monitor|Electronics|12000|      15|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
+----------+------------+-----------+-----+--------+



1. Read the above data from CSV into a DataFrame and print the schema.

In [0]:
df.write.mode("overwrite").option("header", True).csv("/tmp/products_csv")

df_csv=spark.read.csv("/tmp/products_csv", header=True)
df_csv.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
|       102|  Smartphone|Electronics|30000|      25|
|       108|     Monitor|Electronics|12000|      15|
|       101|      Laptop|Electronics|55000|      10|
|       103|       Chair|  Furniture| 2500|      50|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
+----------+------------+-----------+-----+--------+



2. Read the same data from JSON and compare with the CSV schema. Any differences?

In [0]:
df.write.mode("overwrite").option("header", "true").json("/tmp/products_json")

df_json=spark.read.json("/tmp/products_json")
df_json.show()

+-----------+-----+----------+------------+--------+
|   category|price|product_id|product_name|quantity|
+-----------+-----+----------+------------+--------+
| Stationery|  400|       104|        Book|     200|
|Electronics| 1500|       105|  Headphones|     100|
| Stationery|   60|       109|    Notebook|     300|
|  Furniture|45000|       110|        Sofa|       5|
|Electronics|30000|       102|  Smartphone|      25|
|Electronics|12000|       108|     Monitor|      15|
|Electronics|55000|       101|      Laptop|      10|
|  Furniture| 2500|       103|       Chair|      50|
|  Furniture| 3200|       106|       Table|      40|
| Stationery|   20|       107|         Pen|     500|
+-----------+-----+----------+------------+--------+



3. Convert the CSV data into Parquet format and save to disk.


In [0]:
df.write.mode("overwrite").option("header", "true").parquet("/tmp/products_parquet")

df_parquet=spark.read.parquet("/tmp/products_parquet")
df_parquet.show()

+----------+------------+-----------+-----+--------+
|product_id|product_name|   category|price|quantity|
+----------+------------+-----------+-----+--------+
|       104|        Book| Stationery|  400|     200|
|       105|  Headphones|Electronics| 1500|     100|
|       109|    Notebook| Stationery|   60|     300|
|       110|        Sofa|  Furniture|45000|       5|
|       102|  Smartphone|Electronics|30000|      25|
|       108|     Monitor|Electronics|12000|      15|
|       101|      Laptop|Electronics|55000|      10|
|       103|       Chair|  Furniture| 2500|      50|
|       106|       Table|  Furniture| 3200|      40|
|       107|         Pen| Stationery|   20|     500|
+----------+------------+-----------+-----+--------+




4. Measure the size of CSV vs JSON vs Parquet on disk. Which one is smallest?

In [0]:
import os

def get_dir_size(path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size

csv_size = get_dir_size('/tmp/products_csv')
json_size = get_dir_size('/tmp/products_json')
parquet_size = get_dir_size('/tmp/products_parquet')

print('CSV:', csv_size, 'bytes')
print('JSON:', json_size, 'bytes')
print('Parquet:', parquet_size, 'bytes')


CSV: 0 bytes
JSON: 0 bytes
Parquet: 0 bytes


5. Add a column total_revenue = price * quantity for each record.


In [0]:
df=df.withColumn("total_revenue",df["price"]*df["quantity"])
df.show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



6. Find the top 3 products with the highest total revenue.


In [0]:
df.orderBy(df["total_revenue"].desc()).limit(3).show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



7. Filter and display only Furniture products with price > 3000.

In [0]:
df.filter((df["category"] == "Furniture") & (df["price"] >3000)).show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       106|       Table|Furniture| 3200|      40|       128000|
|       110|        Sofa|Furniture|45000|       5|       225000|
+----------+------------+---------+-----+--------+-------------+



8. Create a new column price_band with values:
'High' if price > 10000
'Medium' if 3000 < price <= 10000
'Low' if price ≤ 3000

In [0]:
import pyspark.sql.functions as F
df = df.withColumn(
    "price_band",
    F.when(F.col("price") > 10000, "High")
    .when(F.col("price") > 3000, "Medium")
    .otherwise("Low")
)
df.show()

+----------+------------+-----------+-----+--------+-------------+----------+
|product_id|product_name|   category|price|quantity|total_revenue|price_band|
+----------+------------+-----------+-----+--------+-------------+----------+
|       101|      Laptop|Electronics|55000|      10|       550000|      High|
|       102|  Smartphone|Electronics|30000|      25|       750000|      High|
|       103|       Chair|  Furniture| 2500|      50|       125000|       Low|
|       104|        Book| Stationery|  400|     200|        80000|       Low|
|       105|  Headphones|Electronics| 1500|     100|       150000|       Low|
|       106|       Table|  Furniture| 3200|      40|       128000|    Medium|
|       107|         Pen| Stationery|   20|     500|        10000|       Low|
|       108|     Monitor|Electronics|12000|      15|       180000|      High|
|       109|    Notebook| Stationery|   60|     300|        18000|       Low|
|       110|        Sofa|  Furniture|45000|       5|       22500

9. Group by category and calculate total quantity sold.

In [0]:
df.groupby("category").agg({"quantity": "sum"}).show()

+-----------+-------------+
|   category|sum(quantity)|
+-----------+-------------+
|Electronics|          150|
|  Furniture|           95|
| Stationery|         1000|
+-----------+-------------+



10. Calculate average price of products for each category.

In [0]:
df.groupby("category").agg({"price": "avg"}).show()

+-----------+----------+
|   category|avg(price)|
+-----------+----------+
|Electronics|   24625.0|
|  Furniture|   16900.0|
| Stationery|     160.0|
+-----------+----------+



11. Count how many products fall in each price_band .


In [0]:
df.groupby("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|      High|    4|
|       Low|    5|
|    Medium|    1|
+----------+-----+



12. Write the filtered Electronics products (price > 5000) into a Parquet file.


In [0]:
df.filter((df["category"]=="Electronics") & (df["price"]>5000)).write.mode("overwrite").option("header", "true").parquet("/tmp/Electronics_products_parquet")


Write the Stationery products into a JSON file.


In [0]:
df.filter(df["category"] == "stationary").write.mode("overwrite").option("header", "true").json("/tmp/Stationary_products_json")

Load Parquet back and run a query to find which category has highest total
revenue.

In [0]:
df_parquet = df_parquet.withColumn("total_revenue", F.col("price") * F.col("quantity"))
df_grouped=df.groupby("category").agg(F.sum(F.col('total_revenue')))
df_grouped.orderBy(df_grouped["sum(total_revenue)"].desc()).limit(1).show()


+-----------+------------------+
|   category|sum(total_revenue)|
+-----------+------------------+
|Electronics|           1630000|
+-----------+------------------+



Create a temporary view from the DataFrame and run Spark SQL to find all
products with quantity > 100 and price < 1000.


In [0]:
df.createOrReplaceTempView("products")
spark.sql("select * from products where quantity>100 and price<1000").show()

+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|        80000|       Low|
|       107|         Pen|Stationery|   20|     500|        10000|       Low|
|       109|    Notebook|Stationery|   60|     300|        18000|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

