In [2]:
# Create CSV file in Colab
csv_data = """product_id,product_name,category,price,quantity
101,Laptop,Electronics,55000,10
102,Smartphone,Electronics,30000,25
103,Chair,Furniture,2500,50
104,Book,Stationery,400,200
105,Headphones,Electronics,1500,100
106,Table,Furniture,3200,40
107,Pen,Stationery,20,500
108,Monitor,Electronics,12000,15
109,Notebook,Stationery,60,300
110,Sofa,Furniture,45000,5
"""

with open("products.csv", "w") as f:
    f.write(csv_data)

print("CSV file 'products.csv' created successfully.")


CSV file 'products.csv' created successfully.


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, expr

# Start Spark session
spark = SparkSession.builder.appName("ProductAnalysis").getOrCreate()


1. Read CSV and Print Schema

In [4]:
csv_df = spark.read.option("header", True).option("inferSchema", True).csv("products.csv")
csv_df.printSchema()


root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- quantity: integer (nullable = true)



In [5]:
# Create JSON data
json_data = [
  {"product_id": 101, "product_name": "Laptop", "category": "Electronics", "price": 55000, "quantity": 10},
  {"product_id": 102, "product_name": "Smartphone", "category": "Electronics", "price": 30000, "quantity": 25},
  {"product_id": 103, "product_name": "Chair", "category": "Furniture", "price": 2500, "quantity": 50},
  {"product_id": 104, "product_name": "Book", "category": "Stationery", "price": 400, "quantity": 200},
  {"product_id": 105, "product_name": "Headphones", "category": "Electronics", "price": 1500, "quantity": 100},
  {"product_id": 106, "product_name": "Table", "category": "Furniture", "price": 3200, "quantity": 40},
  {"product_id": 107, "product_name": "Pen", "category": "Stationery", "price": 20, "quantity": 500},
  {"product_id": 108, "product_name": "Monitor", "category": "Electronics", "price": 12000, "quantity": 15},
  {"product_id": 109, "product_name": "Notebook", "category": "Stationery", "price": 60, "quantity": 300},
  {"product_id": 110, "product_name": "Sofa", "category": "Furniture", "price": 45000, "quantity": 5}
]

# Convert list of dicts to Spark DataFrame and write as JSON
df = spark.createDataFrame(json_data)
df.write.mode("overwrite").json("/FileStore/tables/products.json")


 2. Read JSON and Compare Schema

In [6]:
json_df = spark.read.option("multiline", True).json("/FileStore/tables/products.json")
json_df.printSchema()


root
 |-- category: string (nullable = true)
 |-- price: long (nullable = true)
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- quantity: long (nullable = true)



4. Convert CSV to Parquet



In [8]:
csv_df.write.mode("overwrite").parquet("products_parquet")


4. Compare File Sizes



In [9]:
import os

def get_size(path):
    return sum(os.path.getsize(os.path.join(dirpath, f))
               for dirpath, _, files in os.walk(path)
               for f in files)

csv_size = get_size("products.csv")
json_size = get_size("products.json")
parquet_size = get_size("products_parquet")

print(f"CSV size: {csv_size} bytes")
print(f"JSON size: {json_size} bytes")
print(f"Parquet size: {parquet_size} bytes")


CSV size: 0 bytes
JSON size: 0 bytes
Parquet size: 1766 bytes


4. Add total_revenue = price * quantity

In [10]:


df = csv_df.withColumn("total_revenue", col("price") * col("quantity"))
df.show()

+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       101|      Laptop|Electronics|55000|      10|       550000|
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       103|       Chair|  Furniture| 2500|      50|       125000|
|       104|        Book| Stationery|  400|     200|        80000|
|       105|  Headphones|Electronics| 1500|     100|       150000|
|       106|       Table|  Furniture| 3200|      40|       128000|
|       107|         Pen| Stationery|   20|     500|        10000|
|       108|     Monitor|Electronics|12000|      15|       180000|
|       109|    Notebook| Stationery|   60|     300|        18000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+



6. Top 3 Products by Revenue

In [11]:
df.orderBy(col("total_revenue").desc()).show(3)


+----------+------------+-----------+-----+--------+-------------+
|product_id|product_name|   category|price|quantity|total_revenue|
+----------+------------+-----------+-----+--------+-------------+
|       102|  Smartphone|Electronics|30000|      25|       750000|
|       101|      Laptop|Electronics|55000|      10|       550000|
|       110|        Sofa|  Furniture|45000|       5|       225000|
+----------+------------+-----------+-----+--------+-------------+
only showing top 3 rows



7. Furniture Products with Price > 3000


In [12]:
df.filter((col("category") == "Furniture") & (col("price") > 3000)).show()

+----------+------------+---------+-----+--------+-------------+
|product_id|product_name| category|price|quantity|total_revenue|
+----------+------------+---------+-----+--------+-------------+
|       106|       Table|Furniture| 3200|      40|       128000|
|       110|        Sofa|Furniture|45000|       5|       225000|
+----------+------------+---------+-----+--------+-------------+



8. Add price_band Column

In [13]:
df = df.withColumn("price_band", when(col("price") > 10000, "High")
                                  .when((col("price") > 3000) & (col("price") <= 10000), "Medium")
                                  .otherwise("Low"))
df.select("product_name", "price", "price_band").show()

+------------+-----+----------+
|product_name|price|price_band|
+------------+-----+----------+
|      Laptop|55000|      High|
|  Smartphone|30000|      High|
|       Chair| 2500|       Low|
|        Book|  400|       Low|
|  Headphones| 1500|       Low|
|       Table| 3200|    Medium|
|         Pen|   20|       Low|
|     Monitor|12000|      High|
|    Notebook|   60|       Low|
|        Sofa|45000|      High|
+------------+-----+----------+



9. Group by Category - Total Quantity Sold

In [14]:
df.groupBy("category").sum("quantity").withColumnRenamed("sum(quantity)", "total_quantity").show()

+-----------+--------------+
|   category|total_quantity|
+-----------+--------------+
| Stationery|          1000|
|Electronics|           150|
|  Furniture|            95|
+-----------+--------------+



10. Average Price by Category

In [15]:
df.groupBy("category").avg("price").withColumnRenamed("avg(price)", "avg_price").show()

+-----------+---------+
|   category|avg_price|
+-----------+---------+
| Stationery|    160.0|
|Electronics|  24625.0|
|  Furniture|  16900.0|
+-----------+---------+



11. Count Products by Price Band

In [16]:
df.groupBy("price_band").count().show()

+----------+-----+
|price_band|count|
+----------+-----+
|      High|    4|
|       Low|    5|
|    Medium|    1|
+----------+-----+



12. Filtered Electronics (price > 5000) to Parquet

In [17]:
df.filter((col("category") == "Electronics") & (col("price") > 5000)) \
  .write.mode("overwrite").parquet("filtered_electronics.parquet")

13. Stationery Products to JSON

In [None]:
df.filter(col("category") == "Stationery") \
  .write.mode("overwrite").json("stationery_products.json")

14. Load Parquet & Find Category with Highest Revenue


In [None]:
parquet_df = spark.read.parquet("products_parquet")
parquet_df = parquet_df.withColumn("total_revenue", col("price") * col("quantity"))

parquet_df.groupBy("category").sum("total_revenue") \
  .withColumnRenamed("sum(total_revenue)", "category_revenue") \
  .orderBy(col("category_revenue").desc()).show(1)

15. BONUS: Temp View + SQL for Quantity > 100 and Price < 1000

In [18]:
df.createOrReplaceTempView("products")
spark.sql("""
    SELECT * FROM products
    WHERE quantity > 100 AND price < 1000
""").show()


+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|        80000|       Low|
|       107|         Pen|Stationery|   20|     500|        10000|       Low|
|       109|    Notebook|Stationery|   60|     300|        18000|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

