In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Set-3").getOrCreate()
spark

**Scenario 1: Inventory Alerting System**

In [3]:
# 1. Load the data using PySpark.
from google.colab import drive
drive.mount('/content/drive')

df = spark.read.csv('/content/drive/MyDrive/inventory_supply.csv',header = True,inferSchema = True)
df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|
+------+------------+-----------+----------+--------+----------

In [5]:
# 2. Create a new column NeedsReorder = StockQty < ReorderLevel .
from pyspark.sql.functions import col

df = df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|        true|
+------+------------+-----------+----------+--------+------------+-------------+--------

In [6]:
# 3. Create a view of all items that need restocking.

df_needs_reorder = df.filter(col("NeedsReorder") == True)
df_needs_reorder.createOrReplaceTempView("items_needing_reorder")

In [7]:
# 4. Highlight warehouses with more than 2 such items.
from pyspark.sql.functions import count

warehouse_summary = df_needs_reorder.groupBy("Warehouse").agg(count("*").alias("NumItemsNeedingReorder")).filter(col("NumItemsNeedingReorder") > 2)
warehouse_summary.show()

+---------+----------------------+
|Warehouse|NumItemsNeedingReorder|
+---------+----------------------+
+---------+----------------------+



**Scenario 2: Supplier Price Optimization**

In [10]:
# 1. Group items by Supplier and compute average price.
from pyspark.sql.functions import avg

category_avg = df.groupBy("Category").agg(avg("UnitPrice").alias("AvgPriceByCategory"))
category_avg.show()

+-----------+------------------+
|   Category|AvgPriceByCategory|
+-----------+------------------+
|Electronics|           36000.0|
|  Furniture|            6000.0|
| Appliances|           25000.0|
+-----------+------------------+



In [12]:
# 2. Find which suppliers offer items below average price in their category.
df_with_avg = df.join(category_avg, on="Category", how="left")

df_below_avg = df_with_avg.filter(df_with_avg["UnitPrice"] < df_with_avg["AvgPriceByCategory"])
df_below_avg.show()

+-----------+------+--------+----------+--------+------------+-------------+---------+---------+------------+------------------+
|   Category|ItemID|ItemName| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|AvgPriceByCategory|
+-----------+------+--------+----------+--------+------------+-------------+---------+---------+------------+------------------+
|Electronics|  I001|  LED TV|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|           36000.0|
|Electronics|  I005| Printer|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|        true|           36000.0|
+-----------+------+--------+----------+--------+------------+-------------+---------+---------+------------+------------------+



In [13]:
# 3. Tag suppliers with Good Deal if >50% of their items are below market average.
from pyspark.sql.functions import count, when, lit

total_items = df.groupBy("Supplier").agg(count("*").alias("TotalItems"))
below_avg_items = df_below_avg.groupBy("Supplier").agg(count("*").alias("BelowAvgItems"))

supplier_deals = total_items.join(below_avg_items, on="Supplier", how="left").fillna(0, subset=["BelowAvgItems"])\
.withColumn("GoodDeal", (col("BelowAvgItems") / col("TotalItems")) > 0.5)

supplier_deals.show()

+---------+----------+-------------+--------+
| Supplier|TotalItems|BelowAvgItems|GoodDeal|
+---------+----------+-------------+--------+
|   AVTech|         1|            1|    true|
|TechWorld|         1|            0|   false|
|PrintFast|         1|            1|    true|
| FreezeIt|         1|            0|   false|
|  ChairCo|         1|            0|   false|
+---------+----------+-------------+--------+



**Scenario 3: Cost Forecasting**

In [14]:
# 1. Calculate TotalStockValue = StockQty * UnitPrice .
from pyspark.sql.functions import col

df = df.withColumn("TotalStockValue", col("StockQty") * col("UnitPrice"))
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|         125000|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|

In [15]:
# 2. Identify top 3 highest-value items.

top_3_items = df.orderBy(col("TotalStockValue").desc()).limit(3)
top_3_items.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+



In [16]:
# 3. Export the result as a Parquet file partitioned by Warehouse .

top_3_items.write.mode("overwrite").partitionBy("Warehouse").parquet("/content/drive/MyDrive/top3_high_value_items")

**Scenario 4: Warehouse Utilization**

In [17]:
# 1. Count items stored per warehouse.
from pyspark.sql.functions import countDistinct

items_per_warehouse = df.groupBy("Warehouse").agg(countDistinct("ItemID").alias("NumItems"))
items_per_warehouse.show()

+----------+--------+
| Warehouse|NumItems|
+----------+--------+
|WarehouseA|       2|
|WarehouseC|       1|
|WarehouseB|       2|
+----------+--------+



In [18]:
# 2. Average stock per category in each warehouse.
from pyspark.sql.functions import avg

avg_stock = df.groupBy("Warehouse", "Category").agg(avg("StockQty").alias("AvgStockPerCategory"))
avg_stock.show()

+----------+-----------+-------------------+
| Warehouse|   Category|AvgStockPerCategory|
+----------+-----------+-------------------+
|WarehouseB|Electronics|                6.5|
|WarehouseA|  Furniture|               40.0|
|WarehouseC| Appliances|                5.0|
|WarehouseA|Electronics|               50.0|
+----------+-----------+-------------------+



In [19]:
# 3. Determine underutilized warehouses ( total stock < 100 ).
from pyspark.sql.functions import sum

warehouse_stock = df.groupBy("Warehouse").agg(sum("StockQty").alias("TotalStock"))

underutilized = warehouse_stock.filter(col("TotalStock") < 100)
underutilized.show()

+----------+----------+
| Warehouse|TotalStock|
+----------+----------+
|WarehouseA|        90|
|WarehouseC|         5|
|WarehouseB|        13|
+----------+----------+



**Scenario 5: Delta Audit Trail**

In [None]:
# 1. Save as Delta table retail_inventory .
# 2. Update stock of 'Laptop' to 20.
# 3. Delete any item with StockQty = 0 .
# 4. Run DESCRIBE HISTORY and query VERSION AS OF previous state.

df.write.format("delta").mode("overwrite").save("/delta/retail_inventory")

spark.sql("DROP TABLE IF EXISTS retail_inventory")
spark.sql("CREATE TABLE retail_inventory USING DELTA LOCATION '/delta/retail_inventory'")

from delta.tables import DeltaTable
from pyspark.sql.functions import col

delta_table = DeltaTable.forPath(spark, "/delta/retail_inventory")

delta_table.update(
    condition = col("ItemName") == "Laptop",
    set = { "StockQty": lit(20) }
)

delta_table.delete(col("StockQty") == 0)

spark.sql("DESCRIBE HISTORY retail_inventory").show(truncate=False)

**Scenario 6: Alerts from Restock Logs (Join Task)**

In [20]:
# 1. Join with inventory table to update StockQty.
restock_df = spark.read.option("header", True).option("inferSchema", True).csv("/content/drive/MyDrive/restock_logs.csv")

from pyspark.sql.functions import to_date
restock_df = restock_df.withColumn("RestockDate", to_date("RestockDate", "yyyy-MM-dd"))
restock_df.show()

+------+-----------+-------------+
|ItemID|RestockDate|QuantityAdded|
+------+-----------+-------------+
|  I002| 2024-04-20|           10|
|  I005| 2024-04-22|            5|
|  I001| 2024-04-25|           20|
+------+-----------+-------------+



In [None]:
# 2. Calculate new stock and flag RestockedRecently = true for updated items.
# Join with base Delta inventory
inventory_df = spark.read.format("delta").load("/delta/retail_inventory")

restocked_df = restock_df.join(inventory_df, on="ItemID", how="inner") \
    .withColumn("NewStockQty", col("StockQty") + col("QuantityAdded")) \
    .withColumn("RestockedRecently", lit(True)) \
    .select("ItemID", "NewStockQty", "RestockedRecently")

In [None]:
# 3. Use MERGE INTO to update in Delta.
from delta.tables import DeltaTable
from pyspark.sql.functions import lit

delta_table = DeltaTable.forPath(spark, "/delta/retail_inventory")

delta_table.alias("tgt").merge(
    restocked_df.alias("src"),
    "tgt.ItemID = src.ItemID"
).whenMatchedUpdate(set={
    "StockQty": col("src.NewStockQty"),
    "RestockedRecently": col("src.RestockedRecently")
}).execute()

**Scenario 7: Report Generation with SQL Views**

In [22]:
# 1. Create SQL view inventory_summary with:
# ItemName, Category, StockQty, NeedsReorder, TotalStockValue
from pyspark.sql.functions import col

df = df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel")).withColumn("TotalStockValue", col("StockQty") * col("UnitPrice"))
df.createOrReplaceTempView("inventory_summary")

In [24]:
# 2. Create view supplier_leaderboard sorted by average price
spark.sql("""
    CREATE OR REPLACE TEMP VIEW supplier_leaderboard AS
    SELECT Supplier, ROUND(AVG(UnitPrice), 2) AS AvgUnitPrice
    FROM inventory_summary
    GROUP BY Supplier
    ORDER BY AvgUnitPrice ASC
""")

spark.sql("SELECT * FROM supplier_leaderboard").show()

+---------+------------+
| Supplier|AvgUnitPrice|
+---------+------------+
|  ChairCo|      6000.0|
|PrintFast|      8000.0|
| FreezeIt|     25000.0|
|   AVTech|     30000.0|
|TechWorld|     70000.0|
+---------+------------+



**Scenario 8: Advanced Filtering**

In [25]:
# 1. Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
from pyspark.sql.functions import when, col

df = df.withColumn("StockStatus",when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked").otherwise("LowStock"))
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockStatus|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|Overstocked|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|   LowStock|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|Overstocked|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|         125000|   LowStock|
|  I00

In [26]:
# 2. Use .filter() and .where() for the same and compare.

overstocked_filter = df.filter(col("StockStatus") == "Overstocked")
overstocked_filter.show()

overstocked_where = df.where("StockStatus = 'Overstocked'")
overstocked_where.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|Supplier|NeedsReorder|TotalStockValue|StockStatus|
+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|  AVTech|       false|        1500000|Overstocked|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000| ChairCo|       false|         240000|Overstocked|
+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+

+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+
|ItemID|    

**Scenario 9: Feature Engineering**

In [28]:
# 1. Extract RestockMonth from LastRestocked .
from pyspark.sql.functions import month

df = df.withColumn("RestockMonth", month("LastRestocked"))
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockStatus|RestockMonth|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|Overstocked|           3|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|   LowStock|           4|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|Overstocked|           3|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2

In [29]:
# 2. Create feature: StockAge = CURRENT_DATE - LastRestocked
from pyspark.sql.functions import current_date, datediff

df = df.withColumn("StockAge", datediff(current_date(), "LastRestocked"))
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+--------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockStatus|RestockMonth|StockAge|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+--------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|Overstocked|           3|     461|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|   LowStock|           4|     444|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|Overstocked|           3|     451|
|  I004|Refriger

In [30]:
# 3. Bucket StockAge into: New, Moderate, Stale
from pyspark.sql.functions import when

df = df.withColumn(
    "StockAgeBucket",
    when(col("StockAge") <= 30, "New")
    .when(col("StockAge") <= 90, "Moderate")
    .otherwise("Stale")
)
df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+--------+--------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockStatus|RestockMonth|StockAge|StockAgeBucket|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+--------+--------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|Overstocked|           3|     461|         Stale|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|   LowStock|           4|     444|         Stale|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|     

**Scenario 10: Export Options**

In [None]:
# Export full DataFrame to CSV for analysts
df.write.mode("overwrite").option("header", True).csv("/export/inventory/full_dataset/csv/")

# Export to JSON for integration needs
df.write.mode("overwrite").json("/export/inventory/full_dataset/json/")

# Export as Delta for downstream pipelines
df.write.format("delta").mode("overwrite").save("/export/inventory/full_dataset/delta/")