In [0]:
# First look at the raw file: read the CSV with its header row (no schema
# inference requested here) and preview five rows.
df = spark.read.csv(
    "/Volumes/retail_catalog/retail_schema/retail_volume/Retail_Transactions_Dataset.csv",
    header=True,
)
display(df.limit(5))


Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Payment_Method,City,Store_Type,Discount_Applied,Customer_Category,Season,Promotion
1000000000,2022-01-21 06:27:29,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,
1000000001,2023-03-01 13:01:21,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'Potatoes']",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One)
1000000002,2024-03-21 15:37:04,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,
1000000003,2020-10-31 09:59:47,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,
1000000004,2020-12-10 00:59:59,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items


In [0]:
# Replace with your actual catalog/schema/volume names
file_path = "/Volumes/retail_catalog/retail_schema/retail_volume/Retail_Transactions_Dataset.csv"

# Read the CSV with a header row and let Spark infer the column types
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview the first rows, then print the inferred schema
display(df.limit(5))
df.printSchema()


Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Payment_Method,City,Store_Type,Discount_Applied,Customer_Category,Season,Promotion
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'Potatoes']",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One)
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,
1000000004,2020-12-10T00:59:59.000Z,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items


root
 |-- Transaction_ID: integer (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Total_Items: integer (nullable = true)
 |-- Total_Cost: double (nullable = true)
 |-- Payment_Method: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Store_Type: string (nullable = true)
 |-- Discount_Applied: boolean (nullable = true)
 |-- Customer_Category: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Promotion: string (nullable = true)



In [0]:

from pyspark.sql import functions as F, types as T

# 1) Derive the event timestamp and calendar date from Date.
# The printed schema shows Date is already a timestamp (inferSchema), and the
# raw file stores it as "yyyy-MM-dd HH:mm:ss". The previous explicit pattern
# "dd-MM-yyyy HH:mm" did not match that layout, so it would fail to parse the
# values if Date ever arrived as a string. Converting without a pattern handles
# both the typed column and the file's string layout.
df = (
    df
    .withColumn("event_ts", F.to_timestamp(F.col("Date")))
    .withColumn("event_date", F.to_date("event_ts"))
)
display(df.limit(5))

Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,Payment_Method,City,Store_Type,Discount_Applied,Customer_Category,Season,Promotion,event_ts,event_date
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'Potatoes']",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,,2024-03-21T15:37:04.000Z,2024-03-21
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,,2020-10-31T09:59:47.000Z,2020-10-31
1000000004,2020-12-10T00:59:59.000Z,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items,2020-12-10T00:59:59.000Z,2020-12-10


In [0]:

# 2) Normalize text fields: title-case each categorical column and store the
# result under the lower-cased column name.
for source_name in ("City", "Store_Type", "Payment_Method"):
    df = df.withColumn(source_name.lower(), F.initcap(F.col(source_name)))
display(df.limit(5))

Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,payment_method,city,store_type,Discount_Applied,Customer_Category,Season,Promotion,event_ts,event_date
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'Potatoes']",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,,2024-03-21T15:37:04.000Z,2024-03-21
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,,2020-10-31T09:59:47.000Z,2020-10-31
1000000004,2020-12-10T00:59:59.000Z,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items,2020-12-10T00:59:59.000Z,2020-12-10


In [0]:

# 3) Parse Product (a Python-style list literal such as "['Milk', 'Bread']")
# into an array<string> column named `products`.
# The JSON parser is asked to accept single-quoted strings, so the text is
# parsed as-is. Replacing every ' with " first (the previous approach) breaks
# any product name containing an apostrophe, e.g. ["Men's Socks"] would become
# ["Men"s Socks"], which is not valid JSON and parses to null.
df = df.withColumn(
    "products",
    F.from_json(
        F.col("Product"),
        T.ArrayType(T.StringType()),
        {"allowSingleQuotes": "true"},
    ),
)
display(df.limit(5))

Transaction_ID,Date,Customer_Name,Product,Total_Items,Total_Cost,payment_method,city,store_type,Discount_Applied,Customer_Category,Season,Promotion,event_ts,event_date,products
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,"['Ketchup', 'Shaving Cream', 'Light Bulbs']",3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)"
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,"['Ice Cream', 'Milk', 'Olive Oil', 'Bread', 'Potatoes']",2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)"
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,['Spinach'],6,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,,2024-03-21T15:37:04.000Z,2024-03-21,List(Spinach)
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,"['Tissues', 'Mustard']",1,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,,2020-10-31T09:59:47.000Z,2020-10-31,"List(Tissues, Mustard)"
1000000004,2020-12-10T00:59:59.000Z,Susan Mitchell,['Dish Soap'],10,16.42,Debit Card,Houston,Specialty Store,False,Young Adult,Winter,Discount on Selected Items,2020-12-10T00:59:59.000Z,2020-12-10,List(Dish Soap)


In [0]:

# 4) One output row per entry of the `products` array.
# explode_outer (rather than explode) still emits a row, with a null product,
# when the array is null or empty.
exploded_product = F.explode_outer(F.col("products"))
df_items = df.withColumn("product", exploded_product)

display(df_items.limit(5))


Transaction_ID,Date,Customer_Name,product,Total_Items,Total_Cost,payment_method,city,store_type,Discount_Applied,Customer_Category,Season,Promotion,event_ts,event_date,products
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Ketchup,3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)"
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Shaving Cream,3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)"
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Light Bulbs,3,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)"
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Ice Cream,2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)"
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Milk,2,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)"


In [0]:
from pyspark.sql import functions as F

# Per-item estimates derived from the parsed product array.
# Both estimates are only defined when the basket lists at least one product;
# otherwise they are left null.
has_products = F.col("computed_total_items") > 0

df_items = (
    df_items
    # Number of products listed in the basket
    .withColumn("computed_total_items", F.size(F.col("products")))
    # Assume one unit per listed product
    .withColumn("item_qty_est", F.when(has_products, F.lit(1.0)))
    # Spread the basket's Total_Cost evenly across its products
    .withColumn(
        "item_revenue_est",
        F.when(has_products, F.col("Total_Cost") / F.col("computed_total_items")),
    )
    # Total_Items is dropped; in the sample rows it does not match the length
    # of the product list
    .drop("Total_Items")
)

display(df_items.limit(10))




Transaction_ID,Date,Customer_Name,product,Total_Cost,payment_method,city,store_type,Discount_Applied,Customer_Category,Season,Promotion,event_ts,event_date,products,computed_total_items,item_qty_est,item_revenue_est
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Ketchup,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Shaving Cream,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Light Bulbs,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,Winter,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Ice Cream,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Milk,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Olive Oil,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Bread,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Potatoes,25.93,Cash,San Francisco,Specialty Store,True,Professional,Fall,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,Spinach,41.49,Credit Card,Houston,Department Store,True,Professional,Winter,,2024-03-21T15:37:04.000Z,2024-03-21,List(Spinach),1,1.0,41.49
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,Tissues,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,Spring,,2020-10-31T09:59:47.000Z,2020-10-31,"List(Tissues, Mustard)",2,1.0,19.67


In [0]:
from pyspark.sql import functions as F

# Add a boolean discount flag and a calendar-derived season, plus a simple
# mismatch flag used as a sanity check against the source Season column.

# Month number of the event date, shared by every season branch below.
event_month = F.month("event_date")

df_items = (
    df_items
    .withColumn(
        "discount_bool",
        # Accept the usual truthy spellings after lower-casing the text form
        F.lower(F.col("Discount_Applied").cast("string")).isin("true","t","yes","y","1").cast("boolean")
    )
    .withColumn(
        "season_computed",
        # Dec-Feb -> Winter, Mar-May -> Spring, Jun-Aug -> Summer, Sep-Nov -> Fall.
        # Fall is matched explicitly (no catch-all), so a row whose event_date is
        # null gets a null season instead of being silently labelled "Fall".
        F.when(event_month.isin(12, 1, 2),   "Winter")
         .when(event_month.isin(3, 4, 5),    "Spring")
         .when(event_month.isin(6, 7, 8),    "Summer")
         .when(event_month.isin(9, 10, 11),  "Fall")
    )
    .withColumn(
        "season_mismatch_flag",
        # 1 only when Season is present and differs from the computed season
        F.when(F.col("Season").isNotNull() & (F.col("Season") != F.col("season_computed")), F.lit(1)).otherwise(F.lit(0))
    )
)

# Quick look
display(
  df_items.select(
    "Transaction_ID","event_date","product",
    "Discount_Applied","discount_bool",
    "Season","season_computed","season_mismatch_flag"
  ).limit(20)
)

# Simple check: how many (item-level) rows have Season different from computed season
mismatch_count = df_items.agg(F.sum("season_mismatch_flag").alias("mismatch_rows")).first()["mismatch_rows"]
print("Season mismatches:", mismatch_count)



Transaction_ID,event_date,product,Discount_Applied,discount_bool,Season,season_computed,season_mismatch_flag
1000000000,2022-01-21,Ketchup,True,True,Winter,Winter,0
1000000000,2022-01-21,Shaving Cream,True,True,Winter,Winter,0
1000000000,2022-01-21,Light Bulbs,True,True,Winter,Winter,0
1000000001,2023-03-01,Ice Cream,True,True,Fall,Spring,1
1000000001,2023-03-01,Milk,True,True,Fall,Spring,1
1000000001,2023-03-01,Olive Oil,True,True,Fall,Spring,1
1000000001,2023-03-01,Bread,True,True,Fall,Spring,1
1000000001,2023-03-01,Potatoes,True,True,Fall,Spring,1
1000000002,2024-03-21,Spinach,True,True,Winter,Spring,1
1000000003,2020-10-31,Tissues,True,True,Spring,Fall,1


Season mismatches: 2250730


In [0]:
# Remove the source Season column; season_computed (derived from event_date) stays.
df_items = df_items.drop("Season")

In [0]:
# Remove the helper flag that compared Season with season_computed.
df_items = df_items.drop("season_mismatch_flag")

In [0]:
# Preview the first ten rows of the item-level frame.
display(df_items.limit(10))

Transaction_ID,Date,Customer_Name,product,Total_Cost,payment_method,city,store_type,Discount_Applied,Customer_Category,Promotion,event_ts,event_date,products,computed_total_items,item_qty_est,item_revenue_est,discount_bool,season_computed
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Ketchup,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336,True,Winter
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Shaving Cream,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336,True,Winter
1000000000,2022-01-21T06:27:29.000Z,Stacey Price,Light Bulbs,71.65,Mobile Payment,Los Angeles,Warehouse Club,True,Homemaker,,2022-01-21T06:27:29.000Z,2022-01-21,"List(Ketchup, Shaving Cream, Light Bulbs)",3,1.0,23.883333333333336,True,Winter
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Ice Cream,25.93,Cash,San Francisco,Specialty Store,True,Professional,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186,True,Spring
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Milk,25.93,Cash,San Francisco,Specialty Store,True,Professional,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186,True,Spring
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Olive Oil,25.93,Cash,San Francisco,Specialty Store,True,Professional,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186,True,Spring
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Bread,25.93,Cash,San Francisco,Specialty Store,True,Professional,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186,True,Spring
1000000001,2023-03-01T13:01:21.000Z,Michelle Carlson,Potatoes,25.93,Cash,San Francisco,Specialty Store,True,Professional,BOGO (Buy One Get One),2023-03-01T13:01:21.000Z,2023-03-01,"List(Ice Cream, Milk, Olive Oil, Bread, Potatoes)",5,1.0,5.186,True,Spring
1000000002,2024-03-21T15:37:04.000Z,Lisa Graves,Spinach,41.49,Credit Card,Houston,Department Store,True,Professional,,2024-03-21T15:37:04.000Z,2024-03-21,List(Spinach),1,1.0,41.49,True,Spring
1000000003,2020-10-31T09:59:47.000Z,Mrs. Patricia May,Tissues,39.34,Mobile Payment,Chicago,Pharmacy,True,Homemaker,,2020-10-31T09:59:47.000Z,2020-10-31,"List(Tissues, Mustard)",2,1.0,19.67,True,Fall


In [0]:
from pyspark.sql import functions as F

# Silver schema: snake_case names and explicit types, listed in output order
silver_columns = [
    F.col("Transaction_ID").cast("long").alias("transaction_id"),
    F.col("event_ts").cast("timestamp").alias("event_ts"),
    F.col("event_date").cast("date").alias("event_date"),
    F.col("Customer_Name").alias("customer_name"),
    "product",
    "products",
    F.col("Total_Cost").cast("double").alias("total_cost"),
    "payment_method",
    "city",
    "store_type",
    F.col("Discount_Applied").alias("discount_applied"),
    F.col("discount_bool").cast("boolean").alias("discount_bool"),
    F.col("Customer_Category").alias("customer_category"),
    F.col("Promotion").alias("promotion"),
    "season_computed",
    F.col("computed_total_items").cast("int").alias("computed_total_items"),
    F.col("item_qty_est").cast("double").alias("item_qty_est"),
    F.col("item_revenue_est").cast("double").alias("item_revenue_est"),
]
df_silver = df_items.select(*silver_columns)

# Target location in Unity Catalog
catalog = "retail_catalog"
schema  = "retail_schema"
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE {schema}")

silver_table = f"{catalog}.{schema}.transactions_items_silver"

# Overwrite the Delta table (data and schema), partitioned by event_date
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("event_date")
)
silver_writer.saveAsTable(silver_table)

# Read the saved table back for a quick visual check
display(spark.table(silver_table).limit(10))


transaction_id,event_ts,event_date,customer_name,product,products,total_cost,payment_method,city,store_type,discount_applied,discount_bool,customer_category,promotion,season_computed,computed_total_items,item_qty_est,item_revenue_est
1000000991,2022-11-29T00:35:28.000Z,2022-11-29,Mrs. Stephanie Spencer,Baby Wipes,"List(Baby Wipes, Butter, Lawn Mower, Toilet Paper)",61.89,Cash,Miami,Warehouse Club,True,True,Student,Discount on Selected Items,Fall,4,1.0,15.4725
1000000991,2022-11-29T00:35:28.000Z,2022-11-29,Mrs. Stephanie Spencer,Butter,"List(Baby Wipes, Butter, Lawn Mower, Toilet Paper)",61.89,Cash,Miami,Warehouse Club,True,True,Student,Discount on Selected Items,Fall,4,1.0,15.4725
1000000991,2022-11-29T00:35:28.000Z,2022-11-29,Mrs. Stephanie Spencer,Lawn Mower,"List(Baby Wipes, Butter, Lawn Mower, Toilet Paper)",61.89,Cash,Miami,Warehouse Club,True,True,Student,Discount on Selected Items,Fall,4,1.0,15.4725
1000000991,2022-11-29T00:35:28.000Z,2022-11-29,Mrs. Stephanie Spencer,Toilet Paper,"List(Baby Wipes, Butter, Lawn Mower, Toilet Paper)",61.89,Cash,Miami,Warehouse Club,True,True,Student,Discount on Selected Items,Fall,4,1.0,15.4725
1000002802,2022-11-29T20:55:25.000Z,2022-11-29,Connor Patel,Eggs,"List(Eggs, Deodorant)",68.6,Credit Card,Atlanta,Convenience Store,True,True,Senior Citizen,,Fall,2,1.0,34.3
1000002802,2022-11-29T20:55:25.000Z,2022-11-29,Connor Patel,Deodorant,"List(Eggs, Deodorant)",68.6,Credit Card,Atlanta,Convenience Store,True,True,Senior Citizen,,Fall,2,1.0,34.3
1000002949,2022-11-29T06:14:59.000Z,2022-11-29,Daniel Aguilar,Apple,"List(Apple, Shaving Cream, Laundry Detergent, Shampoo, Cheese)",35.87,Debit Card,New York,Warehouse Club,True,True,Retiree,Discount on Selected Items,Fall,5,1.0,7.174
1000002949,2022-11-29T06:14:59.000Z,2022-11-29,Daniel Aguilar,Shaving Cream,"List(Apple, Shaving Cream, Laundry Detergent, Shampoo, Cheese)",35.87,Debit Card,New York,Warehouse Club,True,True,Retiree,Discount on Selected Items,Fall,5,1.0,7.174
1000002949,2022-11-29T06:14:59.000Z,2022-11-29,Daniel Aguilar,Laundry Detergent,"List(Apple, Shaving Cream, Laundry Detergent, Shampoo, Cheese)",35.87,Debit Card,New York,Warehouse Club,True,True,Retiree,Discount on Selected Items,Fall,5,1.0,7.174
1000002949,2022-11-29T06:14:59.000Z,2022-11-29,Daniel Aguilar,Shampoo,"List(Apple, Shaving Cream, Laundry Detergent, Shampoo, Cheese)",35.87,Debit Card,New York,Warehouse Club,True,True,Retiree,Discount on Selected Items,Fall,5,1.0,7.174


In [0]:
from pyspark.sql import functions as F

# Read Silver
silver_table = "retail_catalog.retail_schema.transactions_items_silver"
df_silver = spark.table(silver_table)

# A row counts as promoted when promotion is present and not the literal "None"
row_has_promo = F.col("promotion").isNotNull() & (F.col("promotion") != "None")

# Aggregate to daily product demand
# (to aggregate by store too, add "city", "store_type" to the grouping keys)
per_day_product = df_silver.groupBy("event_date", "product")
gold_daily = per_day_product.agg(
    F.sum("item_qty_est").alias("units"),
    F.sum("item_revenue_est").alias("revenue"),
    F.countDistinct("transaction_id").alias("transactions"),
    F.max(F.when(row_has_promo, 1).otherwise(0)).alias("any_promo"),
    F.max(F.col("discount_bool").cast("int")).alias("any_discount"),
)

# Average revenue per unit; left null when no units were recorded
gold_daily = gold_daily.withColumn(
    "avg_price_per_unit",
    F.when(F.col("units") > 0, F.col("revenue") / F.col("units")),
)

# Save Gold table: overwrite data and schema, partitioned by event_date
gold_table = "retail_catalog.retail_schema.demand_daily_gold"
gold_writer = (
    gold_daily.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("event_date")
)
gold_writer.saveAsTable(gold_table)

# Quick look
display(spark.table(gold_table).orderBy("event_date", "product").limit(20))


event_date,product,units,revenue,transactions,any_promo,any_discount,avg_price_per_unit
2020-01-01,Air Freshener,21.0,369.4465,21,1,1,17.592690476190473
2020-01-01,Apple,26.0,668.952,26,1,1,25.72892307692308
2020-01-01,BBQ Sauce,25.0,461.642,25,1,1,18.46568
2020-01-01,Baby Wipes,21.0,448.80550000000005,21,1,1,21.37169047619048
2020-01-01,Banana,24.0,446.4943333333334,24,1,1,18.60393055555556
2020-01-01,Bath Towels,24.0,545.5206666666667,24,1,1,22.730027777777774
2020-01-01,Beef,19.0,314.53,19,1,1,16.55421052631579
2020-01-01,Bread,18.0,255.3781666666667,17,1,1,14.187675925925928
2020-01-01,Broom,20.0,317.2068333333333,19,1,1,15.860341666666669
2020-01-01,Butter,19.0,305.4991666666667,19,1,1,16.078903508771933


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# Read Gold (daily product demand)
gold_table = "retail_catalog.retail_schema.demand_daily_gold"
df_gold = spark.table(gold_table)

# Window per product, ordered by an integer day number (days since 1970-01-01).
# Ordering by a number lets the frames below be expressed in calendar days
# (rangeBetween) instead of row offsets. Row offsets (lag(n) / rowsBetween) are
# only "n days" when a product has a row for every single date; any missing
# date would silently shift the "7d"/"14d"/"28d" features onto the wrong days.
# When every date is present the results are the same as the row-based version.
day_number = F.datediff(F.col("event_date"), F.lit("1970-01-01").cast("date"))
w = W.partitionBy("product").orderBy(day_number)


def _value_days_ago(column_name, days):
    """Return `column_name` from exactly `days` calendar days earlier.

    The frame covers a single day, so the result is null when the product has
    no row for that date. max() is used only to collapse the frame to a value;
    this assumes at most one row per (product, event_date), which is what the
    daily Gold aggregation groups by.
    """
    return F.max(column_name).over(w.rangeBetween(-days, -days))


def _trailing_mean(column_name, days):
    """Return the mean of `column_name` over the trailing `days` calendar days.

    The current day is included. Only dates that have a row contribute to the
    mean; missing dates are not treated as zero.
    """
    return F.avg(column_name).over(w.rangeBetween(-(days - 1), 0))


# Add simple lags & moving averages (units + revenue)
df_feats = (
    df_gold
    .withColumn("lag_1d_units", _value_days_ago("units", 1))
    .withColumn("lag_7d_units", _value_days_ago("units", 7))
    .withColumn("ma_7d_units",  _trailing_mean("units", 7))
    .withColumn("ma_14d_units", _trailing_mean("units", 14))
    .withColumn("ma_28d_units", _trailing_mean("units", 28))

    .withColumn("lag_1d_revenue", _value_days_ago("revenue", 1))
    .withColumn("ma_7d_revenue",  _trailing_mean("revenue", 7))
    .withColumn("ma_14d_revenue", _trailing_mean("revenue", 14))
    .withColumn("ma_28d_revenue", _trailing_mean("revenue", 28))
)

# Save as a new Gold table with features
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
(df_feats
 .write
 .mode("overwrite")
 .format("delta")
 .partitionBy("event_date")
 .option("overwriteSchema", "true")
 .saveAsTable(features_table))

# Quick look
display(spark.table(features_table).orderBy("event_date", "product").limit(20))


event_date,product,units,revenue,transactions,any_promo,any_discount,avg_price_per_unit,lag_1d_units,lag_7d_units,ma_7d_units,ma_14d_units,ma_28d_units,lag_1d_revenue,ma_7d_revenue,ma_14d_revenue,ma_28d_revenue
2020-01-01,Air Freshener,21.0,369.4465,21,1,1,17.592690476190473,,,21.0,21.0,21.0,,369.4465,369.4465,369.4465
2020-01-01,Apple,26.0,668.952,26,1,1,25.72892307692308,,,26.0,26.0,26.0,,668.952,668.952,668.952
2020-01-01,BBQ Sauce,25.0,461.642,25,1,1,18.46568,,,25.0,25.0,25.0,,461.642,461.642,461.642
2020-01-01,Baby Wipes,21.0,448.80550000000005,21,1,1,21.37169047619048,,,21.0,21.0,21.0,,448.80550000000005,448.80550000000005,448.80550000000005
2020-01-01,Banana,24.0,446.4943333333334,24,1,1,18.60393055555556,,,24.0,24.0,24.0,,446.4943333333334,446.4943333333334,446.4943333333334
2020-01-01,Bath Towels,24.0,545.5206666666667,24,1,1,22.730027777777774,,,24.0,24.0,24.0,,545.5206666666667,545.5206666666667,545.5206666666667
2020-01-01,Beef,19.0,314.53,19,1,1,16.55421052631579,,,19.0,19.0,19.0,,314.53,314.53,314.53
2020-01-01,Bread,18.0,255.3781666666667,17,1,1,14.187675925925928,,,18.0,18.0,18.0,,255.3781666666667,255.3781666666667,255.3781666666667
2020-01-01,Broom,20.0,317.2068333333333,19,1,1,15.860341666666669,,,20.0,20.0,20.0,,317.2068333333333,317.2068333333333,317.2068333333333
2020-01-01,Butter,19.0,305.4991666666667,19,1,1,16.078903508771933,,,19.0,19.0,19.0,,305.4991666666667,305.4991666666667,305.4991666666667


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# Read features
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
df_feats = spark.table(features_table)

# Keep each product's most recent row: number rows newest-first, keep number 1
w_desc = W.partitionBy("product").orderBy(F.col("event_date").desc())
ranked = df_feats.withColumn("rn", F.row_number().over(w_desc))
latest = ranked.filter(F.col("rn") == 1).drop("rn")

# Baseline estimate: first non-null of MA(7), previous day, current day, 1.0
baseline_units = F.coalesce(
    F.col("ma_7d_units"),
    F.col("lag_1d_units"),
    F.col("units"),
    F.lit(1.0),
).cast("double")

# Build next-day forecast (forecast_date = as_of_date + 1 day)
preds = latest.select(
    "product",
    F.col("event_date").alias("as_of_date"),
    F.date_add(F.col("event_date"), 1).alias("forecast_date"),
    F.round(baseline_units, 3).alias("pred_units"),
)

# Save predictions: overwrite data and schema
pred_table = "retail_catalog.retail_schema.demand_forecast_baseline"
pred_writer = (
    preds.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
pred_writer.saveAsTable(pred_table)

# Quick look
display(spark.table(pred_table).orderBy("product").limit(20))


product,as_of_date,forecast_date,pred_units
Air Freshener,2024-05-18,2024-05-19,22.143
Apple,2024-05-18,2024-05-19,21.857
BBQ Sauce,2024-05-18,2024-05-19,19.429
Baby Wipes,2024-05-18,2024-05-19,22.429
Banana,2024-05-18,2024-05-19,23.286
Bath Towels,2024-05-18,2024-05-19,24.714
Beef,2024-05-18,2024-05-19,25.0
Bread,2024-05-18,2024-05-19,21.286
Broom,2024-05-18,2024-05-19,23.0
Butter,2024-05-18,2024-05-19,22.429


In [0]:
from pyspark.sql import functions as F

# Read features
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
df = spark.table(features_table)

# Same baseline as the forecast: first non-null of MA(7), lag_1d, units, 1.0
baseline_units = F.coalesce(
    F.col("ma_7d_units"),
    F.col("lag_1d_units"),
    F.col("units"),
    F.lit(1.0),
).cast("double")

# Prediction made on each as_of_date
pred_source = df.select(
    "product",
    F.col("event_date").alias("as_of_date"),
    baseline_units.alias("pred_units"),
)

# Actual units for each date
actuals = df.select("product", "event_date", F.col("units").alias("actual_units"))

# Align predictions made on day t to actuals on day t+1; pairs without an
# actual on t+1 are dropped by the inner join
shifted_preds = pred_source.withColumn("event_date", F.date_add(F.col("as_of_date"), 1))
bt = shifted_preds.join(actuals, on=["product", "event_date"], how="inner")

# Per-row errors; ape is left null when actual_units is not positive
residual = F.col("actual_units") - F.col("pred_units")
eval_df = (
    bt
    .withColumn("abs_err", F.abs(residual))
    .withColumn("sq_err", residual**2)
    .withColumn("ape", F.when(F.col("actual_units") > 0, F.col("abs_err") / F.col("actual_units")))
)

# Overall metrics across all products and dates
metric_exprs = [
    F.count("*").alias("n_rows"),
    F.avg("actual_units").alias("avg_actual"),
    F.avg("pred_units").alias("avg_pred"),
    F.avg("abs_err").alias("MAE"),
    F.sqrt(F.avg("sq_err")).alias("RMSE"),
    (F.avg("ape") * 100).alias("MAPE_pct"),
]
metrics_overall = eval_df.agg(*metric_exprs)

display(metrics_overall)
display(eval_df.orderBy(F.col("event_date").desc()).limit(20))


n_rows,avg_actual,avg_pred,MAE,RMSE,MAPE_pct
129519,23.150989430122223,23.152098993381745,4.15380935100899,5.212214573495697,19.402201026275787


product,event_date,as_of_date,pred_units,actual_units,abs_err,sq_err,ape
Tea,2024-05-18,2024-05-17,23.714285714285715,18.0,5.714285714285715,32.65306122448981,0.3174603174603175
Yogurt,2024-05-18,2024-05-17,24.0,16.0,8.0,64.0,0.5
Shrimp,2024-05-18,2024-05-17,24.571428571428573,24.0,0.571428571428573,0.3265306122448997,0.0238095238095238
Trash Cans,2024-05-18,2024-05-17,21.285714285714285,30.0,8.714285714285715,75.9387755102041,0.2904761904761905
Syrup,2024-05-18,2024-05-17,20.571428571428573,17.0,3.571428571428573,12.755102040816338,0.2100840336134454
Razors,2024-05-18,2024-05-17,21.571428571428573,16.0,5.571428571428573,31.040816326530628,0.3482142857142858
Mayonnaise,2024-05-18,2024-05-17,25.0,15.0,10.0,100.0,0.6666666666666666
Paper Towels,2024-05-18,2024-05-17,25.714285714285715,15.0,10.714285714285715,114.79591836734696,0.7142857142857143
Onions,2024-05-18,2024-05-17,22.285714285714285,22.0,0.2857142857142847,0.0816326530612239,0.0129870129870129
Mustard,2024-05-18,2024-05-17,27.428571428571427,13.0,14.428571428571429,208.1836734693877,1.1098901098901095


In [0]:
# One cell: build label, convert to pandas, train HistGradientBoosting, evaluate

from pyspark.sql import functions as F, Window as W
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ---------- Load features ----------
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
df = spark.table(features_table)

# Model inputs; this list also fixes the column order of the training matrix
feature_cols = [
    "units",
    "lag_1d_units",
    "lag_7d_units",
    "ma_7d_units",
    "ma_14d_units",
    "ma_28d_units",
    "revenue",
    "avg_price_per_unit",
    "any_promo",
    "any_discount",
]

# ---------- Build ML frame ----------
# label = `units` of the following row within each product, ordered by event_date.
# NOTE(review): that is "next day" (t+1) only if every product has a row for every
# date — confirm against the features table.
w = W.partitionBy("product").orderBy("event_date")
next_units = F.lead(F.col("units"), 1).over(w)
df_ml = (
    df
    .withColumn("label", next_units)
    .na.drop(subset=["label"])        # rows with no following row have no label
    .na.fill(0, subset=feature_cols)  # null features become 0 to keep it simple
)

# Bring only the model columns to the driver, then make event_date a datetime
# and order the pandas frame by product and date
sel_cols = ["product", "event_date", "label", *feature_cols]
pdf = (
    df_ml.select(*sel_cols)
    .toPandas()
    .assign(event_date=lambda frame: pd.to_datetime(frame["event_date"]))
    .sort_values(["product", "event_date"])
    .reset_index(drop=True)
)

# ---------- Time-based split (safe) ----------
# Primary rule: rows dated up to and including the 80th-percentile event_date
# (taken over all rows) train the model; strictly later rows are held out.
cutoff = pdf["event_date"].quantile(0.80)
on_or_before_cutoff = pdf["event_date"] <= cutoff
after_cutoff = pdf["event_date"] > cutoff
train = pdf.loc[on_or_before_cutoff].copy()
test = pdf.loc[after_cutoff].copy()

# Fallback 1: if either side came out empty, hold out only the final date
if train.empty or test.empty:
    last_date = pdf["event_date"].max()
    train = pdf.loc[pdf["event_date"] < last_date].copy()
    test = pdf.loc[pdf["event_date"] == last_date].copy()

    # Fallback 2: still degenerate, so shuffle with a fixed seed and cut 80/20 by position
    if train.empty or test.empty:
        pdf_shuf = pdf.sample(frac=1.0, random_state=42).reset_index(drop=True)
        split_idx = int(0.8 * len(pdf_shuf))
        train = pdf_shuf.iloc[:split_idx].copy()
        test = pdf_shuf.iloc[split_idx:].copy()

# Plain arrays for scikit-learn; columns follow the order of feature_cols
X_train = train[feature_cols].values
y_train = train["label"].values
X_test = test[feature_cols].values
y_test = test["label"].values

# ---------- Train a fast, strong baseline: HistGradientBoosting ----------
hgb = HistGradientBoostingRegressor(
    max_depth=6,
    max_iter=300,
    learning_rate=0.05,
    random_state=42  # fixed seed so the fit is reproducible on re-run
)
hgb.fit(X_train, y_train)

# ---------- Predict & evaluate ----------
y_pred = hgb.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
# RMSE is computed as sqrt(MSE) rather than mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises TypeError. sqrt(MSE) gives the same number on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
# MAPE (ignore zeros in actuals, which would divide by zero); NaN when every actual is zero
mask = y_test != 0
mape = float(np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100) if mask.any() else np.nan

print("HistGradientBoosting Evaluation:")
print(f"  MAE  = {mae:.4f}")
print(f"  RMSE = {rmse:.4f}")
print(f"  MAPE = {mape:.2f}%")

# Peek recent predictions: the last 20 held-out rows by (event_date, product)
preview = test.assign(pred_units=y_pred)
preview = preview.sort_values(["event_date", "product"]).tail(20)
preview_display = preview[["product", "event_date", "label", "pred_units"]]
display(preview_display)


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f5f373ebd80>
Traceback (most recent call last):
  File "/databricks/python/lib/python3.12/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/databricks/python/lib/python3.12/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.12/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/databricks/python/lib/python3.12/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'split'


HistGradientBoosting Evaluation:
  MAE  = 3.8905
  RMSE = 4.8837
  MAPE = 18.18%


product,event_date,label,pred_units
Shower Gel,2024-05-17T00:00:00.000Z,12.0,22.9087703518778
Shrimp,2024-05-17T00:00:00.000Z,24.0,22.88054190220649
Soap,2024-05-17T00:00:00.000Z,21.0,22.85163637507024
Soda,2024-05-17T00:00:00.000Z,25.0,22.837935093569424
Spinach,2024-05-17T00:00:00.000Z,15.0,22.82913132429655
Sponges,2024-05-17T00:00:00.000Z,18.0,22.94511907162759
Syrup,2024-05-17T00:00:00.000Z,17.0,22.93990659749638
Tea,2024-05-17T00:00:00.000Z,18.0,22.8248422783878
Tissues,2024-05-17T00:00:00.000Z,23.0,22.913522621985717
Toilet Paper,2024-05-17T00:00:00.000Z,15.0,22.88770154776009


In [0]:
# Ensure Spark is active (attach to a running cluster first if needed)
from pyspark.sql import SparkSession

# Reuse the session already attached to this notebook, or start one if none exists
spark = SparkSession.builder.getOrCreate()

# Smoke test: report the runtime version, then run one trivial job end to end
spark_version = spark.version
print("Spark version:", spark_version)
smoke_test_rows = spark.range(1).count()
print("Test count:", smoke_test_rows)


Spark version: 4.0.0
Test count: 1


In [0]:
# Uses the trained sklearn model `hgb` to predict tomorrow's units per product
# Saves to: retail_catalog.retail_schema.demand_forecast_gbt

from pyspark.sql import functions as F
import pandas as pd
import numpy as np

# --- Source and destination tables ---
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
pred_table = "retail_catalog.retail_schema.demand_forecast_gbt"

# --- Fail fast if the training cell has not been run in this session ---
assert "hgb" in globals(), "Model 'hgb' not found. Please re-run the previous training cell to create 'hgb'."

# Columns handed to the model; they are passed positionally, so the order matters
feature_cols = [
    "units",
    "lag_1d_units",
    "lag_7d_units",
    "ma_7d_units",
    "ma_14d_units",
    "ma_28d_units",
    "revenue",
    "avg_price_per_unit",
    "any_promo",
    "any_discount",
]

# --- Most recent feature row of every product ---
df_feat = spark.table(features_table)
latest_per_product = df_feat.groupBy("product").agg(F.max("event_date").alias("as_of_date"))

features_with_latest = df_feat.alias("f").join(latest_per_product.alias("m"), on="product", how="inner")
is_latest_row = F.col("f.event_date") == F.col("m.as_of_date")
scoring_columns = [F.col("f.product"), F.col("f.event_date").alias("as_of_date")]
scoring_columns += [F.col(f"f.{name}") for name in feature_cols]
latest_rows = features_with_latest.where(is_latest_row).select(*scoring_columns)

# --- To pandas; null features become 0, as in training ---
pdf_latest = latest_rows.na.fill(0, subset=feature_cols).toPandas()
X_latest = pdf_latest[feature_cols].values

# --- Predict; unit counts cannot be negative, so floor at 0 ---
pred = np.clip(hgb.predict(X_latest), 0, None)

# --- Forecast frame: the forecast is for the day after the as-of date ---
as_of_ts = pd.to_datetime(pdf_latest["as_of_date"])
pdf_pred = pd.DataFrame({
    "product": pdf_latest["product"],
    "as_of_date": as_of_ts,
    "forecast_date": as_of_ts + pd.Timedelta(days=1),
    "pred_units": np.round(pred, 3),
})

# --- Replace the whole Delta table (schema included) with this run's forecasts ---
df_pred = spark.createDataFrame(pdf_pred)
forecast_writer = (
    df_pred.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("forecast_date")
)
forecast_writer.saveAsTable(pred_table)

# --- Preview what was written ---
display(spark.table(pred_table).orderBy("product").limit(100))


product,as_of_date,forecast_date,pred_units
Air Freshener,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.804
Apple,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.84
BBQ Sauce,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.906
Baby Wipes,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.92
Banana,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.864
Bath Towels,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.901
Beef,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.746
Bread,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.827
Broom,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.776
Butter,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.799


In [0]:
# Reusable scorer: generates next-day (t+1) forecasts per product using the trained sklearn model `hgb`

from pyspark.sql import functions as F
import pandas as pd
import numpy as np

# 1) Model inputs, listed in the same order as the training feature list
feature_cols = [
    # current demand and its lags
    "units",
    "lag_1d_units",
    "lag_7d_units",
    # trailing averages of units
    "ma_7d_units",
    "ma_14d_units",
    "ma_28d_units",
    # revenue and price
    "revenue",
    "avg_price_per_unit",
    # promotion / discount flags
    "any_promo",
    "any_discount",
]

# 2) Scoring function
def forecast_next_day(df_features_spark, model, feature_cols):
    """
    Produce t+1 forecasts per product from each product's latest row in df_features_spark.

    Parameters
    ----------
    df_features_spark : Spark DataFrame
        Must contain `product`, `event_date` and every column named in `feature_cols`.
    model : fitted estimator
        Anything exposing `predict(X)`; X is a 2-D array whose columns follow
        the order of `feature_cols`.
    feature_cols : sequence of str
        Feature column names, in the order the model was trained on.

    Returns
    -------
    Spark DataFrame with columns: product, as_of_date, forecast_date, pred_units.
    `forecast_date` is `as_of_date` + 1 day; `pred_units` is floored at 0 and
    rounded to 3 decimals. If `df_features_spark` has no rows, an empty
    DataFrame with the same four columns is returned.

    Raises
    ------
    AssertionError
        If `model` is None.
    """
    assert model is not None, "Model is required"

    # A tuple would be read by pandas as ONE (multi-index) key, so normalise to a list
    feature_cols = list(feature_cols)

    # Use the session that owns the input instead of relying on a notebook-global `spark`
    session = df_features_spark.sparkSession

    # Latest date per product
    latest_per_product = (
        df_features_spark.groupBy("product")
        .agg(F.max("event_date").alias("as_of_date"))
    )

    # Join features for those latest rows
    latest_rows = (
        df_features_spark.alias("f")
        .join(latest_per_product.alias("m"), on="product", how="inner")
        .where(F.col("f.event_date") == F.col("m.as_of_date"))
        .select(
            F.col("f.product"),
            F.col("f.event_date").alias("as_of_date"),
            *[F.col(f"f.{c}") for c in feature_cols]
        )
        .fillna(0, subset=feature_cols)  # keep predictions robust
    )

    pdf = latest_rows.toPandas()

    # Nothing to score: predict() and schema inference both fail on zero rows,
    # so hand back an explicitly typed empty result instead
    if pdf.empty:
        from pyspark.sql.types import (
            DoubleType, StringType, StructField, StructType, TimestampType,
        )
        empty_schema = StructType([
            StructField("product", StringType(), True),
            StructField("as_of_date", TimestampType(), True),
            StructField("forecast_date", TimestampType(), True),
            StructField("pred_units", DoubleType(), True),
        ])
        return session.createDataFrame([], schema=empty_schema)

    # Predict and build output
    X = pdf[feature_cols].values
    yhat = np.clip(model.predict(X), 0, None)  # no negative units

    as_of = pd.to_datetime(pdf["as_of_date"])
    pdf_pred = pd.DataFrame({
        "product": pdf["product"],
        "as_of_date": as_of,
        "forecast_date": as_of + pd.Timedelta(days=1),
        "pred_units": np.round(yhat, 3)
    })

    return session.createDataFrame(pdf_pred)

# 3) Use the scorer on your features table and save
features_table = "retail_catalog.retail_schema.demand_daily_features_gold"
pred_table     = "retail_catalog.retail_schema.demand_forecast_gbt"

df_features = spark.table(features_table)

# Make sure your trained sklearn model 'hgb' exists in the notebook session
assert "hgb" in globals(), "Model 'hgb' not found. Re-run the training cell to create it."

df_forecast = forecast_next_day(df_features, hgb, feature_cols)

# Save idempotently. A plain append adds a second copy of the same forecast_date rows
# every time this cell is re-run (and on the first run after the overwrite cell above).
# Dynamic partition overwrite replaces only the forecast_date partition(s) present in
# df_forecast; partitions for other forecast dates stay in place, so history per
# forecast_date is still kept.
(df_forecast
 .write
 .mode("overwrite")
 .option("partitionOverwriteMode", "dynamic")
 .format("delta")
 .partitionBy("forecast_date")
 .saveAsTable(pred_table))

# Quick look
display(df_forecast.orderBy("product").limit(20))


product,as_of_date,forecast_date,pred_units
Air Freshener,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.804
Apple,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.84
BBQ Sauce,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.906
Baby Wipes,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.92
Banana,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.864
Bath Towels,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.901
Beef,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.746
Bread,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.827
Broom,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.776
Butter,2024-05-18T00:00:00.000Z,2024-05-19T00:00:00.000Z,22.799


In [0]:
from pyspark.sql import functions as F

# Item-level Silver is the input
silver_table = "retail_catalog.retail_schema.transactions_items_silver"
df_silver = spark.table(silver_table)

# One output row per (date, product, city, store_type)
store_day_grain = ["event_date", "product", "city", "store_type"]

# A row counts as promoted when `promotion` is present and is not the literal string "None"
promotion_col = F.col("promotion")
row_has_promo = promotion_col.isNotNull() & (promotion_col != "None")

store_day_totals = df_silver.groupBy(*store_day_grain).agg(
    F.sum("item_qty_est").alias("units"),
    F.sum("item_revenue_est").alias("revenue"),
    F.countDistinct("transaction_id").alias("transactions"),
    # max over 0/1 flags: 1 if at least one row in the group qualifies
    F.max(F.when(row_has_promo, 1).otherwise(0)).alias("any_promo"),
    F.max(F.col("discount_bool").cast("int")).alias("any_discount"),
)

# Average price is left null when no units were sold, avoiding a divide-by-zero
price_per_unit = F.when(F.col("units") > 0, F.col("revenue") / F.col("units"))
gold_store_daily = store_day_totals.withColumn("avg_price_per_unit", price_per_unit)

# Replace the Delta table (schema included), partitioned by day
gold_store_table = "retail_catalog.retail_schema.demand_daily_store_gold"
gold_store_writer = (
    gold_store_daily.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("event_date")
)
gold_store_writer.saveAsTable(gold_store_table)

# Preview the first rows in grain order
display(spark.table(gold_store_table).orderBy(*store_day_grain).limit(20))


event_date,product,city,store_type,units,revenue,transactions,any_promo,any_discount,avg_price_per_unit
2020-01-01,Air Freshener,Atlanta,Specialty Store,1.0,82.87,1,1,0,82.87
2020-01-01,Air Freshener,Boston,Pharmacy,1.0,1.255,1,1,1,1.255
2020-01-01,Air Freshener,Boston,Specialty Store,1.0,17.85,1,0,1,17.85
2020-01-01,Air Freshener,Chicago,Warehouse Club,1.0,8.59,1,0,1,8.59
2020-01-01,Air Freshener,Houston,Convenience Store,2.0,40.72,2,1,1,20.36
2020-01-01,Air Freshener,Houston,Specialty Store,1.0,11.352,1,1,1,11.352
2020-01-01,Air Freshener,Los Angeles,Pharmacy,1.0,1.06,1,1,0,1.06
2020-01-01,Air Freshener,Miami,Department Store,1.0,17.416,1,1,0,17.416
2020-01-01,Air Freshener,Miami,Pharmacy,2.0,37.722,2,1,1,18.861
2020-01-01,Air Freshener,Miami,Supermarket,2.0,55.762,2,1,1,27.881


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W

# Read store-level Gold
gold_store_table = "retail_catalog.retail_schema.demand_daily_store_gold"
df_store = spark.table(gold_store_table)

# Window: per (product, city, store_type), ordered by a calendar-day number.
# Ordering by a number lets frames be expressed in DAYS via rangeBetween.
# Row offsets (lag(n) / rowsBetween) count observations instead: when a series has
# gaps between dates, "7 rows back" is not "7 days back" and the *_7d_* / *_14d_* /
# *_28d_* columns would span arbitrary periods.
day_number = F.datediff(F.col("event_date"), F.to_date(F.lit("1970-01-01")))
w = W.partitionBy("product", "city", "store_type").orderBy(day_number)


def _value_days_ago(col_name, n_days):
    """Value of `col_name` exactly `n_days` calendar days earlier; null if that day has no row."""
    # max() only picks the value out of the one-day frame
    # (assumes one row per series per day — TODO confirm against the Gold table)
    return F.max(col_name).over(w.rangeBetween(-n_days, -n_days))


def _trailing_avg(col_name, n_days):
    """Mean of `col_name` over rows dated within the last `n_days` calendar days, current day included."""
    return F.avg(col_name).over(w.rangeBetween(-(n_days - 1), 0))


# Add lags & moving averages for units and revenue.
# NOTE(review): days without a row are absent rather than zero, so the averages are
# over the days that have rows inside the window. Densify the calendar upstream if
# missing days should count as 0.
df_store_feats = (
    df_store
    .withColumn("lag_1d_units", _value_days_ago("units", 1))
    .withColumn("lag_7d_units", _value_days_ago("units", 7))
    .withColumn("ma_7d_units",  _trailing_avg("units", 7))
    .withColumn("ma_14d_units", _trailing_avg("units", 14))
    .withColumn("ma_28d_units", _trailing_avg("units", 28))

    .withColumn("lag_1d_revenue", _value_days_ago("revenue", 1))
    .withColumn("ma_7d_revenue",  _trailing_avg("revenue", 7))
    .withColumn("ma_14d_revenue", _trailing_avg("revenue", 14))
    .withColumn("ma_28d_revenue", _trailing_avg("revenue", 28))
)

# Save as Delta
features_store_table = "retail_catalog.retail_schema.demand_daily_store_features_gold"
(df_store_feats
 .write
 .mode("overwrite")
 .format("delta")
 .partitionBy("event_date")
 .option("overwriteSchema", "true")
 .saveAsTable(features_store_table))

# Quick look
display(spark.table(features_store_table).orderBy("event_date", "product", "city", "store_type").limit(20))


event_date,product,city,store_type,units,revenue,transactions,any_promo,any_discount,avg_price_per_unit,lag_1d_units,lag_7d_units,ma_7d_units,ma_14d_units,ma_28d_units,lag_1d_revenue,ma_7d_revenue,ma_14d_revenue,ma_28d_revenue
2020-01-01,Air Freshener,Atlanta,Specialty Store,1.0,82.87,1,1,0,82.87,,,1.0,1.0,1.0,,82.87,82.87,82.87
2020-01-01,Air Freshener,Boston,Pharmacy,1.0,1.255,1,1,1,1.255,,,1.0,1.0,1.0,,1.255,1.255,1.255
2020-01-01,Air Freshener,Boston,Specialty Store,1.0,17.85,1,0,1,17.85,,,1.0,1.0,1.0,,17.85,17.85,17.85
2020-01-01,Air Freshener,Chicago,Warehouse Club,1.0,8.59,1,0,1,8.59,,,1.0,1.0,1.0,,8.59,8.59,8.59
2020-01-01,Air Freshener,Houston,Convenience Store,2.0,40.72,2,1,1,20.36,,,2.0,2.0,2.0,,40.72,40.72,40.72
2020-01-01,Air Freshener,Houston,Specialty Store,1.0,11.352,1,1,1,11.352,,,1.0,1.0,1.0,,11.352,11.352,11.352
2020-01-01,Air Freshener,Los Angeles,Pharmacy,1.0,1.06,1,1,0,1.06,,,1.0,1.0,1.0,,1.06,1.06,1.06
2020-01-01,Air Freshener,Miami,Department Store,1.0,17.416,1,1,0,17.416,,,1.0,1.0,1.0,,17.416,17.416,17.416
2020-01-01,Air Freshener,Miami,Pharmacy,2.0,37.722,2,1,1,18.861,,,2.0,2.0,2.0,,37.722,37.722,37.722
2020-01-01,Air Freshener,Miami,Supermarket,2.0,55.762,2,1,1,27.881,,,2.0,2.0,2.0,,55.762,55.762,55.762
