In [None]:
# Quick first look at the raw CSV: the first line is treated as the header and
# no schema inference is requested for this read.
df = spark.read.csv(
    "/Volumes/retail_catalog/retail_schema/retail_volume/Retail_Transactions_Dataset.csv",
    header=True,
)
display(df.limit(5))

In [None]:
# Source CSV location, laid out as /Volumes/<catalog>/<schema>/<volume>/<file>.
# Substitute your own catalog, schema and volume names here.
file_path = "/Volumes/retail_catalog/retail_schema/retail_volume/Retail_Transactions_Dataset.csv"

# Read the file into a Spark DataFrame: first line is the header, and Spark
# infers each column's type from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview a handful of rows.
display(df.limit(5))

# Print the column names and the types that were inferred.
df.printSchema()


In [None]:

from pyspark.sql import functions as F, types as T

# 1) Derive two columns from Date:
#      event_ts   - Date parsed as a timestamp using the pattern dd-MM-yyyy HH:mm
#      event_date - the calendar-date part of event_ts
#    NOTE(review): the load cell reads with inferSchema, so Date may already be
#    typed rather than a string - confirm the pattern matches the file's values.
df = df.withColumn("event_ts", F.to_timestamp("Date", "dd-MM-yyyy HH:mm"))
df = df.withColumn("event_date", F.to_date(F.col("event_ts")))
display(df.limit(5))

In [None]:

# 2) Normalize text fields: apply initcap (first letter of each word upper-cased,
#    the rest lower-cased) to each source column and store it under the
#    lower-case target name.
#    NOTE(review): the target names differ from the source names only by letter
#    case; whether they replace the originals or sit beside them depends on
#    spark.sql.caseSensitive - confirm for this cluster.
for source_name, target_name in [
    ("City", "city"),
    ("Store_Type", "store_type"),
    ("Payment_Method", "payment_method"),
]:
    df = df.withColumn(target_name, F.initcap(source_name))
display(df.limit(5))

In [None]:

# 3) Parse the Product column into an array<string> column named "products".
#    Product presumably holds a list literal with single-quoted items,
#    e.g. ['Milk', 'Bread'] - verify against the file.
#    The JSON parser accepts single-quoted strings (allowSingleQuotes), so the
#    text is parsed as-is. Rewriting every ' to " first would corrupt any product
#    name that itself contains an apostrophe or a double quote (for example
#    "Men's Razor"), making from_json return null for the whole basket.
#    Rows that still fail to parse get products = null.
df = df.withColumn(
    "products",
    F.from_json(
        F.col("Product"),
        T.ArrayType(T.StringType()),
        {"allowSingleQuotes": "true"},
    ),
)
display(df.limit(5))

In [None]:

# 4) Turn each basket into one row per product: the "products" array is unpacked
#    into a new "product" column. explode_outer (unlike explode) still emits a
#    row, with product = null, when the array is null or empty.
df_items = df.withColumn(
    "product",
    F.explode_outer(F.col("products")),
)
display(df_items.limit(5))


In [None]:
from pyspark.sql import functions as F

# Derive per-item estimates from the number of entries in the parsed "products" array.
basket_size = F.col("computed_total_items")
has_products = basket_size > 0

# Number of products listed in the basket.
df_items = df_items.withColumn("computed_total_items", F.size(F.col("products")))

# Quantity estimate: each listed product counts as exactly one unit.
# A when() with no otherwise() yields null for rows that fail the condition.
df_items = df_items.withColumn("item_qty_est", F.when(has_products, F.lit(1.0)))

# Revenue estimate: the basket's Total_Cost divided evenly across its products;
# null when the basket has no products.
df_items = df_items.withColumn(
    "item_revenue_est",
    F.when(has_products, F.col("Total_Cost") / basket_size),
)

display(df_items.limit(10))




In [None]:
# Keep only rows whose basket has at least one parsed product; rows where the
# count is null, zero or negative are dropped.
df_items = df_items.where("computed_total_items > 0")