In [0]:
# Databricks Notebook: 1_Data_Ingestion_and_Preprocessing
# This notebook reads raw e-commerce sales data from a CSV source,
# cleans it (date/price type conversions, dropping rows with missing key fields) and saves the result as a Delta table.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_date, year

# Initialize Spark session 
spark = SparkSession.builder.appName("EcommerceDataIngestion").getOrCreate()

# Define data paths
raw_data_path = "dbfs:/mnt/data/raw/ecommerce_sales.csv"  
delta_table_path = "dbfs:/mnt/data/delta/ecommerce_sales_cleaned"

# Read raw data
df_raw = spark.read.option("header", "true").csv(raw_data_path)
display(df_raw)

order_id,sale_date,price,product_id,category
O1001,01/15/2023,$120.50,P101,Electronics
O1001,01/15/2023,$45.00,P102,Books
O1001,01/15/2023,$85.99,P103,Clothing
O1002,01/16/2023,$50.00,P102,Books
O1002,01/16/2023,$110.00,P104,Home
O1002,01/16/2023,$99.99,P105,Beauty
O1002,01/16/2023,$210.00,P101,Electronics
O1003,01/17/2023,$75.00,P103,Clothing
O1003,01/17/2023,$60.00,P106,Sports
O1004,01/18/2023,$120.00,P101,Electronics


In [0]:
# Data cleaning: Convert date columns and remove unwanted characters from price fields
df_cleaned = (
    df_raw
    .withColumn("sale_date", to_date(col("sale_date"), "MM/dd/yyyy"))
    .withColumn("price", regexp_replace(col("price"), "[$,]", "").cast("float"))
    .dropna(subset=["sale_date", "price", "product_id"])
)

# add extra transformations (e.g., categorization, computed columns)
df_cleaned = df_cleaned.withColumn("year", col("sale_date").substr(1,4).cast("int"))

In [0]:
# Write the cleaned data to a Delta table for further analysis
df_cleaned.write.mode("overwrite").format("delta").save(delta_table_path)
spark.sql("DROP TABLE IF EXISTS ecommerce_sales_cleaned")
spark.sql(f"CREATE TABLE ecommerce_sales_cleaned USING DELTA LOCATION '{delta_table_path}'")

display(df_cleaned)

order_id,sale_date,price,product_id,category,year
O1001,2023-01-15,120.5,P101,Electronics,2023
O1001,2023-01-15,45.0,P102,Books,2023
O1001,2023-01-15,85.99,P103,Clothing,2023
O1002,2023-01-16,50.0,P102,Books,2023
O1002,2023-01-16,110.0,P104,Home,2023
O1002,2023-01-16,99.99,P105,Beauty,2023
O1002,2023-01-16,210.0,P101,Electronics,2023
O1003,2023-01-17,75.0,P103,Clothing,2023
O1003,2023-01-17,60.0,P106,Sports,2023
O1004,2023-01-18,120.0,P101,Electronics,2023
