In [0]:
from pyspark.sql.functions import col, trim, lower, to_date, to_timestamp
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import regexp_extract


**Cleaning the Customers table**

In [0]:
# Build the silver customers snapshot from the bronze raw table.
# The chain is wrapped in parentheses instead of using backslash
# continuations, so a missing "\" can no longer cut the chain short.
df_customer = (
    spark.table("retail_demo.bronze.customers_raw")
    # Strip surrounding whitespace from the country value.
    .withColumn("country", trim(col("country")))
    # Trim and lower-case marital_status.
    .withColumn("marital_status", lower(trim(col("marital_status"))))
    # Reduce snapshot_ts to a date.
    .withColumn("snapshot_ts", to_date(col("snapshot_ts")))
    # Stamp each row with the time this job ran.
    .withColumn("ingest_ts", current_timestamp())
    # Keep one row per customer per snapshot date.
    .dropDuplicates(["customer_id", "snapshot_ts"])
    # Remove the _rescued_data and _metadata columns before publishing.
    .drop("_rescued_data", "_metadata")
)

# Replace the contents of the silver Delta table with the cleaned snapshot.
(
    df_customer.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_demo.silver.customers_snapshot")
)


**Cleaning the Products table**

In [0]:
# Build the silver products table from the bronze raw table:
# normalize the product name, cast the price, and deduplicate on product_id.
prod_df = (
    spark.table("retail_demo.bronze.products_raw")
    # Lower-case the name, then trim surrounding whitespace.
    .withColumn("product_name", trim(lower(col("product_name"))))
    # Cast the price to a decimal with two fractional digits.
    .withColumn("unit_price", col("unit_price").cast("decimal(10,2)"))
    # Keep a single row per product_id.
    .dropDuplicates(["product_id"])
)

# Replace the contents of the silver Delta table with the cleaned products.
(
    prod_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_demo.silver.products")
)

**Cleaning the Orders table**

In [0]:
# Build the silver orders table from the bronze raw table:
# parse the order timestamp, cast numeric columns, deduplicate on order_id.
df_orders = (
    spark.table("retail_demo.bronze.orders_raw")
    .withColumn("order_ts", to_timestamp(col("order_ts")))
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("amount", col("amount").cast("decimal(12,2)"))
    # Keep a single row per order_id.
    .dropDuplicates(["order_id"])
)

# The write below partitions by order_date, but nothing above creates that
# column. If the bronze table does not already supply it, derive it from
# order_ts so the partitioned write cannot fail on a missing column.
# The check is case-insensitive so an existing column is never overwritten.
# NOTE(review): confirm whether orders_raw is expected to carry order_date.
if "order_date" not in (name.lower() for name in df_orders.columns):
    df_orders = df_orders.withColumn("order_date", to_date(col("order_ts")))

# Replace the contents of the silver Delta table, partitioned by order date.
(
    df_orders.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("order_date")
    .saveAsTable("retail_demo.silver.orders")
)