In [0]:
import re
 
def rename_columns_to_snake_case(df):
    """Return ``df`` with every column name converted from camel case to snake case.

    An underscore is inserted before each capitalised word and between a
    lower-case letter or digit and a following capital; the name is then
    lower-cased and spaces are turned into underscores. Underscores are not
    collapsed here, so a name such as ``"Order Date"`` becomes ``"order__date"``.

    Args:
        df: Object exposing ``columns`` and ``toDF(*names)``.

    Returns:
        Whatever ``df.toDF`` returns for the converted names, in the original
        column order.
    """
    def _to_snake(original):
        # Break before a capitalised word: "storeName" -> "store_Name".
        word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', original)
        # Break between lower-case/digit and a capital: "ProductID" -> "Product_ID".
        fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
        lowered = fully_split.lower()
        return lowered.replace(" ", "_")

    snake_names = list(map(_to_snake, df.columns))
    return df.toDF(*snake_names)
 
def clean_column_names(df):
    """Return ``df`` with awkward characters in column names replaced.

    Each space, comma, semicolon, brace, parenthesis, newline, tab or equals
    sign in a column name becomes an underscore, runs of underscores are
    collapsed to a single one, and the result is lower-cased. Leading and
    trailing underscores are kept. Column order is unchanged.

    Args:
        df: Object exposing ``columns`` and ``toDF(*names)``.

    Returns:
        Whatever ``df.toDF`` returns for the sanitised names.
    """
    def _sanitise(column):
        replaced = re.sub(r"[ ,;{}()\n\t=]", "_", column)
        collapsed = re.sub("_+", "_", replaced)
        return collapsed.lower()

    return df.toDF(*map(_sanitise, df.columns))

In [0]:
# Read each source Delta table, then normalise its column names: snake-case
# conversion first, followed by the character clean-up (which also collapses
# any doubled underscores the first step produced).
product_df, store_df, sales_df = (
    clean_column_names(
        rename_columns_to_snake_case(
            spark.read.format("delta").table(source_table)
        )
    )
    for source_table in (
        "assignment_adf.sales_view.product",
        "assignment_adf.sales_view.store",
        "assignment_adf.sales_view.sales",
    )
)

# Show the three normalised frames for visual inspection.
display(product_df)
display(store_df)
display(sales_df)

In [0]:
# Product columns carried forward, in output order.
product_columns = [
    "store_id",
    "product_id",
    "product_name",
    "product_code",
    "description",
    "category_id",
    "price",
    "stock_quantity",
    "supplier_id",
    "created_at",
    "updated_at",
    "image_url",
    "weight",
    "expiry_date",
    "is_active",
    "tax_rate",
]

# The audit timestamps are prefixed so their origin stays clear after joins.
df_product_sel = (
    product_df
    .select(*product_columns)
    .withColumnRenamed("created_at", "product_created_at")
    .withColumnRenamed("updated_at", "product_updated_at")
)

# Store columns carried forward; store_id is the join key.
store_columns = ["store_id", "store_name", "location", "manager_name"]
df_store_sel = store_df.select(*store_columns)

# Attach store details to each product row; products without a matching
# store are dropped by the inner join.
df_store_product = df_product_sel.join(df_store_sel, on="store_id", how="inner")

In [0]:
# Sales columns carried forward, in output order; product_id is the key used
# to join against the store/product frame.
sales_columns = [
    "order_date",
    "category",
    "city",
    "customer_id",
    "order_id",
    "product_id",
    "profit",
    "region",
    "sales",
    "segment",
    "ship_date",
    "ship_mode",
    "latitude",
    "longitude",
]
df_sales_sel = sales_df.select(*sales_columns)

In [0]:
# Output columns grouped by the frame they come from; the concatenated order
# is the column order of the final table.
final_sales_columns = [
    "order_date",
    "category",
    "city",
    "customer_id",
    "order_id",
    "product_id",
    "profit",
    "region",
    "sales",
    "segment",
    "ship_date",
    "ship_mode",
    "latitude",
    "longitude",
]
final_store_columns = ["store_name", "location", "manager_name"]
final_product_columns = ["product_name", "price", "stock_quantity", "image_url"]

# Enrich each sale with its product and store attributes. Sales whose
# product_id has no match are dropped by the inner join.
# NOTE(review): if a product_id occurs under more than one store_id in the
# product table, each matching sale is repeated once per store — confirm that
# this is intended.
sales_with_store_product = df_sales_sel.join(
    df_store_product, on="product_id", how="inner"
)
df_final = sales_with_store_product.select(
    *final_sales_columns,
    *final_store_columns,
    *final_product_columns,
)


In [0]:
from delta.tables import DeltaTable
# Fully qualified name of the gold table that receives the joined result.
# It must be assigned before anything references it, otherwise a fresh-kernel
# run fails with NameError.
table_name = "gold_transformation.sales_view.StoreProductSalesAnalysis"

# Writing with mode("overwrite") + saveAsTable creates the table when it does
# not exist and replaces its contents when it does, so no separate
# DeltaTable.createOrReplace(...) builder call is required (a builder without
# .execute() performs no action anyway).
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(table_name)
)

In [0]:
# Re-read the gold table and export a copy as Delta files to the ADLS
# container path, replacing whatever is already stored there.
df = spark.table("gold_transformation.sales_view.StoreProductSalesAnalysis")
(
    df.write
    .mode("overwrite")
    .format("delta")
    .save("abfss://databricks-layers@storagekevinav.dfs.core.windows.net/storeproductsalesanalysis/")
)