In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable


# --- Configuration -----------------------------------------------------------
CATALOG_DB = "sales_db"
GOLD_SCHEMA = "gold"
SILVER_TABLE = f"{CATALOG_DB}.silver.orders_clean"

# Make sure the target namespaces exist before anything is written to them.
# NOTE(review): CATALOG_DB is used as the first part of three-part table names
# below, yet the first statement issues CREATE DATABASE for it -- confirm
# whether a catalog or a schema is what is meant here.
for _ddl in (
    f"CREATE DATABASE IF NOT EXISTS {CATALOG_DB}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG_DB}.{GOLD_SCHEMA}",
):
    spark.sql(_ddl)

# Fully-qualified names of the gold-layer tables maintained by this notebook.
_gold_prefix = f"{CATALOG_DB}.{GOLD_SCHEMA}"
dim_customer_table = f"{_gold_prefix}.dim_customer"
dim_product_table = f"{_gold_prefix}.dim_product"
dim_geo_table = f"{_gold_prefix}.dim_geo"
dim_date_table = f"{_gold_prefix}.dim_date"
fact_sales_table = f"{_gold_prefix}.fact_sales"

# Source DataFrame: every dimension and the fact below are derived from it.
s = spark.table(SILVER_TABLE)



In [0]:

# DIM TABLES

# Customer dimension: one row per customer_id.
# Rows with a NULL key are removed first. The upsert further down matches on
# `tgt.customer_id = src.customer_id`, which is never true for NULL, so a
# NULL-keyed row would be inserted again on every run of the notebook.
# NOTE(review): dropDuplicates keeps an arbitrary row per key; if a customer's
# name or segment varies across silver rows the surviving values are not
# deterministic -- confirm the silver layer guarantees one value per key.
dim_customer_df = (
    s.select("customer_id", "customer_name", "segment")
     .dropna(subset=["customer_id"])
     .dropDuplicates(["customer_id"])
)

# Product dimension: one row per product_id.
# Rows with a NULL key are removed first. The upsert further down matches on
# `tgt.product_id  = src.product_id`, which is never true for NULL, so a
# NULL-keyed row would be inserted again on every run of the notebook.
# NOTE(review): dropDuplicates keeps an arbitrary row per key; if a product's
# descriptive attributes vary across silver rows the surviving values are not
# deterministic -- confirm the silver layer guarantees one value per key.
dim_product_df = (
    s.select("product_id", "product_name", "category", "sub_category")
     .dropna(subset=["product_id"])
     .dropDuplicates(["product_id"])
)

# Columns that together identify a geography. Their order is part of the key:
# it fixes both the hash input and the column order of dim_geo.
GEO_KEY_COLUMNS = ["country", "state", "city", "postal_code", "region"]


def _geo_id_expr():
    """Return the Column expression for the ``geo_id`` surrogate key.

    The key is the SHA-256 hex digest of the GEO_KEY_COLUMNS values joined
    with ``"||"``. NULLs are replaced by empty strings first, because
    ``concat_ws`` skips NULL inputs and would otherwise shift the remaining
    values into the wrong position.

    Both the geography dimension and the fact table must use this single
    definition so that their ``geo_id`` values always agree.
    """
    parts = [F.coalesce(F.col(name), F.lit("")) for name in GEO_KEY_COLUMNS]
    return F.sha2(F.concat_ws("||", *parts), 256)


# Geography dimension: one row per distinct geo_id.
dim_geo_df = (
    s.withColumn("geo_id", _geo_id_expr())
     .select("geo_id", *GEO_KEY_COLUMNS)
     .dropDuplicates(["geo_id"])
)

# Date dimension: one row per distinct non-null order_date, plus calendar
# attributes derived from it.
dim_date_df = (
    s.select(F.col("order_date").alias("date"))
     .dropna(subset=["date"])
     .dropDuplicates(["date"])
     .withColumn("year", F.year("date"))
     .withColumn("month", F.month("date"))
     .withColumn("month_start", F.trunc("date", "month"))
     .withColumn("quarter", F.quarter("date"))
)


# FACT TABLES

# Sales fact: one row per silver row, carrying the dimension keys
# (date, customer_id, product_id, geo_id) and the two measures.
fact_sales_df = (
    s.withColumn("geo_id", _geo_id_expr())
     .select(
         "order_line_key",
         F.col("order_date").alias("date"),
         "customer_id",
         "product_id",
         "geo_id",
         "sales",
         "ship_delay_days",
     )
)

# CREATE TABLE IF NOT EXISTS + MERGE

def create_if_not_exists_and_merge(df, table_full_name, merge_condition):
    """Upsert ``df`` into a Delta table, creating the table first if needed.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Source rows. When the table does not exist yet, the schema of ``df``
        is used to create it.
    table_full_name : str
        Name of the target table as understood by the session catalog.
    merge_condition : str
        SQL predicate used to match rows; the target is aliased ``tgt`` and
        the source ``src``.

    Returns
    -------
    None

    Notes
    -----
    Matched target rows are updated with all source columns and unmatched
    source rows are inserted. Relies on the notebook-global ``spark`` session.
    """
    # Use the public catalog API rather than the private py4j handle
    # (`spark._jsparkSession`), which is an implementation detail and is not
    # available on Spark Connect sessions.
    if not spark.catalog.tableExists(table_full_name):
        # Write zero rows so that only the schema is materialised; the data
        # itself is loaded by the MERGE below.
        empty_frame = df.limit(0)
        (
            empty_frame.write
            .format("delta")
            .mode("overwrite")
            .saveAsTable(table_full_name)
        )

    target = DeltaTable.forName(spark, table_full_name)
    (
        target.alias("tgt")
        .merge(df.alias("src"), merge_condition)
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )


# UPSERT GOLD TABLES

# Each entry: (source DataFrame, target table name, MERGE match condition).
# The entries are processed in the order listed.
_upsert_plan = [
    (dim_customer_df, dim_customer_table, "tgt.customer_id = src.customer_id"),
    (dim_product_df, dim_product_table, "tgt.product_id  = src.product_id"),
    (dim_geo_df, dim_geo_table, "tgt.geo_id      = src.geo_id"),
    (dim_date_df, dim_date_table, "tgt.date        = src.date"),
    (fact_sales_df, fact_sales_table, "tgt.order_line_key = src.order_line_key"),
]

for _source_df, _target_name, _match_on in _upsert_plan:
    create_if_not_exists_and_merge(_source_df, _target_name, _match_on)



In [0]:
# Report the current row count of every gold table, one line per table.
_count_report = (
    ("dim_customer:", dim_customer_table),
    ("dim_product :", dim_product_table),
    ("dim_geo     :", dim_geo_table),
    ("dim_date    :", dim_date_table),
    ("fact_sales  :", fact_sales_table),
)
for _label, _table_name in _count_report:
    print(_label, spark.table(_table_name).count())

dim_customer: 793
dim_product : 1861
dim_geo     : 628
dim_date    : 1230
fact_sales  : 9792


## DIMENSIONS AND FACT TABLES

### Table dim_customer

In [0]:
# Show five rows of the customer dimension (no ORDER BY in the query).
query = f"SELECT * FROM {dim_customer_table} limit 5"
spark.sql(query).show()
  

+-----------+-------------+--------+
|customer_id|customer_name| segment|
+-----------+-------------+--------+
|   AA-10315|   Alex Avila|Consumer|
|   AA-10375| Allen Armold|Consumer|
|   AA-10480| Andrew Allen|Consumer|
|   AA-10645|Anna Andreadi|Consumer|
|   AB-10015|Aaron Bergman|Consumer|
+-----------+-------------+--------+



### Table dim_date

In [0]:
# Show five rows of the date dimension (no ORDER BY in the query).
query = f"SELECT * FROM {dim_date_table} limit 5"
spark.sql(query).show()


+----------+----+-----+-----------+-------+
|      date|year|month|month_start|quarter|
+----------+----+-----+-----------+-------+
|2018-05-28|2018|    5| 2018-05-01|      2|
|2018-08-10|2018|    8| 2018-08-01|      3|
|2018-03-17|2018|    3| 2018-03-01|      1|
|2018-06-06|2018|    6| 2018-06-01|      2|
|2018-10-05|2018|   10| 2018-10-01|      4|
+----------+----+-----+-----------+-------+



### Table dim_product

In [0]:
# Show five rows of the product dimension (no ORDER BY in the query).
query = f"SELECT * FROM {dim_product_table} limit 5"
spark.sql(query).show()

+---------------+--------------------+---------+------------+
|     product_id|        product_name| category|sub_category|
+---------------+--------------------+---------+------------+
|FUR-BO-10000112|Bush Birmingham C...|Furniture|   Bookcases|
|FUR-BO-10000330|Sauder Camden Cou...|Furniture|   Bookcases|
|FUR-BO-10000362|Sauder Inglewood ...|Furniture|   Bookcases|
|FUR-BO-10000468|O'Sullivan 2-Shel...|Furniture|   Bookcases|
|FUR-BO-10000711|Hon Metal Bookcas...|Furniture|   Bookcases|
+---------------+--------------------+---------+------------+



### Table dim_geo

In [0]:
# Show five rows of the geography dimension (no ORDER BY in the query).
query = f"SELECT * FROM {dim_geo_table} limit 5"
spark.sql(query).show()

+--------------------+-------------+--------+--------------+-----------+-------+
|              geo_id|      country|   state|          city|postal_code| region|
+--------------------+-------------+--------+--------------+-----------+-------+
|001d88c17fc1af650...|United States|Michigan|      Westland|      48185|Central|
|002c9d42d7bb4cd8c...|United States|Michigan|Mount Pleasant|      48858|Central|
|017678bdc069ad70a...|United States| Florida|       Margate|      33063|  South|
|022cfd624bca580c5...|United States|Oklahoma|        Edmond|      73034|Central|
|02d25de51e3f5ecd9...|United States|Missouri|     Gladstone|      64118|Central|
+--------------------+-------------+--------+--------------+-----------+-------+



### Table fact_sales

In [0]:
# Show five rows of the sales fact table (no ORDER BY in the query).
query = f"SELECT * FROM {fact_sales_table} limit 5"
spark.sql(query).show()

+--------------------+----------+-----------+---------------+--------------------+-------+---------------+
|      order_line_key|      date|customer_id|     product_id|              geo_id|  sales|ship_delay_days|
+--------------------+----------+-----------+---------------+--------------------+-------+---------------+
|008c40574e2e0ebe9...|2015-11-22|   RA-19885|OFF-EN-10004955|f1045d78a2b3fa3ad...|  16.23|              5|
|00ab76da6a4ee3084...|2015-11-11|   KD-16270|OFF-BI-10001359|9777f3ace55c675b1...| 896.99|              4|
|00c7c944d4c717398...|2015-11-24|   MY-18295|TEC-PH-10000560|bfdea3a205bdda234...|1049.97|              0|
|010a494502ea5d1f5...|2015-12-02|   DP-13165|OFF-BI-10000050|f23b3b1b97a3272dd...|  46.72|              2|
|0157dab6d45983e26...|2015-03-01|   VF-21715|FUR-CH-10000863|ee6e259246b919a6f...| 634.12|              4|
+--------------------+----------+-----------+---------------+--------------------+-------+---------------+

