# Dimension Modeling

## Dim_model Sink - Initial and Incremental run

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, IntegerType, LongType
from pyspark.sql.window import Window

# Get source data
df_src = spark.sql('''
    SELECT DISTINCT             product_key,
                                product_code,
                                product_name,
                                brand,
                                color,
                                weight,
                                weight_unit,
                                price,
                                cost
    FROM parquet.`abfss://silver@contosoprojectstorage.dfs.core.windows.net/contoso_sales`
''')

# Initialize target DataFrame
if spark.catalog.tableExists('contoso_catalog.gold.dim_product'):
    df_tgt = spark.sql('''
        SELECT surrogate_key, product_key,
                                product_code,
                                product_name,
                                brand,
                                color,
                                weight,
                                weight_unit,
                                price,
                                cost
        FROM contoso_catalog.gold.dim_product
    ''')
else:
    df_tgt = df_src.withColumn("surrogate_key", F.lit(0).cast(LongType()))\
                   .withColumn("is_current", F.lit(1).cast(IntegerType()))\
                   .withColumn("start_date", F.current_date())\
                   .withColumn("end_date", F.lit(None).cast(DateType()))\
                   .filter("1 = 0")  # Empty Schema

def generate_surrogate_key(df, start_value):
    w = Window.orderBy("product_key")
    return df.withColumn("surrogate_key", F.row_number().over(w) + F.lit(start_value))

def apply_scd_type2_changes(df_src, df_tgt):
    if df_tgt.rdd.isEmpty():
        return generate_surrogate_key(df_src, 0)\
               .withColumn("start_date", F.current_date())\
               .withColumn("end_date", F.lit(None).cast(DateType()))\
               .withColumn("is_current", F.lit(1).cast(IntegerType()))

    max_surrogate_key = df_tgt.agg(F.max("surrogate_key")).collect()[0][0]
    
    src = df_src.alias("src")
    tgt = df_tgt.filter(F.col("is_current") == 1).alias("tgt")
    
    joined_df = src.join(tgt, "product_key", "outer")
    
    new_records = joined_df.filter(F.col("tgt.surrogate_key").isNull())
    
    if new_records.count() > 0:
        new_records = generate_surrogate_key(new_records, max_surrogate_key + 1)\
                     .withColumn("start_date", F.current_date())\
                     .withColumn("end_date", F.lit(None).cast(DateType()))\
                     .withColumn("is_current", F.lit(1).cast(IntegerType()))

    changed_records = joined_df.filter((F.col("tgt.surrogate_key").isNotNull()) &
                                       (
                                        (F.coalesce(src.product_code != tgt.product_code, F.lit(False)))|
                                        (F.coalesce(src.product_name != tgt.product_name, F.lit(False)))|
                                        (F.coalesce(src.brand != tgt.brand, F.lit(False)))|
                                        (F.coalesce(src.color != tgt.color, F.lit(False)))|
                                        (F.coalesce(src.weight != tgt.weight, F.lit(False)))|
                                        (F.coalesce(src.weight_unit != tgt.weight_unit, F.lit(False)))|
                                        (F.coalesce(src.price != tgt.price, F.lit(False)))|
                                        (F.coalesce(src.cost != tgt.cost, F.lit(False)))
                                       )
                                        )   
    
    if changed_records.count() > 0:
        new_versions = changed_records.select("src.product_key", "src.product_code", "src.product_name", "src.brand", "src.color", "src.weight", "src.weight_unit", "src.price", "src.cost")
        
        start_key = max_surrogate_key + (new_records.count() if new_records.count() > 0 else 0) + 1
        new_versions = generate_surrogate_key(new_versions, start_key)\
                       .withColumn("start_date", F.current_date())\
                       .withColumn("end_date", F.lit(None).cast(DateType()))\
                       .withColumn("is_current", F.lit(1).cast(IntegerType()))

        old_versions = df_tgt.join(changed_records.select("product_key"), "product_key", "inner")\
                             .withColumn("end_date", F.when(F.col("is_current") == 1, F.date_sub(F.current_date(), 1)).otherwise(F.col("end_date")))\
                             .withColumn("is_current", F.when(F.col("is_current") == 1, F.lit(0)).otherwise(F.col("is_current")))

        unchanged_records = df_tgt.join(changed_records.select("product_key"), "product_key", "leftanti")
        
        if new_records.count() > 0:
            final_df = unchanged_records\
                                    .unionAll(old_versions)\
                                    .unionAll(new_records)\
                                    .unionAll(new_versions)
        else:
            final_df = unchanged_records\
                                    .unionAll(old_versions)\
                                    .unionAll(new_versions)

    
    return final_df

# Apply SCD Type 2 changes
result_df = apply_scd_type2_changes(df_src, df_tgt)

display(result_df)



product_key,product_code,product_name,brand,color,weight,weight_unit,price,cost,surrogate_key,start_date,end_date,is_current
1,101001,Contoso 512MB MP3 Player E51,Contoso,Silver,4.8,ounces,12.99,6.62,1,2025-01-11,,1
4,101004,Contoso 2G MP3 Player E200,Contoso,Silver,4.5,ounces,21.57,11.0,2,2025-01-11,,1
5,101005,Contoso 2G MP3 Player E200,Contoso,Red,2.4,ounces,21.57,11.0,3,2025-01-11,,1
6,101006,Contoso 2G MP3 Player E200,Contoso,Black,8.8,ounces,21.57,11.0,4,2025-01-11,,1
7,101007,Contoso 2G MP3 Player E200,Contoso,Blue,2.1,ounces,21.57,11.0,5,2025-01-11,,1
8,101008,Contoso 4G MP3 Player E400,Contoso,Silver,5.6,ounces,59.99,30.58,6,2025-01-11,,1
9,101009,Contoso 4G MP3 Player E400,Contoso,Black,2.1,ounces,59.99,30.58,7,2025-01-11,,1
11,101011,Contoso 4G MP3 Player E400,Contoso,Orange,14.1,ounces,59.99,30.58,8,2025-01-11,,1
15,101015,Contoso 4GB Flash MP3 Player E401,Contoso,White,2.1,ounces,77.68,35.72,9,2025-01-11,,1
16,101016,Contoso 8GB Super-Slim MP3/Video Player M800,Contoso,White,11.0,ounces,109.95,50.56,10,2025-01-11,,1


In [0]:
final_df.write.format('delta')\
                .mode('append')\
                    .option('path', 'abfss://gold@contosoprojectstorage.dfs.core.windows.net/dim_product')\
                        .saveAsTable('contoso_catalog.gold.dim_product')



In [0]:
%sql
select * from contoso_catalog.gold.dim_product

product_key,product_code,product_name,brand,color,weight,weight_unit,price,cost,is_current,start_date,end_date
1,101001,Contoso 512MB MP3 Player E51,Contoso,Silver,4.8,ounces,12.99,6.62,1,2025-01-10,
4,101004,Contoso 2G MP3 Player E200,Contoso,Silver,4.5,ounces,21.57,11.0,1,2025-01-10,
5,101005,Contoso 2G MP3 Player E200,Contoso,Red,2.4,ounces,21.57,11.0,1,2025-01-10,
6,101006,Contoso 2G MP3 Player E200,Contoso,Black,8.8,ounces,21.57,11.0,1,2025-01-10,
7,101007,Contoso 2G MP3 Player E200,Contoso,Blue,2.1,ounces,21.57,11.0,1,2025-01-10,
8,101008,Contoso 4G MP3 Player E400,Contoso,Silver,5.6,ounces,59.99,30.58,1,2025-01-10,
9,101009,Contoso 4G MP3 Player E400,Contoso,Black,2.1,ounces,59.99,30.58,1,2025-01-10,
11,101011,Contoso 4G MP3 Player E400,Contoso,Orange,14.1,ounces,59.99,30.58,1,2025-01-10,
15,101015,Contoso 4GB Flash MP3 Player E401,Contoso,White,2.1,ounces,77.68,35.72,1,2025-01-10,
16,101016,Contoso 8GB Super-Slim MP3/Video Player M800,Contoso,White,11.0,ounces,109.95,50.56,1,2025-01-10,
