# Dimension Modeling

In [0]:
%sql
-- Row count of the silver-layer sales data that feeds the dimension build
SELECT COUNT(*)
FROM parquet.`abfss://silver@contosoprojectstorage.dfs.core.windows.net/contoso_sales`

count(1)
10000


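# Implementing SCD Type 2 (Add New Row)
**Preserves full history: when a tracked attribute changes, the current row is closed (its end date is set and its active flag cleared) and a new row is added with a fresh start date and the active flag set.**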

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, IntegerType, LongType
from pyspark.sql.window import Window

# Distinct category key/name pairs found in the silver-layer sales data
df_src = spark.sql('''
    SELECT DISTINCT category_key, category_name
    FROM parquet.`abfss://silver@contosoprojectstorage.dfs.core.windows.net/contoso_sales`
''')

# Target = the existing dimension table, or an empty frame carrying the SCD columns
if not spark.catalog.tableExists('contoso_catalog.gold.dim_categ'):
    # First run: append placeholder SCD columns to the source layout, then
    # filter on an always-false predicate so only the schema remains.
    df_tgt = (
        df_src
        .withColumn("category_id", F.lit(0).cast(LongType()))
        .withColumn("is_current", F.lit(1).cast(IntegerType()))
        .withColumn("start_date", F.current_date())
        .withColumn("end_date", F.lit(None).cast(DateType()))
        .filter("1 = 0")
    )
else:
    df_tgt = spark.sql('''
        SELECT category_id, category_key, category_name, start_date, end_date, is_current
        FROM contoso_catalog.gold.dim_categ
    ''')

def generate_surrogate_key(df, start_value):
    """Add (or replace) a ``category_id`` column numbering rows by ``category_key``.

    Rows are ranked with ``row_number`` over a window ordered by ascending
    ``category_key``; each row's id is that 1-based rank plus ``start_value``.
    ``start_value`` is therefore an offset: the first generated id is
    ``start_value + 1``.

    Args:
        df: DataFrame containing a ``category_key`` column.
        start_value: Integer added to every row number.

    Returns:
        ``df`` with the ``category_id`` column set.
    """
    key_order = Window.orderBy("category_key")
    row_position = F.row_number().over(key_order)
    return df.withColumn("category_id", row_position + F.lit(start_value))

def apply_scd_type2_changes(df_src, df_tgt):
    """Merge the source snapshot into the category dimension using SCD Type 2 rules.

    * Keys present in ``df_src`` without a current row in ``df_tgt`` are added
      as new current rows.
    * Keys whose ``category_name`` differs from the current target row have
      that row closed (``end_date`` = yesterday, ``is_current`` = 0) and a new
      current row added with the source name.
    * Every other target row is carried over unchanged.

    Args:
        df_src: DataFrame with ``category_key`` and ``category_name``.
        df_tgt: Current dimension with ``category_id``, ``category_key``,
            ``category_name``, ``start_date``, ``end_date`` and ``is_current``
            (may be empty).

    Returns:
        The full dimension after the merge. When nothing is new or changed,
        ``df_tgt`` is returned as-is; otherwise the columns are in the order
        ``category_id, category_key, category_name, start_date, end_date,
        is_current``.
    """
    dim_columns = ["category_id", "category_key", "category_name",
                   "start_date", "end_date", "is_current"]

    def _as_current_rows(df, key_offset):
        """Assign ids ``key_offset + 1 ...`` and stamp rows as open, current versions."""
        return generate_surrogate_key(df, key_offset)\
               .withColumn("start_date", F.current_date())\
               .withColumn("end_date", F.lit(None).cast(DateType()))\
               .withColumn("is_current", F.lit(1).cast(IntegerType()))\
               .select(*dim_columns)

    # head(1) fetches at most one row and avoids converting the frame to an RDD.
    if len(df_tgt.head(1)) == 0:
        return _as_current_rows(df_src, 0)

    # max() yields None when every category_id is null; treat that as "no keys yet".
    max_surrogate_key = df_tgt.agg(F.max("category_id")).collect()[0][0] or 0

    src = df_src.alias("src")
    tgt = df_tgt.filter(F.col("is_current") == 1).alias("tgt")

    joined_df = src.join(tgt, "category_key", "outer")

    # Keep only the source-side attributes: the joined frame carries both
    # sides' category_name, which would not line up with the dimension columns.
    src_attributes = ["category_key", F.col("src.category_name").alias("category_name")]

    # Source keys with no current target row.
    new_records = joined_df.filter(F.col("tgt.category_id").isNull())\
                           .select(*src_attributes)
    new_records_count = new_records.count()

    # Keys present on both sides whose name differs. A NULL comparison result
    # (either name missing) is treated as "not changed".
    changed_records = joined_df.filter(
        F.col("tgt.category_id").isNotNull() &
        F.coalesce(F.col("src.category_name") != F.col("tgt.category_name"), F.lit(False))
    ).select(*src_attributes)
    changed_records_count = changed_records.count()

    if new_records_count == 0 and changed_records_count == 0:
        return df_tgt

    if changed_records_count > 0:
        changed_keys = changed_records.select("category_key")

        # Close the current row of every changed key; historical rows of the
        # same key pass through untouched.
        old_versions = df_tgt.join(changed_keys, "category_key", "inner")\
                             .withColumn("end_date", F.when(F.col("is_current") == 1, F.date_sub(F.current_date(), 1)).otherwise(F.col("end_date")))\
                             .withColumn("is_current", F.when(F.col("is_current") == 1, F.lit(0)).otherwise(F.col("is_current")))

        unchanged_records = df_tgt.join(changed_keys, "category_key", "leftanti")

        # New versions take the ids that follow those reserved for new records.
        new_versions = _as_current_rows(changed_records, max_surrogate_key + new_records_count)

        # unionByName: the joined frames put category_key first, so a
        # positional union would mix up columns.
        final_df = unchanged_records.select(*dim_columns)\
                                    .unionByName(old_versions.select(*dim_columns))\
                                    .unionByName(new_versions)
    else:
        final_df = df_tgt.select(*dim_columns)

    if new_records_count > 0:
        # Offset is the current max id, so the first new id is max + 1.
        final_df = final_df.unionByName(_as_current_rows(new_records, max_surrogate_key))

    return final_df

# Merge the source snapshot into the current dimension using SCD Type 2 rules
result_df = apply_scd_type2_changes(df_src=df_src, df_tgt=df_tgt)

# Render the merged dimension for inspection
display(result_df)

category_id,category_key,category_name,start_date,end_date,is_current
1,1,Audio,2025-01-16,,1
2,2,TV and Video,2025-01-16,,1
3,3,Computers,2025-01-16,,1
4,4,Cameras and camcorders,2025-01-16,,1
5,5,Cell phones,2025-01-16,,1
6,6,"Music, Movies and Audio Books",2025-01-16,,1
7,7,Games and Toys,2025-01-16,,1
8,8,Home Appliances,2025-01-16,,1


# Write results to table

In [0]:

# Overwrite the gold dimension table with the merged result, stored as Delta
# at the given path in the gold container.
(
    result_df.write
    .format('delta')
    .mode('overwrite')
    .option('path', 'abfss://gold@contosoprojectstorage.dfs.core.windows.net/dim_categ')
    .saveAsTable('contoso_catalog.gold.dim_categ')
)