In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Create Flag Parameter

In [0]:
# Declare the incremental-flag text widget (default: empty string) and read its current value.
FLAG_WIDGET_NAME = "p_incremental_flag"
dbutils.widgets.text(FLAG_WIDGET_NAME, "")
v_incre_flag = dbutils.widgets.get(FLAG_WIDGET_NAME)

In [0]:
# Declare the ingestion-date text widget (default: empty string) and read its current value.
INGEST_DATE_WIDGET_NAME = "p_ingestion_date"
dbutils.widgets.text(INGEST_DATE_WIDGET_NAME, "")
v_ingest_date = dbutils.widgets.get(INGEST_DATE_WIDGET_NAME)

# Create DIMENSIONS MODEL

### Fetch Relevant Columns

In [0]:
# Source rows for the branch dimension: the distinct
# (Branch_ID, BranchName, ingestion_date) combinations found in the silver
# `carsales` parquet data for the requested ingestion date.
#
# The widget value is compared as a column literal instead of being spliced
# into SQL text, so a quote (or any other SQL) in the parameter cannot alter
# the query.
#
# NOTE: distinct() applies to the whole row, exactly like the former
# `SELECT DISTINCT(Branch_ID) ...` did. A Branch_ID that appears with more
# than one BranchName still yields one row per name.
df_src = (
    spark.read.parquet("abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales")
    .filter(col("ingestion_date") == v_ingest_date)
    .select("Branch_ID", "BranchName", "ingestion_date")
    .distinct()
)

In [0]:
# Render the source rows selected for the requested ingestion date.
df_src.display()

Branch_ID,BranchName,ingestion_date
BR9546,Premier Motors,2025-05-01
BR9666,Puma Motors,2025-05-01
XYZ9726,DataFam Motors,2025-05-01
BR9726,Power Ranger Motors,2025-05-01


### dim_branch Sink - Initial and Incremental

In [0]:
# Build df_sink, the current state of the branch dimension.
#   * Table already registered  -> incremental run: read the gold Delta files.
#   * Table not registered yet  -> initial run: produce an empty frame that only
#     carries the expected columns (the `WHERE 1=0` predicate returns no rows).
sink_table_exists = spark.catalog.tableExists('cars_catalog.gold.dim_branch')

if not sink_table_exists:
    # initial
    sink_query = '''
                        SELECT 1 as dim_branch_key, CAST(NULL AS TIMESTAMP) as updated_at, Branch_ID, BranchName
                        from PARQUET.`abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales`
                        WHERE 1=0
                        '''
else:
    # incremental
    sink_query = '''
                    SELECT *
                    FROM DELTA.`abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_branch`
                    '''

df_sink = spark.sql(sink_query)

In [0]:
# Render the sink frame (it has no rows on the initial run, when the gold table does not exist yet).
df_sink.display()

### Filtering new records and old records

In [0]:
# Left-join the source onto the sink by Branch_ID. Source rows without a match
# in the sink come back with NULL updated_at / dim_branch_key, which is what the
# following cells use to tell new branches from existing ones.
branch_match = df_src['Branch_ID'] == df_sink['Branch_ID']
df_src_with_sink = df_src.join(df_sink, branch_match, how='left')
df_filter = df_src_with_sink.select(
    df_src['Branch_ID'],
    df_src['BranchName'],
    df_sink['updated_at'],
    df_sink['dim_branch_key'],
)

In [0]:
# Render the joined source/sink rows.
df_filter.display()

Branch_ID,BranchName,updated_at,dim_branch_key
BR9546,Premier Motors,,
BR9666,Puma Motors,,
XYZ9726,DataFam Motors,,
BR9726,Power Ranger Motors,,


 **df_filter_old**

In [0]:
# Existing branches: source rows that picked up a surrogate key from the sink.
has_surrogate_key = col('dim_branch_key').isNotNull()
df_filter_old = df_filter.where(has_surrogate_key)

In [0]:
# Render the source rows that already have a surrogate key in the sink.
df_filter_old.display()

Branch_ID,BranchName,updated_at,dim_branch_key


 **df_filter_new**

In [0]:
# New branches: source rows with no surrogate key in the sink. The (all-NULL)
# dim_branch_key column is dropped here; a fresh key is assigned further down.
lacks_surrogate_key = col('dim_branch_key').isNull()
df_filter_new = (
    df_filter
    .where(lacks_surrogate_key)
    .select('Branch_ID', 'BranchName', 'updated_at')
)
df_filter_new.display()

Branch_ID,BranchName,updated_at
BR9546,Premier Motors,
BR9666,Puma Motors,
XYZ9726,DataFam Motors,
BR9726,Power Ranger Motors,


# Create Surrogate Key

### Fetching the max Surrogate key from existing table

The maximum existing surrogate key is the starting point for the keys generated during an incremental load.

In [0]:
# Work out the first surrogate key to hand out in this run.
#
# The decision is driven by whether the gold table exists, the same test the
# sink-read and upsert cells use, rather than by the p_incremental_flag widget:
#   * flag '0' while the table already exists used to restart keys at 1, and the
#     MERGE on dim_branch_key would then overwrite existing branches;
#   * any other flag value while the table is missing used to fail on the SELECT.
# For the two consistent combinations the result is unchanged.
if spark.catalog.tableExists('cars_catalog.gold.dim_branch'):
    max_value_df = spark.sql("select max(dim_branch_key) from cars_catalog.gold.dim_branch")
    current_max_key = max_value_df.collect()[0][0]
    # max() over an empty table is NULL/None; start at 1 instead of raising on None + 1.
    max_value = 1 if current_max_key is None else current_max_key + 1
else:
    max_value = 1

### Create the surrogate key column, offset by the max surrogate key

In [0]:
# Assign a surrogate key to every new branch, offset by max_value.
# monotonically_increasing_id() yields unique, increasing ids, but they are only
# consecutive within a single partition.
new_branch_key = monotonically_increasing_id() + max_value
df_filter_new = df_filter_new.withColumn('dim_branch_key', new_branch_key)

In [0]:
# Render the new branches together with their freshly assigned surrogate keys.
df_filter_new.display()

Branch_ID,BranchName,updated_at,dim_branch_key
BR9546,Premier Motors,,1837
BR9666,Puma Motors,,1838
XYZ9726,DataFam Motors,,1839
BR9726,Power Ranger Motors,,1840


### Create Final DF = df_filter_old + df_filter_new

In [0]:
# Stack new and existing branches into the frame that gets upserted.
# unionByName matches columns by name, so the result stays correct even if the
# column order of either input is changed later; plain union() pairs columns by
# position and would silently mix them up.
df_final = df_filter_new.unionByName(df_filter_old)

In [0]:
# Render the combined frame (new + existing branches) that will be upserted.
df_final.display()

Branch_ID,BranchName,updated_at,dim_branch_key
BR9546,Premier Motors,,1837
BR9666,Puma Motors,,1838
XYZ9726,DataFam Motors,,1839
BR9726,Power Ranger Motors,,1840


# SCD - TYPE 1 (UPSERT)

In [0]:
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# SCD Type 1 upsert of df_final into the gold dim_branch Delta table, followed
# by recording the load time in cars_catalog.default.metadata_table.

# Capture one timestamp up front so the dimension rows and the metadata row
# written by this run carry the same value.
cur_time_str = spark.sql("SELECT current_timestamp()").collect()[0][0].strftime("%Y-%m-%d %H:%M:%S.%f")

# incremental load
if spark.catalog.tableExists('cars_catalog.gold.dim_branch'):
    deltaTable = DeltaTable.forPath(spark, "abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_branch")

    # Rows whose surrogate key already exists are overwritten in place (Type 1);
    # rows with a new surrogate key are inserted. The timestamp is passed as a
    # SQL string literal, as before.
    (
        deltaTable.alias("tar")
        .merge(df_final.alias("src"), "tar.dim_branch_key=src.dim_branch_key")
        .whenMatchedUpdate(set={
            "Branch_ID": "src.Branch_ID",
            "BranchName": "src.BranchName",
            "updated_at": f"'{cur_time_str}'",
        })
        .whenNotMatchedInsert(values={
            "Branch_ID": "src.Branch_ID",
            "BranchName": "src.BranchName",
            "dim_branch_key": "src.dim_branch_key",
            "updated_at": f"'{cur_time_str}'",
        })
        .execute()
    )
# initial run
else:
    # Cast to TIMESTAMP so the table is created with the column type declared
    # for the empty sink frame; a bare lit(cur_time_str) creates a STRING column.
    df_final = df_final.withColumn("updated_at", lit(cur_time_str).cast("timestamp"))
    (
        df_final.write.mode("overwrite")
        .format("delta")
        .option("path", "abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_branch")
        .saveAsTable("cars_catalog.gold.dim_branch")
    )

# Record the load time for dim_branch as an upsert. A plain UPDATE is a silent
# no-op when the metadata row is missing, and a plain INSERT adds a duplicate
# row when it is already present; MERGE covers both situations.
spark.sql(f"""
    MERGE INTO cars_catalog.default.metadata_table AS tar
    USING (
        SELECT 'dim_branch' AS table_name, '{cur_time_str}' AS last_updated_time
    ) AS src
    ON tar.table_name = src.table_name
    WHEN MATCHED THEN
        UPDATE SET tar.last_updated_time = src.last_updated_time
    WHEN NOT MATCHED THEN
        INSERT (table_name, last_updated_time)
        VALUES (src.table_name, src.last_updated_time)
""")

In [0]:
# %sql
# DROP TABLE cars_catalog.gold.dim_branch;

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-625789842676848>, line 8[0m
[1;32m      4[0m [38;5;28;01mif[39;00m spark[38;5;241m.[39mcatalog[38;5;241m.[39mtableExists([38;5;124m'[39m[38;5;124mcars_catalog.gold.dim_test[39m[38;5;124m'[39m): 
[1;32m      5[0m     deltaTable [38;5;241m=[39m DeltaTable[38;5;241m.[39mforPath(spark, [38;5;124m"[39m[38;5;124mabfss://gold@cardeprojectdl.dfs.core.windows.net/dim_test[39m[38;5;124m"[39m)
[1;32m      7[0m     deltaTable[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mtar[39m[38;5;124m"[39m)[38;5;241m.[39mmerge(df_final[38;5;241m.[39malias([38;5;124m"[39m[38;5;124msrc[39m[38;5;124m"[39m), [38;5;124m"[39m[38;5;124mtar.dim_branch_key=src.dim_branch_key[39m[38;5;124m"[39m) \
[0;32m----> 8[0m                     [38;5;241m.[39mwhenMatchedUpdate([38;5;

In [0]:
%sql
-- Inspect the current contents of the gold branch dimension table.
SELECT * FROM cars_catalog.gold.dim_branch;

In [0]:
%sql
-- Inspect the load-tracking table (one last_updated_time per table_name).
select * from cars_catalog.default.metadata_table

table_name,last_updated_time
dim_test,2025-05-06T08:33:00.291Z


In [0]:
# %sql
# TRUNCATE TABLE cars_catalog.default.metadata_table;