In [2]:
from pyspark.sql.functions import col, monotonically_increasing_id
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Create Flag Parameter

In [None]:
dbutils.widgets.text("p_incremental_flag","")
v_incre_flag = dbutils.widgets.get("p_incremental_flag")

In [None]:
dbutils.widgets.text("p_ingestion_date","")
v_ingest_date = dbutils.widgets.get("p_ingestion_date")

# Create DIMENSIONS MODEL

### Fetch Relative Columns

In [4]:
df_src = spark.sql(f"""
SELECT DISTINCT(Date_ID) as Date_ID, ingestion_date
FROM CSV.`../../silver_table/`
WHERE ingestion_date = '{v_ingest_date}'
""")

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `ingestion_date` cannot be resolved. Did you mean one of the following? [`_c0`, `_c1`, `_c10`, `_c11`, `_c12`].; line 4 pos 6;
'Distinct
+- 'Project ['Date_ID AS Date_ID#0, 'ingestion_date]
   +- 'Filter ('ingestion_date = 2025-04-30)
      +- Relation [_c0#18,_c1#19,_c2#20,_c3#21,_c4#22,_c5#23,_c6#24,_c7#25,_c8#26,_c9#27,_c10#28,_c11#29,_c12#30,_c13#31,_c14#32] csv


In [None]:
df_src.display()

Date_ID,ingestion_date
DT01247,2025-05-01
DT01246,2025-05-01


### dim_model Sink - Initial and Incremental

In [None]:
if spark.catalog.tableExists('cars_catalog.gold.dim_date'): # incremental
    # df_sink = spark.sql('''
    #                 SELECT dim_branch_key, Branch_ID, BranchName
    #                 from PARQUET.`abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales`
    #                 ''')
    df_sink = spark.sql('''
                    SELECT *
                    FROM DELTA.`abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_date`
                    ''')

else: # initial
    df_sink = spark.sql('''
                        SELECT 1 as dim_date_key, CAST(NULL AS TIMESTAMP) as updated_at, Date_ID
                        from PARQUET.`abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales`
                        WHERE 1=0
                        ''')

In [None]:
df_sink.display()

### Filtering new records and old records

In [None]:
df_filter = df_src.join(df_sink, df_src['Date_ID'] == df_sink['Date_ID'], how='left') \
      .select(df_src['Date_ID'], df_sink['updated_at'], df_sink['dim_date_key'])

In [None]:
df_filter.display()

Date_ID,updated_at,dim_date_key
DT01247,,
DT01246,,


 **df_filter_old**

In [None]:
df_filter_old = df_filter.filter(col('dim_date_key').isNotNull())

In [None]:
df_filter_old.display()

Date_ID,updated_at,dim_date_key


 **df_filter_new**

In [None]:
df_filter_new = df_filter.filter(col('dim_date_key').isNull()).select(col('Date_ID'), col("updated_at"))
df_filter_new.display()

Date_ID,updated_at
DT01247,
DT01246,


# Create Surrogate Key

### Fetching the max Surrogate key from existing table

this Surrogate Key acts as the start point for incremental loading

In [None]:
if (v_incre_flag == '0'):
    max_value = 1
else:
    max_value_df = spark.sql("select max(dim_date_key) from cars_catalog.gold.dim_date")
    max_value = max_value_df.collect()[0][0] + 1

### Creating Surrogate Key column and ADD the max surrogate key

In [None]:
df_filter_new = df_filter_new.withColumn('dim_date_key', max_value + monotonically_increasing_id())

In [None]:
df_filter_new.display()

Date_ID,updated_at,dim_date_key
DT01247,,1157
DT01246,,1158


### Create Final DF = df_filter_old + df_filter_new

In [None]:
df_final = df_filter_new.union(df_filter_old)

In [None]:
df_final.display()

Date_ID,updated_at,dim_date_key
DT01247,,1157
DT01246,,1158


# SCD - TYPE 1 (UPSERT)

In [None]:
from pyspark.sql.functions import current_timestamp,lit

In [None]:
cur_time_str = spark.sql("SELECT current_timestamp()").collect()[0][0].strftime("%Y-%m-%d %H:%M:%S.%f")

# incremental load
if spark.catalog.tableExists('cars_catalog.gold.dim_date'): 
    deltaTable = DeltaTable.forPath(spark, "abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_date")

    deltaTable.alias("tar").merge(df_final.alias("src"), "tar.dim_date_key=src.dim_date_key") \
                    .whenMatchedUpdate(set={
                        "Date_ID": "src.Date_ID",
                        "updated_at": f"'{cur_time_str}'"
                        
                    }
                        
                    ) \
                    .whenNotMatchedInsert(values={
                        "Date_ID": "src.Date_ID",
                        "dim_date_key": "src.dim_date_key",
                        "updated_at": f"'{cur_time_str}'"
                    }) \
                    .execute()
    spark.sql(f"""
                UPDATE cars_catalog.default.metadata_table
                SET last_updated_time = '{cur_time_str}'
                WHERE table_name = "dim_date"
""")
# initial run
else: 
    df_final = df_final.withColumn("updated_at", lit(cur_time_str))
    df_final.write.mode("overwrite") \
        .format("delta") \
        .option("path", "abfss://gold@cardeprojectdl.dfs.core.windows.net/dim_date") \
        .saveAsTable("cars_catalog.gold.dim_date")
    
    spark.sql(f"""
              INSERT INTO cars_catalog.default.metadata_table
              VALUES ("dim_date", '{cur_time_str}')
              """)

In [None]:
# %sql
# DROP TABLE cars_catalog.gold.dim_date;

In [None]:
%sql
SELECT * FROM cars_catalog.gold.dim_date;

In [None]:
%sql
select * from cars_catalog.default.metadata_table

table_name,last_updated_time
dim_date,2025-05-06T14:17:50.197Z


In [None]:
# %sql
# TRUNCATE TABLE cars_catalog.default.metadata_table;