In [0]:
from pyspark.sql.functions import col

In [0]:
# Notebook parameter: declare the ingestion-date text widget with an empty
# default value, then read whatever value the caller supplied for this run.
ingestion_widget_name = "p_ingestion_date"
dbutils.widgets.text(ingestion_widget_name, "")
v_ingest_date = dbutils.widgets.get(ingestion_widget_name)

# CREATE FACT TABLE

**Reading Silver data**

In [0]:

# Load the silver car-sales Parquet data, then keep only the rows whose
# ingestion_date equals the date passed in through the notebook widget.
silver_carsales_path = "abfss://silver@cardeprojectdl.dfs.core.windows.net/carsales"
silver_all_rows = spark.sql(f"SELECT * FROM PARQUET.`{silver_carsales_path}`")
df_silver = silver_all_rows.filter(col('ingestion_date') == v_ingest_date)

In [0]:
# Preview the silver rows selected for this ingestion date.
df_silver.display()

**Reading all the dimension tables**

In [0]:
# Read every gold-layer dimension table into its own DataFrame. The tuple
# order below must stay aligned with the names on the left-hand side.
df_dealer, df_model, df_branch, df_date = (
    spark.sql(f"SELECT * FROM cars_catalog.gold.{dim_table}")
    for dim_table in ("dim_dealer", "dim_model", "dim_branch", "dim_date")
)

**Bringing the dimension keys into the fact table**

In [0]:
# Attach each dimension to the silver rows through its ID column. Left joins
# keep every silver row even when a dimension has no matching ID, in which
# case the corresponding dim_*_key comes through as NULL.
silver_with_dims = (
    df_silver
    .join(df_branch, df_silver["Branch_ID"] == df_branch["Branch_ID"], "left")
    .join(df_dealer, df_silver["Dealer_ID"] == df_dealer["Dealer_ID"], "left")
    .join(df_model, df_silver["Model_ID"] == df_model["Model_ID"], "left")
    .join(df_date, df_silver["Date_ID"] == df_date["Date_ID"], "left")
)

# The fact table keeps only the three measures plus the four dimension keys.
df_fact = silver_with_dims.select(
    df_silver["Revenue"],
    df_silver["Units_Sold"],
    df_silver["RevPerUnit"],
    df_branch["dim_branch_key"],
    df_dealer["dim_dealer_key"],
    df_model["dim_model_key"],
    df_date["dim_date_key"],
)

In [0]:
# Preview the assembled fact rows (measures plus dimension keys) before writing.
df_fact.display()

In [0]:
# from pyspark.sql import functions as F
# from pyspark.sql.types import IntegerType, StringType, DoubleType, DateType, StructType, StructField
# import random
# from datetime import datetime

# # Define schema
# schema = StructType([
  
#     StructField("Revenue", IntegerType(), True),
#     StructField("Units_Sold", IntegerType(), True),
#     StructField("RevPerUnit", IntegerType(), True),
#     StructField("dim_date_key", IntegerType(), True),
#     StructField("dim_dealer_key", IntegerType(), True),
#     StructField("dim_model_key", IntegerType(), True),
#     StructField("dim_branch_key", IntegerType(), True),
    
# ])

# # Generate random data
# data = [
#     (
#         random.randint(1, 100),  # Revenue
#         random.randint(1, 100),  # Units_Sold
#         random.randint(1, 100),  # RevPerUnit
#         random.randint(1, 100),  # dim_date_key
#         random.randint(1, 100),  # dim_dealer_key
#         random.randint(1, 100),  # dim_model_key
#         random.randint(1, 100)  # dim_branch_key


#     )
#     for _ in range(10)
# ]

# # Create DataFrame
# df_fact = spark.createDataFrame(data, schema)

# # Display the DataFrame
# display(df_fact)

Revenue,Units_Sold,RevPerUnit,dim_date_key,dim_dealer_key,dim_model_key,dim_branch_key
40,47,76,70,82,82,87
96,7,35,81,87,66,25
99,45,7,75,54,26,56
25,54,95,13,65,83,74
12,66,58,94,67,20,67
83,88,19,88,22,54,6
83,30,17,31,69,66,99
32,3,33,54,90,64,63
2,49,52,75,69,13,85
62,33,61,52,61,81,76


### Writing Fact table

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp,lit

In [0]:
# %sql
# DROP TABLE cars_catalog.gold.fact_sales

In [0]:
# Upsert the current batch of fact rows into gold.fact_sales, then record the
# load time for "fact_sales" in the pipeline metadata table.

# One timestamp string, taken from Spark, shared by the fact rows and the
# metadata entry so that both carry exactly the same load time.
cur_time_str = spark.sql("SELECT current_timestamp()").collect()[0][0].strftime("%Y-%m-%d %H:%M:%S.%f")

fact_table_name = "cars_catalog.gold.fact_sales"
dim_key_columns = ["dim_model_key", "dim_dealer_key", "dim_branch_key", "dim_date_key"]

# Column -> SQL expression maps used by the merge. "updated_at" is written as
# a quoted literal holding this run's timestamp string.
measure_values = {
    "Revenue": "src.Revenue",
    "Units_Sold": "src.Units_Sold",
    "RevPerUnit": "src.RevPerUnit",
    "updated_at": f"'{cur_time_str}'",
}
key_values = {key: f"src.{key}" for key in dim_key_columns}

if spark.catalog.tableExists(fact_table_name):
    # Incremental run: match on the four dimension keys; refresh the measures
    # of matching rows and insert rows whose key combination is new.
    # NOTE(review): keys are compared with "=", so a source row carrying a
    # NULL dimension key (possible after the left joins that build df_fact)
    # never matches and is inserted again on every run -- confirm intended.
    merge_condition = " and ".join(f"tgt.{key} = src.{key}" for key in dim_key_columns)
    delta_table = DeltaTable.forName(spark, fact_table_name)
    (
        delta_table.alias("tgt")
        .merge(df_fact.alias("src"), merge_condition)
        .whenMatchedUpdate(set=measure_values)
        .whenNotMatchedInsert(values={**measure_values, **key_values})
        .execute()
    )
else:
    # First run: stamp the rows and create the table at its storage path.
    df_fact = df_fact.withColumn("updated_at", lit(cur_time_str))
    df_fact.write.mode("overwrite").format("delta").option("path","abfss://gold@cardeprojectdl.dfs.core.windows.net/fact_sales").saveAsTable(fact_table_name)

# Record the load time with an upsert rather than a branch-specific UPDATE or
# INSERT: a bare UPDATE silently does nothing when the "fact_sales" row is
# missing, and a bare INSERT adds a duplicate row when the fact table is
# dropped and rebuilt. String literals are single-quoted so the statement does
# not depend on double quotes being parsed as strings.
spark.sql(f"""
    MERGE INTO cars_catalog.default.metadata_table AS tgt
    USING (
        SELECT 'fact_sales' AS table_name,
               '{cur_time_str}' AS last_updated_time
    ) AS src
    ON tgt.table_name = src.table_name
    WHEN MATCHED THEN
        UPDATE SET tgt.last_updated_time = src.last_updated_time
    WHEN NOT MATCHED THEN
        INSERT (table_name, last_updated_time)
        VALUES (src.table_name, src.last_updated_time)
""")

In [0]:
%sql
-- Inspect the current contents of the gold fact table after the write above.
SELECT * 
FROM cars_catalog.gold.fact_sales;

Revenue,Units_Sold,RevPerUnit,dim_date_key,dim_dealer_key,dim_model_key,dim_branch_key,updated_at
83,30,17,31,69,66,99,2025-05-06 10:58:50.530000
32,3,33,54,90,64,63,2025-05-06 10:58:50.530000
2,49,52,75,69,13,85,2025-05-06 10:58:50.530000
62,33,61,52,61,81,76,2025-05-06 10:58:50.530000
95,24,94,27,62,57,89,2025-05-06 10:57:01.617000
6,69,1,100,58,91,83,2025-05-06 10:57:01.617000
23,72,63,15,93,76,84,2025-05-06 10:57:01.617000
30,65,9,47,69,81,42,2025-05-06 10:57:01.617000
39,33,97,8,72,78,65,2025-05-06 10:57:01.617000
83,5,64,79,98,40,67,2025-05-06 10:57:01.617000
