## Silver to Gold Transformation

##### Dimension **Customer**

##### Initialization

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, coalesce

##### Create Dataframe

In [0]:
# Customer info silver table: the left-most (driving) source of the customer dimension.
SILVER_CUST_INFO = "dlh.silver_db.silver_cust_info"
df_ci = spark.table(SILVER_CUST_INFO)

In [0]:
# Lookup sources that are left-joined onto the customer info table further down:
#   df_ca supplies birth_date and a second gender value,
#   df_la supplies country.
SILVER_CUST_AZ12 = "dlh.silver_db.silver_cust_az12"
SILVER_LOC_A101 = "dlh.silver_db.silver_loc_a101"

df_ca = spark.table(SILVER_CUST_AZ12)
df_la = spark.table(SILVER_LOC_A101)

In [0]:
# Combine the three silver sources into one customer frame.
# Both lookups are LEFT joins on customer_number, so every row of the
# customer info table (ci) is kept even when ca / la have no match.
ci = df_ci.alias("ci")
ca = df_ca.alias("ca")
la = df_la.alias("la")

ci_matches_ca = col("ci.customer_number") == col("ca.customer_number")
ci_matches_la = col("ci.customer_number") == col("la.customer_number")

# Both gender columns are carried forward on purpose: they are compared and
# reconciled in later cells via their qualified names (ci.gender / ca.gender).
cust_columns = [
    "ci.customer_id",
    "ci.customer_number",
    "ci.first_name",
    "ci.last_name",
    "ci.marital_status",
    "ci.gender",
    "ci.created_date",
    "ca.birth_date",
    "ca.gender",
    "la.country",
]

df_cust = (
    ci.join(ca, ci_matches_ca, "left")
      .join(la, ci_matches_la, "left")
      .select(*[col(name) for name in cust_columns])
)

# Preview a small sample only; never display the whole frame.
df_cust.limit(20).display()

customer_id,customer_number,first_name,last_name,marital_status,gender,created_date,birth_date,gender.1,country
11000,AW00011000,Jon,Yang,Married,Male,2025-10-06,1971-10-06,Male,Australia
11001,AW00011001,Eugene,Huang,Single,Male,2025-10-06,1976-05-10,Male,Australia
11002,AW00011002,Ruben,Torres,Married,Male,2025-10-06,1971-02-09,Male,Australia
11003,AW00011003,Christy,Zhu,Single,Female,2025-10-06,1973-08-14,Female,Australia
11004,AW00011004,Elizabeth,Johnson,Single,Female,2025-10-06,1979-08-05,Female,Australia
11005,AW00011005,Julio,Ruiz,Single,Male,2025-10-06,1976-08-01,Male,Australia
11006,AW00011006,Janet,Alvarez,Single,Female,2025-10-06,1976-12-02,Female,Australia
11007,AW00011007,Marco,Mehta,Married,Male,2025-10-06,1969-11-06,Male,Australia
11008,AW00011008,Rob,Verhoff,Single,Female,2025-10-06,1975-07-04,Female,Australia
11009,AW00011009,Shannon,Carlson,Single,Male,2025-10-06,1969-09-29,Male,Australia


In [0]:
# Data-quality check: list joined rows whose business key (customer_number) is null.
# An empty result is the expected outcome.
customer_number_missing = col("ci.customer_number").isNull()
df_cust.where(customer_number_missing).display()

customer_id,customer_number,first_name,last_name,marital_status,gender,created_date,birth_date,gender.1,country


In [0]:
# Show every distinct (ci.gender, ca.gender) combination, sorted by ci.gender
# descending, to see where the two sources agree, disagree, or are missing.
gender_pairs = df_cust.select("ci.gender", "ca.gender").distinct()
gender_pairs.orderBy(col("ci.gender").desc()).display()

gender,gender.1
Unknown,
Unknown,Male
Unknown,Female
Male,Female
Male,Male
Male,
Female,
Female,Female
Female,Male


In [0]:
# Reconcile the two gender sources into a single customer_gender column.
#   * ci.gender is the primary source and wins whenever it holds a real value.
#   * When ci.gender is the 'Unknown' placeholder, fall back to ca.gender.
#   * ca comes from a LEFT join, so ca.gender can be null for customers with no
#     match; coalesce keeps those rows at 'Unknown' instead of turning the
#     placeholder into a null in the gold dimension.
df_integration = df_cust.withColumn(
    "customer_gender",
    F.when(
        col("ci.gender") == "Unknown",
        F.coalesce(col("ca.gender"), F.lit("Unknown")),
    ).otherwise(col("ci.gender")),
)

In [0]:
# The two source gender columns are superseded by customer_gender; remove both.
# They are addressed through their join aliases because they share the name "gender".
superseded_gender_columns = [col("ci.gender"), col("ca.gender")]
df1_cleaned = df_integration.drop(*superseded_gender_columns)

In [0]:
# Data-quality check: list customers that ended up without a birth_date.
birth_date_missing = col("birth_date").isNull()
df1_cleaned.where(birth_date_missing).display()

customer_id,customer_number,first_name,last_name,marital_status,created_date,birth_date,country,customer_gender


In [0]:
from pyspark.sql.window import Window

# Surrogate key: consecutive integers starting at 1, handed out in customer_id order.
# NOTE(review): a window with no partitionBy makes Spark process every row in a
# single partition; acceptable while this dimension stays small.
window_spec = Window.orderBy("customer_id")
df1 = df1_cleaned.withColumn("customer_key", F.row_number().over(window_spec))

# Final column order of the gold customer dimension (surrogate key first).
gold_customer_columns = [
    "customer_key",
    "customer_id",
    "customer_number",
    "first_name",
    "last_name",
    "country",
    "marital_status",
    "customer_gender",
    "birth_date",
    "created_date",
]

df2 = df1.select(*gold_customer_columns)





In [0]:
# Publish the customer dimension to the gold layer as a Delta table.
# A Delta overwrite replaces the table contents in one atomic commit and keeps
# the table history, so no DROP TABLE is issued first: dropping would discard
# that history and leave the table missing for readers (or permanently, if the
# write then failed). overwriteSchema lets column changes through on overwrite,
# which is the usual reason a DROP is added.
(
    df2.write
       .format("delta")
       .mode("overwrite")
       .option("overwriteSchema", "true")
       .saveAsTable("dlh.gold_db.gold_dim_customers")
)

