##Bronze to Silver Layer

### 01 Customer Information

%md
####Data Exploration
`Detect and identify the Quality Issues`
- `1. Check for the Dupliates in the Primary Key`
- `2. Check Nulls in the Primary Key`
- `3. Check Blanks in the Primary Key`

`Unit Test to confirm`
`- 1. No Duplicate, Null, Blank "" in the Primary Key Column`

#####Initilization

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, row_number
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

#####Read from Bronze Table and create a New DataFrame

In [0]:
df = spark.table('dlh.bronze_db.bronze_cust_info')

#####Filter Null Primary Key

`Use Window: RANK function  Row_number() to fetch the valid customer_id based on the latest Date`

In [0]:
ws = Window.partitionBy("cst_id").orderBy(col("cst_create_date").desc())

df1 = (df.withColumn("date_rn" , F.row_number().over(ws))
                .filter((col("date_rn") == 1) & (col("cst_id").isNotNull()))
                
)

In [0]:
df1.count()

18484

##### Null Validation

In [0]:
df_check =(
        df1
        .groupBy(col("cst_id"))
        .agg(F.count("*").alias("id_count"))
        .filter( (F.col("id_count")> 1) |  (F.col("cst_id").isNull()) )
    )
df_check.count()

0

In [0]:
df_check =(
        df
        .groupBy(col("cst_id"))
        .agg(F.count("*").alias("id_count"))
        .filter( (F.col("id_count")> 1) |  (F.col("cst_id").isNull()) )
    )
df_check.display()

cst_id,id_count
29433.0,2
,4
29449.0,2
29466.0,3
29473.0,2
29483.0,2


#####Trim: Function

In [0]:

def trimmed(df: DataFrame) -> DataFrame:
    for field in df.schema.fields:
        if isinstance(field.dataType,StringType):
            df = df.withColumn(field.name,F.trim(F.col(field.name)))
    return df

In [0]:
df2 = trimmed(df1)
df2.count()

18484

##### Trim Validation

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import  StringType
from pyspark.sql import DataFrame

df2.filter((col("cst_firstname")) != F.trim(col("cst_firstname")))
df2.filter((col("cst_lastname")) != F.trim(col("cst_lastname")))
df2.filter((col("cst_key")) != F.trim(col("cst_key")))
df2.filter((col("cst_marital_status")) != F.trim(col("cst_marital_status")))   
df2.filter((col("cst_gndr")) != F.trim(col("cst_gndr")))       

DataFrame[cst_id: int, cst_key: string, cst_firstname: string, cst_lastname: string, cst_marital_status: string, cst_gndr: string, cst_create_date: date, ingest_ts: timestamp, date_rn: int]

#####Optional

In [0]:
def columnToString(df: DataFrame) -> DataFrame:
    for field in df.schema.fields:
        df = df.withColumn(field.name,F.col(field.name).cast(StringType()))
    return df

#### Data Standardization and Consistency


In [0]:
df3  = (df2
        .withColumn("cst_marital_status", 
                    F.when(col("cst_marital_status") == "S", "Single")
                     .when(col("cst_marital_status") == "M", "Married")
                     .otherwise("Unknown"))
        .withColumn("cst_gndr", 
                    F.when(col("cst_gndr") == "M", "Male")
                     .when(col("cst_gndr") == "F", "Female")
                     .otherwise("Unknown")))


In [0]:
df3.display()

cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date,ingest_ts,date_rn
11000,AW00011000,Jon,Yang,Married,Male,2025-10-06,2026-02-05T12:30:51.709Z,1
11001,AW00011001,Eugene,Huang,Single,Male,2025-10-06,2026-02-05T12:30:51.709Z,1
11002,AW00011002,Ruben,Torres,Married,Male,2025-10-06,2026-02-05T12:30:51.709Z,1
11003,AW00011003,Christy,Zhu,Single,Female,2025-10-06,2026-02-05T12:30:51.709Z,1
11004,AW00011004,Elizabeth,Johnson,Single,Female,2025-10-06,2026-02-05T12:30:51.709Z,1
11005,AW00011005,Julio,Ruiz,Single,Male,2025-10-06,2026-02-05T12:30:51.709Z,1
11006,AW00011006,Janet,Alvarez,Single,Female,2025-10-06,2026-02-05T12:30:51.709Z,1
11007,AW00011007,Marco,Mehta,Married,Male,2025-10-06,2026-02-05T12:30:51.709Z,1
11008,AW00011008,Rob,Verhoff,Single,Female,2025-10-06,2026-02-05T12:30:51.709Z,1
11009,AW00011009,Shannon,Carlson,Single,Male,2025-10-06,2026-02-05T12:30:51.709Z,1


#####Rename: Function

In [0]:

RENAME_MAP = {
    "cst_id": "customer_id",
    "cst_key": "customer_number",
    "cst_firstname": "first_name",
    "cst_lastname": "last_name",
    "cst_marital_status": "marital_status",
    "cst_gndr": "gender",
    "cst_create_date": "created_date"
}

def renamed(df: DataFrame) -> DataFrame:
    for old_name,new_name in RENAME_MAP.items():
        df =df.withColumnRenamed(old_name, new_name)
    return df   

In [0]:
df4 = renamed(df3)

In [0]:
df5 = df4.drop("ingest_ts").withColumn("ingest_ts", F.current_timestamp())

In [0]:
df5.count()

18484

##### Write Customer Information from DF to Silver_cust_info as Delta table 

In [0]:
spark.sql("DROP TABLE IF EXISTS dlh.silver_db.silver_cust_info ")

df6 = (df5
            .write
            .mode("overwrite")
            .format("delta")
            .saveAsTable("dlh.silver_db.silver_cust_info")
)