## Bronze to Silver

#### 04: CUST AZ12

##### Initialize

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import when, col, row_number
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

##### Create Dataframe 

In [0]:
# Load the bronze CUST_AZ12 table, preview it, and show its row count
# (the trailing expression is what the notebook cell renders).
df = spark.read.table("dlh.bronze_db.bronze_cust_az12")
df.display()
df.count()

In [0]:
# Profiling check: how many raw CID values already start with "AW00".
df.where(col('CID').startswith('AW00')).count()

##### Data Standardisation

In [0]:
# Standardise CID by removing a leading "NAS" prefix when present; all other
# values (including NULL) pass through unchanged apart from trimming.
#
# - The value is trimmed first so that a padded key such as " NAS..." still
#   has its prefix removed (string columns are trimmed later in the pipeline
#   anyway, so this does not change the final result for other rows).
# - An anchored regexp_replace is used instead of
#   F.substring(col, 4, F.length(col)): passing a Column as the length is
#   only accepted by newer PySpark releases, whereas this form works on
#   older ones too.
df1 = df.withColumn(
    "CID",
    F.regexp_replace(F.trim(col("CID")), "^NAS", ""),
)
df1.display()

In [0]:
# Sanity check after standardisation: list rows whose CID still does not
# start with "AW00".
df1.where(~col("CID").startswith("AW00")).display()

In [0]:
# Inspect the distinct raw GEN values before normalising them.
df1.select("GEN").distinct().display()

##### Data Normalization

In [0]:
# Normalise GEN to "Male" / "Female"; every other value (including NULL and
# blank strings) becomes "n/a".
#
# The comparison is done on an upper-cased, trimmed copy of the value. This
# cell runs before the trim step, so an exact match on the raw value would
# send padded or differently-cased codes such as "F " or "male" to "n/a".
gen_code = F.upper(F.trim(col("GEN")))
df2 = df1.withColumn(
    "GEN",
    F.when(gen_code.isin("M", "MALE"), "Male")
    .when(gen_code.isin("F", "FEMALE"), "Female")
    .otherwise("n/a"),
)

In [0]:
# Verify the normalisation: show the distinct GEN values that remain.
df2.select("GEN").distinct().display()

##### Trim Function

In [0]:
def trimmed(df: DataFrame) -> DataFrame:
    """Return ``df`` with every top-level string column trimmed.

    Columns whose data type is ``StringType`` are passed through ``F.trim``;
    all other columns are carried over untouched. Column names and column
    order are preserved.

    The result is built with a single ``select`` rather than one
    ``withColumn`` call per column, because each ``withColumn`` adds another
    projection to the query plan and doing that in a loop can produce very
    large plans.

    Args:
        df: The DataFrame to clean.

    Returns:
        A new DataFrame with the same columns, string columns trimmed.
    """

    def _ref(name: str):
        # Back-tick quote the name so that a column whose name contains a
        # dot is resolved as one column and not as a struct field access.
        return F.col("`" + name.replace("`", "``") + "`")

    projection = [
        F.trim(_ref(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else _ref(field.name)
        for field in df.schema.fields
    ]
    return df.select(*projection)

In [0]:
# Apply the string-trimming step to the normalised data and preview it.
df3 = df2.transform(trimmed)
df3.display()

##### Rename Function

In [0]:
# Source column name -> silver-layer column name.
RENAME_MAP = {
    "cid": "customer_number",
    "bdate": "birth_date",
    "gen": "gender"
}


def renamed(df: DataFrame) -> DataFrame:
    """Rename the columns of ``df`` according to ``RENAME_MAP``.

    Each mapping entry is applied with ``withColumnRenamed``, which leaves
    the DataFrame unchanged when the source column is not present.

    NOTE(review): the map keys are lower-case while earlier cells refer to
    the columns in upper-case; this presumably relies on Spark's default
    case-insensitive column resolution -- confirm the session setting.
    """
    result = df
    for source_name in RENAME_MAP:
        result = result.withColumnRenamed(source_name, RENAME_MAP[source_name])
    return result

In [0]:
# Switch to the silver-layer column names.
df4 = df3.transform(renamed)

In [0]:
# Remove the metadata columns inherited from the bronze layer, then stamp
# the rows with a fresh ingest_ts for this load.
df5 = (
    df4
    .drop("file_path", "ingest_ts")
    .withColumn("ingest_ts", F.current_timestamp())
)

##### Write to Table

In [0]:
# Replace the silver table with the transformed data in one Delta overwrite.
#
# The table is intentionally not dropped first: an overwrite already replaces
# the data, and "overwriteSchema" lets it replace the schema as well. Dropping
# beforehand would leave a window in which the table does not exist (a failed
# write would leave no table at all) and would discard the Delta table
# history.
(
    df5.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("dlh.silver_db.silver_cust_az12")
)