# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col

# import trimming function
from script.silver.silver_utils import trim_string_columns

In [0]:
# Read Bronze table

In [0]:
# Load the bronze ERP customer table (erp_cust_az12) as the starting frame.
BRONZE_TABLE = "workspace.bronze.erp_cust_az12"
df = spark.table(BRONZE_TABLE)

# Silver Transformations

## Trimming

In [0]:
# Run the shared silver-layer helper over the bronze frame.
# NOTE(review): going by its name, the helper presumably trims whitespace on
# every string column — its implementation lives in
# script.silver.silver_utils and is not visible here; confirm.
df = trim_string_columns(df)

## Customer ID Cleanup

In [0]:
# Drop a leading "NAS" prefix from `cid`. Values that do not start with
# "NAS" (and NULLs) pass through unchanged.
#
# An anchored regexp_replace is used rather than
# F.substring(col, 4, F.length(col)): F.substring only accepts Column
# arguments for pos/len from PySpark 4.0 onward, so the previous form
# fails on older (non-Connect) runtimes. "^" matches only at the start of
# the value, so at most one prefix is removed.
df = df.withColumn(
    "cid",
    F.regexp_replace(col("cid"), r"^NAS", ""),
)

## Birthdate Validation

In [0]:
# Blank out birth dates that lie after today's date; every other value
# (including NULL) is kept as-is.
df = df.withColumn(
    "bdate",
    F.when(F.current_date() < col("bdate"), F.lit(None)).otherwise(col("bdate")),
)

## Gender Normalization

In [0]:
# Map gender codes (compared case-insensitively via upper()) onto the labels
# "Female" / "Male"; any other value, including NULL, becomes "n/a".
df = df.withColumn(
    "gen",
    F.when(F.upper(col("gen")).isin(["F", "FEMALE"]), F.lit("Female"))
    .when(F.upper(col("gen")).isin(["M", "MALE"]), F.lit("Male"))
    .otherwise(F.lit("n/a")),
)

## Renaming Columns

In [0]:
# Bronze column name -> column name used in the silver table.
rename_map = {
    "cid": "customer_number",
    "bdate": "birth_date",
    "gen": "gender",
}

# Rename every mapped column in a single call instead of one
# withColumnRenamed per entry. Names in the map that are not present in the
# frame are ignored, the same as with withColumnRenamed.
df = df.withColumnsRenamed(rename_map)

## Sanity checks of dataframe

In [0]:
# Preview up to ten rows of the transformed frame before it is written.
display(df.limit(10))

In [0]:
# Counts NULL birth dates, plus any birth dates still in the future, i.e.
# rows the birthdate validation rule above failed to blank out.
display(
    df.select(
        F.count(
            F.when(col("birth_date").isNull(), True)
        ).alias("null_birth_date_count"),
        F.count(
            F.when(F.current_date() < col("birth_date"), True)
        ).alias("future_birth_date_count"),
    )
)

# Writing Silver Table

In [0]:
# Overwrite the silver Delta table with the transformed frame.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.erp_customers")
)

## Sanity checks of silver table

In [0]:
%sql
-- Preview up to ten rows of the silver table written above.
SELECT * FROM workspace.silver.erp_customers LIMIT 10;