# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col

# Read Bronze table

In [0]:
# Load the CRM customer records from the bronze table into a DataFrame.
df = spark.read.table("workspace.bronze.crm_cust_info")

# Silver Transformations

## Trimming

In [0]:
# Trim leading/trailing whitespace from every string column in ONE projection.
# Calling withColumn() once per column adds a projection per call and bloats
# the query plan on wide tables; a single select() keeps the plan flat.
# Non-string columns pass through unchanged; column order and names are kept.
# Backticks stop names containing dots from being parsed as nested-field access.
df = df.select(
    [
        trim(col(f"`{field.name}`")).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(f"`{field.name}`")
        for field in df.schema.fields
    ]
)

## Normalization

In [0]:
# Expand the single-letter source codes into readable labels.
# The comparison is case-insensitive (the value is upper-cased first).
# Any other value, including NULL, falls through to "n/a".
df = (
    df
    # Marital status: S -> Single, M -> Married
    .withColumn(
        "cst_marital_status",
        F.when(F.upper(F.col("cst_marital_status")) == "S", "Single")
        .when(F.upper(F.col("cst_marital_status")) == "M", "Married")
        .otherwise("n/a"),
    )
    # Gender: F -> Female, M -> Male
    .withColumn(
        "cst_gndr",
        F.when(F.upper(F.col("cst_gndr")) == "F", "Female")
        .when(F.upper(F.col("cst_gndr")) == "M", "Male")
        .otherwise("n/a"),
    )
)

## Remove Records with Missing Customer ID

In [0]:
# Keep only the rows that carry a customer ID.
df = df.where(F.col("cst_id").isNotNull())

## Renaming Columns

In [0]:
# Source column name -> silver column name.
rename_map = dict(
    cst_id="customer_id",
    cst_key="customer_number",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="created_date",
)

# Apply the renames one at a time, in the mapping's insertion order.
for source_column, target_column in rename_map.items():
    df = df.withColumnRenamed(source_column, target_column)

## Sanity checks of dataframe

In [0]:
# Show up to 10 rows of the transformed DataFrame.
display(df.limit(10))

In [0]:
# Expected to be empty: rows with a NULL customer ID were filtered out earlier.
display(df.where(F.col("customer_id").isNull()))

In [0]:
# Row counts per (marital_status, gender) combination of the normalised
# attributes, sorted by both columns.
status_gender_counts = (
    df.groupBy("marital_status", "gender")
    .count()
    .sort("marital_status", "gender")
)
display(status_gender_counts)

# Writing Silver table

In [0]:
# Save the DataFrame as a Delta table in overwrite mode.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_customers")
)

## Sanity checks of silver table

In [0]:
%sql
-- Preview up to 10 rows of the silver customer table.
SELECT *
FROM workspace.silver.crm_customers
LIMIT 10;