Loading the table 

In [0]:
# Read the CRM customer table from the bronze schema. Later cells reassign
# df_silver step by step as the data is cleaned.
# NOTE(review): `spark` is not defined in this notebook — presumably the session
# object injected by the notebook runtime; confirm.
df_silver = spark.table("workspace.bronze.crm_cust_info")

Exploring the data

In [0]:
# Show the loaded rows in the notebook's table view for a first visual check.
df_silver.display()

Exploring the schema and summary statistics


In [0]:
# Print the column names and data types of the loaded table.
df_silver.printSchema()

In [0]:
# With no arguments, summary() reports count (non-null values), mean, stddev, min,
# the approximate 25% / 50% / 75% percentiles, and max for numeric and string columns.
df_silver.summary().show()

# When only the non-null counts are of interest, request that single statistic
# so the other aggregates are not computed.
df_silver.summary("count").show()

In [0]:
# Exploration: list every non-null cst_id that occurs on more than one row,
# together with how many times it occurs.
from pyspark.sql.functions import col, count

rows_with_id = df_silver.where(col("cst_id").isNotNull())
occurrences_per_id = rows_with_id.groupBy("cst_id").agg(
    count("*").alias("occurrence_count")
)
df_explore_duplicates = occurrences_per_id.where(col("occurrence_count") > 1)

df_explore_duplicates.display()


Data Cleaning

In [0]:
#-------Removing the duplicated values in customer_id-------
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

# Rank the rows of each customer from the newest to the oldest creation date.
# NOTE(review): rows that share both cst_id and cst_create_date are ranked in
# arbitrary order, so which of them survives is not deterministic — add a
# tie-breaker column to the ordering if a suitable one exists.
window_spec = Window.partitionBy("cst_id").orderBy(col("cst_create_date").desc())

df_silver = (
    df_silver
    # Rows without a customer id can never survive the de-duplication,
    # so discard them before ranking.
    .filter(col("cst_id").isNotNull())
    .withColumn("flag_last", row_number().over(window_spec))
    # Keep only the most recent row per customer.
    .filter(col("flag_last") == 1)
    # flag_last is only a helper for the de-duplication; drop it so it does not
    # travel through the remaining cells into the saved silver table.
    .drop("flag_last")
)

df_silver.display()

In [0]:
# Re-check the non-null count of every column now that duplicates are removed.
df_silver.summary("count").show()

Trimming string columns

In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, trim

# Trim leading/trailing spaces from every string column in ONE projection.
# Calling withColumn once per column inside a loop stacks a projection per call
# and can bloat the query plan; a single select keeps the plan flat.
# Non-string columns pass through untouched and the column order is preserved.
df_silver = df_silver.select(
    [
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df_silver.schema.fields
    ]
)


In [0]:
# Expand the one-letter codes into readable labels.
import pyspark.sql.functions as F

# Compare on the upper-cased value so lower-case codes are recognised as well.
marital_code = F.upper(F.col("cst_marital_status"))
gender_code = F.upper(F.col("cst_gndr"))

# Any value that matches neither code (including NULL) becomes "n/a".
marital_label = (
    F.when(marital_code == "S", "Single")
    .when(marital_code == "M", "Married")
    .otherwise("n/a")
)
gender_label = (
    F.when(gender_code == "F", "Female")
    .when(gender_code == "M", "Male")
    .otherwise("n/a")
)

df_silver = (
    df_silver
    .withColumn("cst_marital_status", marital_label)
    .withColumn("cst_gndr", gender_label)
)

Renaming the columns

In [0]:
# Source column name -> name used in the silver table.
RENAME_MAP = {
    "cst_id": "customer_id",
    "cst_key": "customer_number",
    "cst_firstname": "first_name",
    "cst_lastname": "last_name",
    "cst_marital_status": "marital_status",
    "cst_gndr": "gender",
    "cst_create_date": "created_date",
}

# Apply the renames one at a time, in the order they are listed above.
for source_column in RENAME_MAP:
    df_silver = df_silver.withColumnRenamed(source_column, RENAME_MAP[source_column])

In [0]:
# Final look at the non-null count of every column before the table is written.
df_silver.summary("count").show()

In [0]:
# Save the cleaned customers as a Delta table in the silver schema.
# mode("overwrite") replaces the existing data; the overwriteSchema option
# allows the stored schema to be replaced along with it.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("workspace.silver.crm_customers")