### Silver Layer Customers

This section covers the transformation and validation steps to create the silver layer customers table, ensuring standardized, enriched, and high-quality customer data for analytics.

In [0]:
#Setup and read
from pyspark.sql.functions import (
    col, upper, trim, current_timestamp, lit, when,
    regexp_replace, length, coalesce, concat_ws, 
    year, datediff, to_date
)
from pyspark.sql.types import IntegerType, StringType

# Load the bronze customers table once; every check below reuses this DataFrame.
customers_bronze = spark.table("bronze.customers")

# Each count() launches a Spark job, so every metric is computed a single time
# and the stored value is reused in the report below.
total_customers_bronze = customers_bronze.count()
unique_customers_bronze = customers_bronze.select("customer_id").distinct().count()
null_customer_ids = customers_bronze.filter(col("customer_id").isNull()).count()
duplicated_customer_ids = (
    customers_bronze
    .groupBy("customer_id")
    .count()
    .filter(col("count") > 1)
    .count()
)

# The metrics are interpolated as plain variables: reusing double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
print(f"Total rows: {total_customers_bronze}")
print(f"Null customer_id : {null_customer_ids}")
print(f"Duplicates customer_id : {duplicated_customer_ids}")
if total_customers_bronze == unique_customers_bronze:
    print("All customer_id are unique")
else:
    print("customer_id are not unique")


# Make sure the target database exists before any silver table is written.
print("Creating silver database")
spark.sql("CREATE DATABASE IF NOT EXISTS silver")
print(" Silver database ready\n")

# Preview a handful of bronze rows (display() is provided by the notebook runtime).
print(f"Total customer: {total_customers_bronze}")
display(customers_bronze.limit(5))

In [0]:
# Standardization of the text fields: trim surrounding whitespace and
# upper-case the state and city columns, one column at a time.

customers_clean = customers_bronze
for text_column in ("customer_state", "customer_city"):
    customers_clean = customers_clean.withColumn(
        text_column, upper(trim(col(text_column)))
    )

display(customers_clean.limit(5))

In [0]:
# Enrichment of the geographical data: derive the Brazilian macro-region from
# the two-letter state code.
# Every state is listed explicitly. Previously MA, PI, RN and PB (Nordeste
# states) were missing and fell through to the catch-all "NORTE" branch, as did
# null or unrecognised state codes. Anything that matches no list is now
# labelled "UNKNOWN" instead of being silently counted as NORTE.
state_code = col("customer_state")
customers_enriched = customers_clean \
    .withColumn("region",
        when(state_code.isin(["SP", "RJ", "MG", "ES"]), "SUDESTE")
        .when(state_code.isin(["RS", "SC", "PR"]), "SUL")
        .when(state_code.isin(["BA", "SE", "AL", "PE", "CE", "MA", "PI", "RN", "PB"]), "NORDESTE")
        .when(state_code.isin(["GO", "DF", "MT", "MS"]), "CENTRO_OESTE")
        .when(state_code.isin(["AC", "AP", "AM", "PA", "RO", "RR", "TO"]), "NORTE")
        .otherwise("UNKNOWN"))

# Adding a column does not change the row count; the count is printed as a sanity check.
print(f"After transformations: {customers_enriched.count()}")
display(customers_enriched.select("customer_city", "customer_state", "region").limit(10))

In [0]:
# Validation: keep only rows with a non-null state, a city made solely of
# uppercase A-Z letters and spaces, and a zip code prefix of length 5.
# NOTE(review): the city pattern also rejects names containing hyphens,
# apostrophes or accented letters — confirm that dropping those rows is intended.
# NOTE(review): if bronze stores customer_zip_code_prefix as a numeric type,
# leading zeros are lost and such rows fail the length check — TODO confirm the
# bronze column type.
customers_validated = (
    customers_enriched
    .filter(col("customer_state").isNotNull())
    .filter(col("customer_city").rlike("^[A-Z ]+$"))
    .filter(length(col("customer_zip_code_prefix")) == 5)
)

# Each count() triggers a full Spark job; compute both once so the three
# reported figures are mutually consistent and no job runs twice.
rows_before_validation = customers_enriched.count()
rows_after_validation = customers_validated.count()

print(f"Before validation: {rows_before_validation}")
print(f"After validation: {rows_after_validation}")
print(f"Rows eliminated: {rows_before_validation - rows_after_validation}")



In [0]:
# Audit / lineage columns appended to every validated row, in this order.
audit_columns = {
    "processingdate": current_timestamp(),
    "data_source": lit("olist"),
    "data_layer": lit("silver"),
    "data_status": lit("cleaned"),
}

customers_final = customers_validated
for audit_name, audit_value in audit_columns.items():
    customers_final = customers_final.withColumn(audit_name, audit_value)

# Persist as a Delta table, replacing both the data and the schema of any
# existing silver.customers table.
silver_writer = (
    customers_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("silver.customers")

# Read the table back to report how many rows were actually written.
print("Silver customers table is now created")
print(f"Total rows: {spark.table('silver.customers').count()}")
