# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col

# import trimming function
from script.silver.silver_utils import trim_string_columns

# Read Bronze table

In [0]:
# Load the source rows for this notebook from the Bronze layer.
BRONZE_TABLE = "workspace.bronze.erp_loc_a101"
df = spark.table(BRONZE_TABLE)

# Silver Transformations

## Trimming

In [0]:
# Run the shared Silver helper over the frame. Presumably it trims whitespace
# from every string column — confirm against script.silver.silver_utils.
df = trim_string_columns(df)

## Customer ID Cleanup

In [0]:
# Strip every hyphen from the customer ID.
cid_without_hyphens = F.regexp_replace(F.col("cid"), "-", "")
df = df.withColumn("cid", cid_without_hyphens)

In [0]:
# Eyeball a small sample after the ID cleanup.
display(df.limit(10))

## Country Normalization

In [0]:
# Expand known country codes to full names and label blank or missing values
# as "n/a"; any other value is passed through unchanged.
cntry_col = F.col("cntry")
cntry_is_blank = cntry_col.isNull() | (cntry_col == "")

normalized_cntry = (
    F.when(cntry_col == "DE", "Germany")
    .when(cntry_col.isin("US", "USA"), "United States")
    .when(cntry_is_blank, "n/a")
    .otherwise(cntry_col)
)

df = df.withColumn("cntry", normalized_cntry)

## Renaming Columns

In [0]:
# Source column name -> Silver column name.
rename_map = dict(cid="customer_number", cntry="country")

# Apply the renames one at a time; withColumnRenamed is a no-op for a column
# that is not present in the frame.
for source_col in rename_map:
    df = df.withColumnRenamed(source_col, rename_map[source_col])

## Sanity checks of dataframe

In [0]:
# Eyeball a small sample of the fully transformed frame.
display(df.limit(10))

In [0]:
# Profile the country column: rows per distinct value, plus how many of those
# rows are NULL or an empty string.
country_col = F.col("country")

country_profile = df.groupBy("country").agg(
    F.count("*").alias("row_count"),
    F.sum(F.when(country_col.isNull(), 1).otherwise(0)).alias("null_count"),
    F.sum(F.when(country_col == "", 1).otherwise(0)).alias("empty_count"),
)

display(country_profile)

# Writing Silver Table

In [0]:
# Persist the transformed frame as a Delta table in the Silver schema,
# overwriting whatever the table currently holds.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("workspace.silver.erp_customer_location")

## Sanity checks of silver table

In [0]:
%sql
-- Preview up to ten rows of the Silver customer-location table.
SELECT *
FROM workspace.silver.erp_customer_location
LIMIT 10;