# Silver Layer: Data cleaning

Read from the bronze tables, apply cleaning (trim, cast, dedup, null handling), and write the results to the `development.silver` schema.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Target location for every silver table written by this notebook.
catalog = "development"
schema_silver = "silver"

# Reuse the active Spark session if one exists; otherwise create one.
spark = SparkSession.builder.getOrCreate()

# 1) Population: clean and write to development.silver.datausa_population

In [None]:
df_population = spark.table(f"{catalog}.bronze.datausa_population_raw")

# Project the bronze columns to snake_case names, trimming the string
# fields and casting the numeric fields to their silver types.
df_pop_typed = df_population.select(
    F.trim(F.col("Nation ID")).alias("nation_id"),
    F.trim(F.col("Nation")).alias("nation"),
    F.col("Year").cast("int").alias("year"),
    F.col("Population").cast("double").alias("population"),
)

# Filter BEFORE deduplicating: dropDuplicates keeps an arbitrary row per
# (nation_id, year). If the surviving row had a null year/population and the
# filter ran afterwards, the whole key would be lost even though a valid
# duplicate existed.
df_pop_clean = (
    df_pop_typed
    .filter(F.col("year").isNotNull() & F.col("population").isNotNull())
    .dropDuplicates(["nation_id", "year"])
)

# Overwrite the silver table with the cleaned result.
df_pop_clean.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema_silver}.datausa_population")
print(f"Created {catalog}.{schema_silver}.datausa_population")

# 2) BLS time series: clean and write to development.silver.bls_pr_data

In [None]:
df_bls = spark.table(f"{catalog}.bronze.bls_pr_data_raw")

# The bronze column names may carry surrounding whitespace; strip them all
# in a single positional rename instead of one withColumnRenamed per column.
df_bls = df_bls.toDF(*[c.strip() for c in df_bls.columns])

# Trim the string fields and cast the numeric fields to their silver types.
df_bls_typed = df_bls.select(
    F.trim(F.col("series_id")).alias("series_id"),
    F.col("year").cast("int"),
    F.trim(F.col("period")).alias("period"),
    F.col("value").cast("double"),
    F.trim(F.col("footnote_codes")).alias("footnote_codes"),
)

# Filter BEFORE deduplicating: dropDuplicates keeps an arbitrary row per
# (series_id, year, period). If the surviving row had a null value and the
# filter ran afterwards, the whole key would be lost even though a valid
# duplicate existed.
df_bls_clean = (
    df_bls_typed
    .filter(F.col("series_id").isNotNull() & F.col("year").isNotNull() & F.col("value").isNotNull())
    .dropDuplicates(["series_id", "year", "period"])
)

# Overwrite the silver table with the cleaned result.
df_bls_clean.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema_silver}.bls_pr_data")
print(f"Created {catalog}.{schema_silver}.bls_pr_data")