In [0]:
# Necessary catalog and schema 
spark.sql("USE CATALOG finance_catalog")
spark.sql("USE finance_catalog.bronze")

# Upload bronze tables
df_tx_bronze = spark.table("transactions")
df_cust_bronze = spark.table("customers")

display(df_tx_bronze.limit(5))
display(df_cust_bronze.limit(5))


transaction_id,customer_id,timestamp,origin_country,destination_country,amount,currency,merchant
TXN000001,CUST2132,2025-06-23 15:44:56,SG,NL,17415.01,USD,"Hernandez, Spears and Barnes"
TXN000002,CUST7351,2025-07-10 07:47:38,PE,UY,18087.38,USD,Harris Group
TXN000003,CUST5966,2025-03-26 22:23:20,KH,US,10468.63,USD,Yoder-Lewis
TXN000004,CUST1574,2025-05-29 05:08:34,BR,GR,533.68,USD,"Anderson, Schmitt and Bates"
TXN000005,CUST6664,2025-05-10 15:32:16,LA,AO,3941.5,USD,"Elliott, Dawson and Evans"


customer_id,name,birth_date,country,risk_segment
CUST1019,Travis Anderson,1984-12-27,HU,medium
CUST1020,Tyler Davis,1945-05-10,NR,high
CUST1062,Lisa Smith,1984-01-29,NE,high
CUST1063,Thomas Brown,1997-08-30,TG,low
CUST1072,James Marsh,1956-10-02,EG,low


In [0]:
from pyspark.sql.functions import col, to_timestamp, trim, upper

df_tx_silver = (
    df_tx_bronze
      # tipos correctos
      .withColumn("timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd HH:mm:ss"))
      .withColumn("amount",    col("amount").cast("double"))
      # normalización suave
      .withColumn("origin_country",      upper(trim(col("origin_country"))))
      .withColumn("destination_country", upper(trim(col("destination_country"))))
      .withColumn("currency",            upper(trim(col("currency"))))
      .withColumn("merchant",            trim(col("merchant")))
      # calidad mínima
      .dropna(subset=["transaction_id", "customer_id", "timestamp", "amount"])
)

display(df_tx_silver.limit(5))


transaction_id,customer_id,timestamp,origin_country,destination_country,amount,currency,merchant
TXN000001,CUST2132,2025-06-23T15:44:56.000Z,SG,NL,17415.01,USD,"Hernandez, Spears and Barnes"
TXN000002,CUST7351,2025-07-10T07:47:38.000Z,PE,UY,18087.38,USD,Harris Group
TXN000003,CUST5966,2025-03-26T22:23:20.000Z,KH,US,10468.63,USD,Yoder-Lewis
TXN000004,CUST1574,2025-05-29T05:08:34.000Z,BR,GR,533.68,USD,"Anderson, Schmitt and Bates"
TXN000005,CUST6664,2025-05-10T15:32:16.000Z,LA,AO,3941.5,USD,"Elliott, Dawson and Evans"


In [0]:
from pyspark.sql.functions import to_date, lower

df_cust_silver = (
    df_cust_bronze
      .withColumn("birth_date", to_date(col("birth_date"), "yyyy-MM-dd"))
      .withColumn("country",    upper(trim(col("country"))))
      .withColumn("risk_segment", lower(trim(col("risk_segment"))))  # normalize to lower: low/medium/high
      .dropna(subset=["customer_id", "name"])                        
)

display(df_cust_silver.limit(5))


customer_id,name,birth_date,country,risk_segment
CUST1019,Travis Anderson,1984-12-27,HU,medium
CUST1020,Tyler Davis,1945-05-10,NR,high
CUST1062,Lisa Smith,1984-01-29,NE,high
CUST1063,Thomas Brown,1997-08-30,TG,low
CUST1072,James Marsh,1956-10-02,EG,low


In [0]:
# Change to the silver schema and save the work
spark.sql("USE finance_catalog.silver")

# Save as Delta (overwrite to this lab)
df_tx_silver.write.format("delta").mode("overwrite").saveAsTable("finance_catalog.silver.transactions")
df_cust_silver.write.format("delta").mode("overwrite").saveAsTable("finance_catalog.silver.customers")


In [0]:
%sql

SHOW TABLES in finance_catalog.silver;
SELECT * FROM finance_catalog.silver.transactions LIMIT 3;
SELECT * FROM finance_catalog.silver.customers  LIMIT 3;

customer_id,name,birth_date,country,risk_segment
CUST1019,Travis Anderson,1984-12-27,HU,medium
CUST1020,Tyler Davis,1945-05-10,NR,high
CUST1062,Lisa Smith,1984-01-29,NE,high


In [0]:
%sql
SELECT COUNT(*) n_null_amount FROM finance_catalog.silver.transactions WHERE amount IS NULL;
SELECT MIN(timestamp) min_ts, MAX(timestamp) max_ts FROM finance_catalog.silver.transactions;

min_ts,max_ts
2025-01-01T00:06:55.000Z,2025-09-12T14:59:03.000Z
