# In [0]:
# 02_Silver_Transformation.py - Reads Bronze, applies DQ, and saves to Silver Layer

from pyspark.sql.functions import col, expr, substring, trim
from pyspark.sql.types import IntegerType

# Table coordinates for this pipeline stage.
# These values MUST MATCH the ones used in 01_Bronze and 03_Gold.
CATALOG_NAME = "workspace"
SCHEMA_NAME = "default"
BRONZE_TABLE_NAME = "omdb_releases_bronze"
SILVER_TABLE_NAME = "omdb_releases_silver"

# Fully qualified names in the form <catalog>.<schema>.<table>.
FULL_BRONZE_PATH = ".".join((CATALOG_NAME, SCHEMA_NAME, BRONZE_TABLE_NAME))
FULL_SILVER_PATH = ".".join((CATALOG_NAME, SCHEMA_NAME, SILVER_TABLE_NAME))

print(f"--- 1. READING THE BRONZE LAYER from: {FULL_BRONZE_PATH} ---")

# --- 1. LOAD (READS BRONZE) ---
# Reads the data that was persisted by 01_Bronze_Ingestion.py.
try:
    df_bronze = spark.read.table(FULL_BRONZE_PATH)
except Exception as e:
    print(f"ERROR: Could not read Bronze table '{FULL_BRONZE_PATH}'.")
    print(f"Ensure 01_Bronze_Ingestion.py ran successfully. Error details: {e}")
    # Raise instead of calling dbutils.notebook.exit(): exit() finishes the
    # notebook with a *successful* status, so a job run would look green and
    # downstream tasks would still start even though Bronze was never read.
    # Chaining with "from e" keeps the original Spark error in the traceback.
    raise RuntimeError("Failed to read Bronze table.") from e


print("--- 2. TRANSFORMING TO SILVER (Data Quality) ---")

# --- 2. TRANSFORMATION (SILVER) ---
# Projects the Bronze columns into the Silver layout and drops every row whose
# year cannot be converted to an integer.
df_silver = df_bronze.select(
    # Renaming columns for clarity
    col("Title").alias("movie_title"),
    col("imdbID").alias("imdb_id"),

    # Data Quality (DQ): take the first 4 characters of the trimmed year and
    # convert them to an integer. try_cast returns NULL for unparsable text
    # even when ANSI SQL mode is enabled; a plain cast would raise there and
    # abort the run before the filter below could discard the bad row.
    expr("try_cast(substring(trim(`Year`), 1, 4) AS INT)").alias("release_year"),

    # Standardization: Trim whitespace from the Type
    trim(col("Type")).alias("media_type"),

    # Carry over audit columns
    col("ingestion_timestamp"),
    col("search_query"),
).filter(
    # Quality Filter: keep only rows whose year conversion succeeded
    col("release_year").isNotNull()
)

# count() is an action: it executes the Silver transformation to get the total.
clean_record_count = df_silver.count()
print(f"Silver DataFrame ready with {clean_record_count} clean records.")

# --- 3. PERSISTENCE (SAVES SILVER) ---
# Replace the Silver table (rows and schema) with the freshly cleaned data.
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable(FULL_SILVER_PATH)

print(f"Silver Table '{FULL_SILVER_PATH}' saved successfully.")