In [None]:
%run ../config/load_config

In [None]:
%run ../common/transformations

In [None]:
%run ../common/data_quality

In [None]:
from pyspark.sql.functions import *

# Name of the silver table and the storage location backing it.
target_table = "lines_sv"
silver_table_path = get_storage_path("silver", target_table)

# (column name, SQL type) pairs for the silver table, in declaration order.
silver_column_types = [
    ("line_id", "STRING"),
    ("service_type", "STRING"),
    ("severity_code", "BIGINT"),
    ("severity_description", "STRING"),
    ("disruption_category", "STRING"),
    ("disruption_description", "STRING"),
    ("disruption_from_date", "TIMESTAMP"),
    ("disruption_to_date", "TIMESTAMP"),
    ("is_service_disrupted", "BOOLEAN"),
    ("event_timestamp", "TIMESTAMP"),
]
column_definitions = ",\n".join(
    f"    {column_name} {sql_type}" for column_name, sql_type in silver_column_types
)

# Ensure silver table exists; IF NOT EXISTS makes this a no-op when the table
# is already registered.
create_table_ddl = (
    f"\nCREATE TABLE IF NOT EXISTS {catalog}.{schema_silver}.{target_table}(\n"
    f"{column_definitions}\n"
    ") \n"
    f"LOCATION '{silver_table_path}'\n"
)
spark.sql(create_table_ddl)

# Transform and load silver table
source_table = get_table_name(schema_bronze, "lines_bz")

# Only element 0 of each nested array is projected into the silver columns.
first_service_type = get(col("serviceTypes"), 0)
first_status = get(col("lineStatuses"), 0)
first_validity_period = get(first_status["validityPeriods"], 0)
disruption = first_status["disruption"]

# A line is flagged as disrupted when its disruption description is present
# and not the empty string.
# NOTE(review): the flag is computed before trim_strings is applied to the
# result, so a whitespace-only description may still be flagged — confirm.
has_disruption_text = disruption["description"].isNotNull() & (
    disruption["description"] != ""
)

projected_columns = [
    col("id").alias("line_id"),
    first_service_type["name"].alias("service_type"),
    first_status["statusSeverity"].alias("severity_code"),
    first_status["statusSeverityDescription"].alias("severity_description"),
    disruption["category"].alias("disruption_category"),
    disruption["description"].alias("disruption_description"),
    to_timestamp(first_validity_period["fromDate"]).alias("disruption_from_date"),
    to_timestamp(first_validity_period["toDate"]).alias("disruption_to_date"),
    when(has_disruption_text, lit(True))
    .otherwise(lit(False))
    .alias("is_service_disrupted"),
    col("created").cast("timestamp").alias("event_timestamp"),
]

df_transformed = spark.read.table(source_table).select(*projected_columns)

# Clean data
# The same two columns are used as the de-duplication key and are handed to
# the quality check as its not_null_columns.
record_key_columns = ["line_id", "event_timestamp"]

df_cleaned = trim_strings(df_transformed)
df_deduped = df_cleaned.dropDuplicates(record_key_columns)

df_quality = add_quality_flag(df_deduped, not_null_columns=record_key_columns)
df_silver = add_transformation_metadata(df_quality)

# Write to silver table
silver_table_name = f"{catalog}.{schema_silver}.{target_table}"

# Overwrite mode with overwriteSchema enabled: each run replaces the table
# contents and is allowed to replace the table schema as well.
delta_writer = (
    df_silver.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)

# NOTE(review): for a batch DataFrameWriter, saveAsTable presumably returns
# None, so `query` is not a handle to anything — confirm nothing relies on it.
query = delta_writer.saveAsTable(silver_table_name)