In [None]:
%run ../config/load_config

In [None]:
from pyspark.sql.functions import *

# Destination silver table and the path used in its LOCATION clause.
target_table = "arrival_events_sv"
silver_table_path = get_storage_path("silver", target_table)

# Idempotent DDL: the table is created at silver_table_path only when it is
# not already registered; an existing table is left untouched.
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {catalog}.{schema_silver}.{target_table}(
    arrival_event_id BIGINT, 
    line_id STRING,
    vehicle_id STRING,
    naptan_id STRING,
    station_name STRING,
    platform_name STRING,
    direction STRING,
    destination_name STRING,
    time_to_station BIGINT,
    expected_arrival TIMESTAMP,
    time_to_live TIMESTAMP,
    is_service_disrupted BOOLEAN,
    severity_code BIGINT, 
    severity_description STRING, 
    event_timestamp TIMESTAMP
)    
LOCATION '{silver_table_path}'
"""
spark.sql(create_table_sql)

# Derive arrival_events_sv by enriching each arrival in arrivals_sv with the
# line status from lines_sv recorded within +/- 30 seconds of the arrival's
# event_timestamp, then append rows whose (arrival_event_id, event_timestamp)
# key is not yet present in the target.
#
# Why the ROW_NUMBER() step: the time-window join can match one arrival to
# several lines_sv snapshots. An insert-only MERGE does not de-duplicate its
# source, so without this step every fan-out row would be inserted and the
# target would hold duplicate keys. We keep exactly one row per merge key,
# preferring the line-status snapshot closest in time to the arrival (ties
# broken by the most recent snapshot). Arrivals with no matching snapshot
# still yield a single row with NULL severity fields and
# is_service_disrupted = FALSE.
merge_sql = f"""
MERGE INTO {catalog}.{schema_silver}.{target_table} AS target
USING (
    SELECT
        arrival_event_id,
        line_id,
        vehicle_id,
        naptan_id,
        station_name,
        platform_name,
        direction,
        destination_name,
        time_to_station,
        expected_arrival,
        time_to_live,
        is_service_disrupted,
        severity_code,
        severity_description,
        event_timestamp
    FROM (
        SELECT
            ba.arrival_id AS arrival_event_id,
            ba.line_id,
            ba.vehicle_id,
            ba.naptan_id,
            ba.station_name,
            ba.platform_name,
            ba.direction,
            ba.destination_name,
            ba.time_to_station,
            ba.expected_arrival,
            ba.time_to_live,
            COALESCE(ls.is_service_disrupted, FALSE) AS is_service_disrupted,
            ls.severity_code,
            ls.severity_description,
            ba.event_timestamp,
            ROW_NUMBER() OVER (
                PARTITION BY ba.arrival_id, ba.event_timestamp
                ORDER BY
                    ABS(unix_timestamp(ba.event_timestamp) - unix_timestamp(ls.event_timestamp)) ASC,
                    ls.event_timestamp DESC
            ) AS match_rank
        FROM {catalog}.{schema_silver}.arrivals_sv ba
        LEFT JOIN {catalog}.{schema_silver}.lines_sv ls
            ON ba.line_id = ls.line_id
            AND ba.event_timestamp BETWEEN
                ls.event_timestamp - INTERVAL 30 SECONDS
            AND ls.event_timestamp + INTERVAL 30 SECONDS
    ) ranked
    WHERE match_rank = 1
) AS source
    ON target.arrival_event_id = source.arrival_event_id
    AND target.event_timestamp   = source.event_timestamp
WHEN NOT MATCHED THEN INSERT *
"""
# The outer SELECT lists only the target's columns so the helper match_rank
# column never reaches INSERT *.
spark.sql(merge_sql)