In [0]:
%run ./01-config

In [0]:
# Create a VIEW for line reliability KPI in the Gold layer
def create_line_reliability_kpi() -> None:
    """Create or replace the Gold view ``line_reliability_kpi``.

    The view aggregates ``{schema_silver}.bus_arrival_events`` per ``line_id``
    and per calendar date of ``event_timestamp`` and exposes:

    * ``total_arrivals``     - number of rows in the group.
    * ``disrupted_arrivals`` - rows whose ``is_service_disrupted`` is true
      (a NULL flag falls into the ``ELSE 0`` branch, i.e. is not counted).
    * ``reliability_ratio``  - ``1 - disrupted / total``, rounded to 4 decimals.
    * ``sla_status``         - ``'meets_SLA'`` when the *unrounded* ratio is
      at least 0.95, otherwise ``'breach'``.

    Relies on the globals ``spark``, ``schema_gold`` and ``schema_silver``,
    which are not defined in this file (presumably supplied by the
    ``%run ./01-config`` cell and the notebook runtime - confirm).
    """
    # Progress line is left open (end="") and closed by "Done" below.
    print(f"Creating VIEW {schema_gold}.line_reliability_kpi...", end="")

    view_ddl = f"""
        CREATE OR REPLACE VIEW {schema_gold}.line_reliability_kpi AS
        SELECT
            line_id,
            DATE(event_timestamp) AS service_date,
            COUNT(*) AS total_arrivals,
            SUM(CASE WHEN is_service_disrupted THEN 1 ELSE 0 END) AS disrupted_arrivals,
            ROUND(1 - (SUM(CASE WHEN is_service_disrupted THEN 1 ELSE 0 END) / COUNT(*)),4) AS reliability_ratio,
            CASE
                WHEN (1 - (SUM(CASE WHEN is_service_disrupted THEN 1 ELSE 0 END) / COUNT(*))) >= 0.95
                THEN 'meets_SLA'
                ELSE 'breach'
            END AS sla_status
        FROM {schema_silver}.bus_arrival_events
        GROUP BY line_id, DATE(event_timestamp)
    """
    spark.sql(view_ddl)

    print("Done")


# Create a VIEW for line regularity KPI in the Gold layer
def create_line_regularity_kpi() -> None:
    """Create or replace the Gold view ``line_regularity_kpi``.

    The view aggregates ``{schema_silver}.bus_arrival_events`` per ``line_id``
    and per calendar date of ``event_timestamp`` and exposes:

    * ``avg_wait_seconds``      - ``AVG(time_to_station)``.
    * ``wait_time_variability`` - ``STDDEV(time_to_station)``.
    * ``regularity_status``     - ``'good'`` when the standard deviation is
      <= 120, ``'acceptable'`` when it is <= 300, otherwise ``'poor'``.

    NOTE(review): when ``STDDEV`` evaluates to NULL (e.g. presumably for a
    group with a single arrival) neither ``WHEN`` matches and the row is
    labelled ``'poor'`` - confirm that this is the intended classification.

    Relies on the globals ``spark``, ``schema_gold`` and ``schema_silver``,
    which are not defined in this file (presumably supplied by the
    ``%run ./01-config`` cell and the notebook runtime - confirm).
    """
    # Progress line is left open (end="") and closed by "Done" below.
    print(f"Creating VIEW {schema_gold}.line_regularity_kpi...", end="")

    view_ddl = f"""
        CREATE OR REPLACE VIEW {schema_gold}.line_regularity_kpi AS
        SELECT
            line_id,
            DATE(event_timestamp) AS service_date,
            AVG(time_to_station) AS avg_wait_seconds,
            STDDEV(time_to_station) AS wait_time_variability,
            CASE
                WHEN STDDEV(time_to_station) <= 120 THEN 'good'
                WHEN STDDEV(time_to_station) <= 300 THEN 'acceptable'
                ELSE 'poor'
            END AS regularity_status
        FROM {schema_silver}.bus_arrival_events
        GROUP BY line_id, DATE(event_timestamp)
    """
    spark.sql(view_ddl)

    print("Done")
    
    
# Create a VIEW for line disruption impact KPI in the Gold layer
def create_line_disruption_impact_kpi() -> None:
    """Create or replace the Gold view ``line_disruption_impact_kpi``.

    The view reads ``{schema_silver}.line_disruption_geo``, keeps only rows
    with ``is_service_disrupted = true`` and groups them by ``line_id`` and
    ``borough_name``. Per group it exposes:

    * ``disruption_count``          - distinct ``disruption_description`` values.
    * ``service_disruption_events`` - number of rows.
    * ``max_severity_code``         - ``MAX(severity_code)``.
    * ``disruption_impact_score``   - product of the three values above,
      rounded to 0 decimals.
    * ``impact_level``              - ``'CRITICAL'`` when the max severity is
      >= 10, ``'HIGH'`` when it is >= 5, otherwise ``'LOW'`` (a NULL max
      severity also ends up in the ``ELSE`` branch).

    Relies on the globals ``spark``, ``schema_gold`` and ``schema_silver``,
    which are not defined in this file (presumably supplied by the
    ``%run ./01-config`` cell and the notebook runtime - confirm).
    """
    # Progress line is left open (end="") and closed by "Done" below.
    print(f"Creating VIEW {schema_gold}.line_disruption_impact_kpi...", end="")

    view_ddl = f"""
        CREATE OR REPLACE VIEW {schema_gold}.line_disruption_impact_kpi AS
        SELECT
            line_id,
            borough_name,
            COUNT(DISTINCT disruption_description) AS disruption_count,
            COUNT(*) AS service_disruption_events,
            MAX(severity_code) AS max_severity_code,
            ROUND(
                COUNT(DISTINCT disruption_description)
                * COUNT(*)
                * MAX(severity_code),
                0
            ) AS disruption_impact_score,
            CASE
                WHEN MAX(severity_code) >= 10 THEN 'CRITICAL'
                WHEN MAX(severity_code) >= 5 THEN 'HIGH'
                ELSE 'LOW'
            END AS impact_level
        FROM {schema_silver}.line_disruption_geo
        WHERE is_service_disrupted = true
        GROUP BY line_id, borough_name;
    """
    spark.sql(view_ddl)

    print("Done")


# Create a VIEW for borough service equity KPI in the Gold layer
def create_borough_service_equity_kpi() -> None:
    """Create or replace the Gold view ``borough_service_equity_kpi``.

    The view is built on top of another Gold view,
    ``{schema_gold}.line_disruption_impact_kpi``, grouped by ``borough_name``.
    Per borough it exposes:

    * ``affected_lines``     - distinct ``line_id`` values.
    * ``total_disruptions``  - sum of ``service_disruption_events``.
    * ``avg_severity``       - average of ``max_severity_code``.
    * ``disruption_density`` - ``total_disruptions / affected_lines``,
      rounded to 2 decimals.

    NOTE(review): if every ``line_id`` in a borough were NULL the divisor
    would be 0; the outcome then depends on the session's SQL settings -
    confirm whether that case can occur.

    Relies on the globals ``spark`` and ``schema_gold``, which are not defined
    in this file (presumably supplied by the ``%run ./01-config`` cell and the
    notebook runtime - confirm).
    """
    # Progress line is left open (end="") and closed by "Done" below.
    print(f"Creating VIEW {schema_gold}.borough_service_equity_kpi...", end="")

    view_ddl = f"""
        CREATE OR REPLACE VIEW {schema_gold}.borough_service_equity_kpi AS
        SELECT
            borough_name,
            COUNT(DISTINCT line_id) AS affected_lines,
            SUM(service_disruption_events) AS total_disruptions,
            AVG(max_severity_code) AS avg_severity,
            ROUND(SUM(service_disruption_events) / COUNT(DISTINCT line_id),2) AS disruption_density
        FROM {schema_gold}.line_disruption_impact_kpi
        GROUP BY borough_name;
    """
    spark.sql(view_ddl)

    print("Done")


# Orchestrate creation of all Gold layer KPI VIEWs
def create_gold_kpis() -> None:
    """Build all four Gold KPI views in sequence and report the elapsed time.

    Prints a start banner, calls each ``create_*_kpi`` function in turn and
    finally prints the wall-clock duration in whole seconds.
    """
    import time

    began_at = int(time.time())
    print("\nExecuting gold layer KPI creation...")

    # Line-level views over the Silver tables.
    create_line_reliability_kpi()
    create_line_regularity_kpi()
    create_line_disruption_impact_kpi()
    # Reads from line_disruption_impact_kpi, so it is created after it.
    create_borough_service_equity_kpi()

    elapsed_seconds = int(time.time()) - began_at
    print(f"✅ Completed gold layer in {elapsed_seconds} seconds")
    

# Validate existence and row count of all Gold layer KPI VIEWs
def validate_gold_kpis() -> None:
    """Check that every Gold KPI view exists and returns at least one row.

    For each expected view name the function verifies that it is listed as a
    non-temporary entry by ``SHOW TABLES IN {schema_gold}`` and that reading
    it yields a non-zero row count, printing one ``OK`` line per view and the
    total wall-clock duration in whole seconds at the end.

    Relies on the globals ``spark`` and ``schema_gold``, which are not defined
    in this file (presumably supplied by the ``%run ./01-config`` cell and the
    notebook runtime - confirm).

    Raises:
        AssertionError: if an expected view is not listed in ``schema_gold``
            or if it contains no rows. The exception is raised explicitly
            (rather than via ``assert``) so the checks still run when Python
            is started with ``-O``.
    """
    import time
    start = int(time.time())
    print("\nValidating Gold KPIs...")

    tables = ["line_reliability_kpi","line_regularity_kpi","line_disruption_impact_kpi","borough_service_equity_kpi"]

    # The catalog listing does not depend on the loop variable, so fetch it
    # once instead of issuing one SHOW TABLES per expected view.
    existing = {
        row["tableName"]
        for row in spark.sql(f"SHOW TABLES IN {schema_gold}")
        .filter("isTemporary=false")
        .collect()
    }

    for t in tables:
        if t not in existing:
            raise AssertionError(f"Missing table: {schema_gold}.{t}")

        rows = spark.table(f"{schema_gold}.{t}").count()
        if rows == 0:
            raise AssertionError(f"Empty table: {schema_gold}.{t}")

        print(f"{t}: OK ({rows:,} rows)")

    print(f"✅ Gold KPI validation completed in {int(time.time()) - start} seconds")