In [0]:
import os

from pyspark.sql import functions as F

# Target ADLS Gen2 account and container for the silver layer.
storage_account = "sbbapistorageaccount"
container = "data-container"

# The storage account key is a secret and must not be stored in the notebook.
# It is read from the SBB_STORAGE_ACCOUNT_KEY environment variable instead
# (falls back to "" when the variable is not set).
account_key = os.environ.get("SBB_STORAGE_ACCOUNT_KEY", "")

# Only register the key when one was actually supplied: writing an empty value
# into the session configuration would override any credential that is already
# configured for this account outside the notebook.
if account_key:
    spark.conf.set(
        f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
        account_key,
    )

# -------------------------------
# 1️⃣ Load Bronze
# -------------------------------
# Read the bronze table registered in the metastore as `bronze_sbb`.
df_bronze = spark.table("bronze_sbb")

In [0]:

# -------------------------------
# 2️⃣ Rename columns to English
# -------------------------------
# Bronze column name -> English silver column name.
# Columns that are not listed here keep their original name.
rename_cols = {
    # Departure
    "ab_prognose": "departure_forecast",
    "ab_prognose_status": "departure_status",
    "abfahrtsverspatung": "departure_delay",
    "abfahrtszeit": "scheduled_departure",
    # Arrival
    "an_prognose": "arrival_forecast",
    "an_prognose_status": "arrival_status",
    "ankunftsverspatung": "arrival_delay",
    "ankunftszeit": "scheduled_arrival",
    # Operator
    "betreiber_abk": "operator_abbr",
    "betreiber_id": "operator_id",
    "betreiber_name": "operator_name",
    # Trip / line
    "betriebstag": "operation_day",
    "fahrt_bezeichner": "trip_id",
    "linien_id": "line_id",
    "linien_text": "line_text",
    "produkt_id": "product_id",
    "umlauf_id": "rotation_id",
    "verkehrsmittel_text": "transport_mode",
    # Station
    "bpuic": "station_id",
    "geopos": "geo_position",
    "haltestellen_name": "station_name",
    # Flags
    "durchfahrt_tf": "through_train_flag",
    "faellt_aus_tf": "canceled_flag",
    "zusatzfahrt_tf": "extra_trip_flag",
    # Miscellaneous
    "lod": "lod_url",
    "total_count": "total_count",
}


def _english_alias(source_name):
    """Return the bronze column aliased to its English name (same name if unmapped)."""
    return F.col(source_name).alias(rename_cols.get(source_name, source_name))


# Column order of the bronze table is preserved; only the names change.
df_silver = df_bronze.select(*map(_english_alias, df_bronze.columns))

In [0]:
# -------------------------------
# 3️⃣ Cleaning: Keep only IR75, year 2025
# -------------------------------
# A row is kept only when both predicates evaluate to true.
# NOTE(review): rows whose scheduled_departure is null do not satisfy the year
# test and are therefore dropped as well — confirm this is intended.
is_ir75 = F.col("line_text") == "IR75"
departs_in_2025 = F.year("scheduled_departure") == 2025

df_silver = df_silver.filter(is_ir75 & departs_in_2025)

In [0]:

# -------------------------------
# 4️⃣ Drop useless columns
# -------------------------------
# Columns that are not carried over into the silver table.
cols_to_drop = [
    "lod_url",
    "total_count",
    "through_train_flag",
    "extra_trip_flag",
]

df_silver = df_silver.drop(*cols_to_drop)

In [0]:

# -------------------------------
# 5️⃣ Enrich / Cast types
# -------------------------------
# Parse the scheduled times into timestamps and the operation day into a date.
df_silver = (
    df_silver
    .withColumn("scheduled_departure", F.to_timestamp("scheduled_departure"))
    .withColumn("scheduled_arrival", F.to_timestamp("scheduled_arrival"))
    .withColumn("operation_day", F.to_date("operation_day"))
)

# Derive a 0/1 departure-delay indicator from `departure_delay`:
#   NULL     -> NULL (delay unknown; must not be counted as delayed)
#   "false"  -> 0
#   anything else -> 1
# ⚠️ Despite its name, `delay_minutes` holds a flag, not a duration. The column
# name is kept so that existing consumers of the silver table keep working.
# NOTE(review): this assumes `departure_delay` is a true/false value — confirm
# against the bronze schema; if it holds actual minutes, use the value directly.
df_silver = df_silver.withColumn(
    "delay_minutes",
    F.when(F.col("departure_delay").isNull(), F.lit(None).cast("int"))
     .when(F.col("departure_delay") == "false", F.lit(0))
     .otherwise(F.lit(1)),
)

In [0]:
# -------------------------------
# 6️⃣ Save Silver as Delta
# -------------------------------
# Destination folder of the silver dataset inside the storage container.
silver_path = (
    f"abfss://{container}@{storage_account}.dfs.core.windows.net/silver/sbb"
)

# Write the frame in Delta format, overwriting what is stored at that path.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path)
)

# Register the location as SQL table `silver_sbb` unless the table already exists.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS silver_sbb
USING DELTA
LOCATION '{silver_path}'
""")

print("✅ Silver SBB table created successfully")

In [0]:
# -------------------------------
# 7️⃣ Quick check
# -------------------------------
# Print the first five rows without truncating cell values, then the schema.
df_silver.show(n=5, truncate=False)
df_silver.printSchema()