# Merge Wiener Linien Historical Data with Station HW 25

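This notebook merges Wiener Linien historical incidents with weather data from station HW 25
using a backward as-of merge on `timestamp`: each incident is paired with the last weather reading taken at or before it. Incidents that predate the first weather reading get no weather values (NaN).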

Filters applied:
- Ignore records where `PartitionKey == "stoerunglang"`
- Ignore records where `category == "stoerunglang"`


In [1]:
import json
import os

import pandas as pd
from pymongo import MongoClient


In [2]:
# Connection settings. Each value is read from the named environment variable
# and falls back to the default given as the second argument when it is unset.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://mongodb:27017")
DB_NAME = os.environ.get("MONGO_DB", "big_data_austria")
WL_COLLECTION = os.environ.get("WL_HIST_COLLECTION", "wienerlinien_historical")
HW_COLLECTION = os.environ.get("HW_COLLECTION", "Station_HW_25")


In [3]:
# Column names that may hold an incident's time, in priority order: the first
# entry that matches a column (case-insensitively) is the one that gets used,
# so the order of this list matters.
TIMESTAMP_CANDIDATES = [
    # generic / start-of-event names
    "timestamp", "time_start", "timestart", "time",
    "eventTime", "event_time",
    "startTime", "start_time",
    "fromTime", "from_time", "from",
    # observation-window and end-of-event names
    "firstSeen", "lastSeen",
    "endDate", "resumeDate",
    # record bookkeeping names
    "createdAt", "created_at",
    "lastUpdate", "last_update",
]


def _parse_data(value):
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return {}
    return {}


def _normalize_wl(df):
    """Expand the nested ``data`` column into flat top-level columns.

    Each ``data`` value is passed through ``_parse_data`` and flattened with
    ``pd.json_normalize``; nested keys are joined with ``_``. The original
    ``data`` column is dropped and the flattened columns are appended on the
    right.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a ``data`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has no ``data`` column, otherwise a new frame
        with the same rows and index.

    Notes
    -----
    NOTE(review): a flattened key that equals an existing column name yields
    two columns with the same label; this function does not rename them.
    """
    if "data" not in df.columns:
        return df
    # Pass a plain list so the result is numbered 0..n-1 regardless of how the
    # installed pandas treats Series input.
    payloads = [_parse_data(value) for value in df["data"]]
    data_df = pd.json_normalize(payloads, sep="_")
    # concat(axis=1) aligns on index labels, so give the flattened rows the
    # labels of the rows they came from. Without this, a frame whose index is
    # not 0..n-1 would be paired with the wrong payloads.
    data_df.index = df.index
    return pd.concat([df.drop(columns=["data"]), data_df], axis=1)


def _filter_stoerunglang(df):
    if "PartitionKey" in df.columns:
        df = df[~df["PartitionKey"].fillna("").str.lower().eq("stoerunglang")]
    if "category" in df.columns:
        df = df[~df["category"].fillna("").str.lower().eq("stoerunglang")]
    return df


def _pick_timestamp_column(df, candidates):
    for candidate in candidates:
        for col in df.columns:
            if col.lower() == candidate.lower():
                return col
    return None


def _to_timestamp(series):
    return pd.to_datetime(series, errors="coerce", utc=True)


In [4]:
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Read every document of both collections into memory, leaving out Mongo's
# internal `_id` field so it does not become a DataFrame column.
without_id = {"_id": 0}
wl_docs = list(db[WL_COLLECTION].find({}, without_id))
hw_docs = list(db[HW_COLLECTION].find({}, without_id))

wl_df, hw_df = pd.DataFrame(wl_docs), pd.DataFrame(hw_docs)

print(f"Wiener Linien records: {len(wl_df)}")
print(f"HW 25 records: {len(hw_df)}")


Wiener Linien records: 31112
HW 25 records: 8641


In [5]:
# Flatten the nested `data` payload, then drop the `stoerunglang` records.
wl_df = _normalize_wl(wl_df)
wl_df = _filter_stoerunglang(wl_df)

# Pick the incident-time column: first TIMESTAMP_CANDIDATES entry that exists.
wl_timestamp_col = _pick_timestamp_column(wl_df, TIMESTAMP_CANDIDATES)
if wl_timestamp_col is None:
    # map(str, ...) keeps the message buildable even if a column label is not a string.
    raise ValueError(
        "No timestamp column found in Wiener Linien data. Columns: "
        + ", ".join(map(str, wl_df.columns))
    )

# After filtering, wl_df is a row subset of the normalised frame. Writing a
# column into it with wl_df["timestamp"] = ... can raise SettingWithCopyWarning,
# so build a new frame with assign() instead. Rows whose time could not be
# parsed (NaT) are dropped, and the result is sorted as merge_asof requires.
wl_df = (
    wl_df
    .assign(timestamp=_to_timestamp(wl_df[wl_timestamp_col]))
    .dropna(subset=["timestamp"])
    .sort_values("timestamp")
)

wl_df.head()


Unnamed: 0,RowKey,PartitionKey,category,dataHash,endDate,exportDate,firstSeen,imported_at,lastSeen,migration,...,attributes_relatedLineTypes_61A,attributes_relatedLineTypes_N46,attributes_relatedLineTypes_30A,attributes_relatedLineTypes_98A,attributes_relatedLineTypes_84A,attributes_relatedLineTypes_N8,attributes_relatedLineTypes_42A,attributes_relatedLineTypes_89A,attributes_relatedLineTypes_5A,timestamp
28039,ivu_664_-2_8a5f805b,stoerungkurz,stoerungkurz,8a5f805b,2025-11-11T12:20:00.000+0100,2025-12-16T13:39:15.722843+01:00,2023-08-08T11:20:00.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-11T12:20:00+01:00,True,...,,,,,,,,,,2023-08-08 10:20:00+00:00
29203,ivu_1458_149_d7a5e9cc,stoerungkurz,stoerungkurz,d7a5e9cc,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,,,,,,,,2025-10-28 23:00:04+00:00
29204,ivu_1462_149_a9865562,stoerungkurz,stoerungkurz,a9865562,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,,,,,,,,2025-10-28 23:00:04+00:00
29205,ivu_1462_549_f7e681cf,stoerungkurz,stoerungkurz,f7e681cf,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,,,,,,,,2025-10-28 23:00:04+00:00
28898,ivu_708_413_401cf175,stoerungkurz,stoerungkurz,401cf175,2025-11-06T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:05.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-06T23:59:59+01:00,True,...,,,,,,,,,,2025-10-28 23:00:05+00:00


In [6]:
# The weather frame must already carry a `timestamp` column; fail early with
# the available column names otherwise.
if "timestamp" not in hw_df.columns:
    raise ValueError("HW 25 data has no 'timestamp' column. Columns: " + ", ".join(hw_df.columns))

# Parse to UTC datetimes, drop unparseable rows, and sort by time so the frame
# is ready for the as-of merge.
hw_df = (
    hw_df
    .assign(timestamp=_to_timestamp(hw_df["timestamp"]))
    .dropna(subset=["timestamp"])
    .sort_values("timestamp")
)

hw_df.head()


Unnamed: 0,timestamp,RR,TL,P,FF,SO,RF,station_id
0,2025-11-01 00:00:00+00:00,0.0,8.7,993.5,0.3,0.0,78.0,HW_25
1,2025-11-01 00:10:00+00:00,0.0,8.5,993.5,0.5,0.0,79.0,HW_25
2,2025-11-01 00:20:00+00:00,0.0,8.5,993.7,1.2,0.0,80.0,HW_25
3,2025-11-01 00:30:00+00:00,0.0,8.3,993.6,0.6,0.0,82.0,HW_25
4,2025-11-01 00:40:00+00:00,0.0,8.3,993.6,0.4,0.0,82.0,HW_25


In [7]:
# Backward as-of join: every incident row (left) receives the weather reading
# (right) with the latest timestamp that is not after the incident's own.
# No tolerance is given, so the matched reading may be arbitrarily old, and
# incidents earlier than the first reading get NaN in the weather columns.
merged = pd.merge_asof(
    left=wl_df,
    right=hw_df,
    on="timestamp",
    direction="backward",
)

merged.head()


Unnamed: 0,RowKey,PartitionKey,category,dataHash,endDate,exportDate,firstSeen,imported_at,lastSeen,migration,...,attributes_relatedLineTypes_89A,attributes_relatedLineTypes_5A,timestamp,RR,TL,P,FF,SO,RF,station_id
0,ivu_664_-2_8a5f805b,stoerungkurz,stoerungkurz,8a5f805b,2025-11-11T12:20:00.000+0100,2025-12-16T13:39:15.722843+01:00,2023-08-08T11:20:00.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-11T12:20:00+01:00,True,...,,,2023-08-08 10:20:00+00:00,,,,,,,
1,ivu_1458_149_d7a5e9cc,stoerungkurz,stoerungkurz,d7a5e9cc,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,2025-10-28 23:00:04+00:00,,,,,,,
2,ivu_1462_149_a9865562,stoerungkurz,stoerungkurz,a9865562,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,2025-10-28 23:00:04+00:00,,,,,,,
3,ivu_1462_549_f7e681cf,stoerungkurz,stoerungkurz,f7e681cf,2025-11-05T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:04.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-05T23:59:59+01:00,True,...,,,2025-10-28 23:00:04+00:00,,,,,,,
4,ivu_708_413_401cf175,stoerungkurz,stoerungkurz,401cf175,2025-11-06T23:59:59.000+0100,2025-12-16T13:39:15.722843+01:00,2025-10-29T00:00:05.000+0100,2026-01-14T21:59:38.865500+00:00,2025-11-06T23:59:59+01:00,True,...,,,2025-10-28 23:00:05+00:00,,,,,,,


In [8]:
# Persist merged data to CSV and MongoDB
output_path = os.environ.get("MERGED_CSV_PATH", "merged_wienerlinien_hw25.csv")
merged.to_csv(output_path, index=False)
print(f"CSV written: {output_path}")

OUTPUT_COLLECTION = os.environ.get("MERGED_COLLECTION", "wienerlinien_hw25_merged")

# Work on a copy so `merged` keeps its timezone-aware timestamps.
merged_out = merged.copy()
if (
    "timestamp" in merged_out.columns
    and pd.api.types.is_datetime64_any_dtype(merged_out["timestamp"])
    and merged_out["timestamp"].dt.tz is not None
):
    # Store the timestamps as naive datetimes expressed in UTC.
    merged_out["timestamp"] = merged_out["timestamp"].dt.tz_convert("UTC").dt.tz_localize(None)

if not merged_out.empty:
    # NOTE(review): insert_many only adds documents; running this cell again
    # inserts the same rows a second time. The CSV above is overwritten instead.
    db[OUTPUT_COLLECTION].insert_many(merged_out.to_dict(orient="records"))
print(f"MongoDB written: {OUTPUT_COLLECTION} ({len(merged_out)} records)")


CSV written: merged_wienerlinien_hw25.csv


  db[OUTPUT_COLLECTION].insert_many(merged_out.to_dict(orient="records"))


MongoDB written: wienerlinien_hw25_merged (12694 records)
