# Merge Wiener Linien Historical Data with Station HW 25

This notebook merges Wiener Linien historical incidents with station HW 25 weather data
using an as-of merge on `timestamp` (last known weather at or before the incident).

Filters applied:
- Ignore records where `PartitionKey` equals "stoerunglang" (compared case-insensitively)
- Ignore records where `category` equals "stoerunglang" (compared case-insensitively)


In [None]:
import json
import os

import pandas as pd
from pymongo import MongoClient


In [None]:
# Connection settings. Each value is read from the named environment variable
# and falls back to the literal default when that variable is not set.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://mongodb:27017")
DB_NAME = os.environ.get("MONGO_DB", "big_data_austria")

# Names of the two source collections that are loaded and merged.
WL_COLLECTION = os.environ.get("WL_HIST_COLLECTION", "wienerlinien_historical")
HW_COLLECTION = os.environ.get("HW_COLLECTION", "Station_HW_25")


In [None]:
# Column names tried, in priority order, when looking for the timestamp column
# of the Wiener Linien data. `_pick_timestamp_column` compares them
# case-insensitively and returns the first one that matches.
TIMESTAMP_CANDIDATES = [
    "timestamp",
    "time",
    "eventTime",
    "event_time",
    "startTime",
    "start_time",
    "fromTime",
    "from_time",
    "from",
    "createdAt",
    "created_at",
    "lastUpdate",
    "last_update",
]


def _parse_data(value):
    """Coerce a raw ``data`` field into a dict.

    Args:
        value: A dict, a JSON-encoded string, or any other object.

    Returns:
        ``value`` itself when it is already a dict; the decoded object when
        ``value`` is a string containing a JSON object; otherwise an empty
        dict (invalid JSON, valid JSON that is not an object, or any other
        input type).
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError:
            return {}
        # Valid JSON is not necessarily an object ("[1, 2]", "null", "42").
        # The result is flattened as a record, so only dicts are passed on.
        if isinstance(parsed, dict):
            return parsed
    return {}


def _normalize_wl(df):
    if "data" not in df.columns:
        return df
    data_series = df["data"].apply(_parse_data)
    data_df = pd.json_normalize(data_series, sep="_")
    return pd.concat([df.drop(columns=["data"]), data_df], axis=1)


def _filter_stoerunglang(df):
    """Remove rows marked as ``stoerunglang``.

    A row is dropped when its ``PartitionKey`` or its ``category`` value,
    lower-cased, equals ``"stoerunglang"``. Missing values are treated as
    empty strings and therefore kept. A column that is absent from ``df`` is
    skipped.

    Args:
        df: DataFrame to filter.

    Returns:
        The filtered DataFrame, or ``df`` itself when neither column exists.
    """
    for column in ("PartitionKey", "category"):
        if column not in df.columns:
            continue
        lowered = df[column].fillna("").str.lower()
        is_excluded = lowered.eq("stoerunglang")
        df = df[~is_excluded]
    return df


def _pick_timestamp_column(df, candidates):
    """Return the first column of ``df`` that matches a candidate name.

    Candidates are tried in the order given; for each one the columns are
    scanned in DataFrame order and compared case-insensitively.

    Args:
        df: DataFrame whose column names are searched.
        candidates: Iterable of acceptable column names, highest priority
            first.

    Returns:
        The matching column name exactly as it appears in ``df``, or ``None``
        when no candidate matches any column.
    """
    matches = (
        column
        for wanted in candidates
        for column in df.columns
        if column.lower() == wanted.lower()
    )
    return next(matches, None)


def _to_timestamp(series):
    """Convert ``series`` to UTC datetimes.

    Values that cannot be parsed become ``NaT`` instead of raising.
    """
    parsed = pd.to_datetime(series, utc=True, errors="coerce")
    return parsed


In [None]:
# Connect to MongoDB and select the configured database.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Fetch every document of both collections (empty filter), projecting out
# the "_id" field.
wl_docs = list(db[WL_COLLECTION].find({}, {"_id": 0}))
hw_docs = list(db[HW_COLLECTION].find({}, {"_id": 0}))

# One DataFrame row per fetched document.
wl_df = pd.DataFrame(wl_docs)
hw_df = pd.DataFrame(hw_docs)

# Report how many records were loaded from each source.
print(f"Wiener Linien records: {len(wl_df)}")
print(f"HW 25 records: {len(hw_df)}")


In [None]:
# Flatten the nested "data" payload, then drop the "stoerunglang" rows.
# The filter returns a boolean-indexed slice; copying it makes wl_df an
# independent frame so the "timestamp" assignment below does not raise
# pandas' SettingWithCopyWarning.
wl_df = _normalize_wl(wl_df)
wl_df = _filter_stoerunglang(wl_df).copy()

# Locate the incident timestamp column; fail with the available column names
# when none of the known candidates is present.
wl_timestamp_col = _pick_timestamp_column(wl_df, TIMESTAMP_CANDIDATES)
if wl_timestamp_col is None:
    raise ValueError(
        "No timestamp column found in Wiener Linien data. Columns: "
        + ", ".join(map(str, wl_df.columns))
    )

# Store the parsed UTC timestamps under the common "timestamp" key, discard
# rows whose timestamp could not be parsed, and sort by that key
# (merge_asof requires both inputs to be sorted on the merge key).
wl_df["timestamp"] = _to_timestamp(wl_df[wl_timestamp_col])
wl_df = wl_df.dropna(subset=["timestamp"]).sort_values("timestamp")

wl_df.head()


In [None]:
# The weather data must already expose a "timestamp" column; fail with the
# available column names when it does not.
if "timestamp" not in hw_df.columns:
    raise ValueError(
        "HW 25 data has no 'timestamp' column. Columns: "
        + ", ".join(map(str, hw_df.columns))
    )

# Parse to UTC datetimes, discard rows whose timestamp could not be parsed,
# and sort by that key (merge_asof requires both inputs to be sorted on the
# merge key).
hw_df["timestamp"] = _to_timestamp(hw_df["timestamp"])
hw_df = hw_df.dropna(subset=["timestamp"]).sort_values("timestamp")

hw_df.head()


In [None]:
# As-of join on "timestamp": each row of wl_df is paired with the most recent
# hw_df row whose timestamp is at or before the wl_df row's timestamp.
merged = pd.merge_asof(
    left=wl_df,
    right=hw_df,
    on="timestamp",
    direction="backward",
)

merged.head()


In [None]:
# Optional: persist the merged data
# output_path = "merged_wienerlinien_hw25.csv"
# merged.to_csv(output_path, index=False)
# output_path

# Optional: write to MongoDB
# OUTPUT_COLLECTION = os.getenv("MERGED_COLLECTION", "wienerlinien_hw25_merged")
# if not merged.empty:
#     db[OUTPUT_COLLECTION].insert_many(merged.to_dict(orient="records"))
# OUTPUT_COLLECTION
