# Analysis Layer

This notebook explores delay-weather relationships, time series patterns, duration analysis,
a simple predictive model, and text analytics.


In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# seaborn is an optional dependency: when the import fails, `sns` is bound to
# None so later cells can test `sns is not None` before using it.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid")
except ImportError:
    sns = None

# Let pandas print up to 50 columns and use a 120-character display width.
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)


In [None]:
def _find_path(candidates):
    for candidate in candidates:
        path = Path(candidate)
        if path.exists():
            return path
    return None

# File names of the two inputs; each can be overridden via an environment variable.
CLEANED_CSV = os.environ.get("CLEANED_CSV_PATH", "merged_wienerlinien_hw25.csv")
DAILY_CSV = os.environ.get("DAILY_CSV_PATH", "wienerlinien_hw25_daily.csv")

# Both files are looked up in the same four places, in this order: as given,
# under ./notebooks, under the current working directory, and under
# <cwd>/notebooks.
cleaned_path, daily_path = (
    _find_path([
        file_name,
        Path("notebooks") / file_name,
        Path.cwd() / file_name,
        Path.cwd() / "notebooks" / file_name,
    ])
    for file_name in (CLEANED_CSV, DAILY_CSV)
)

# Fail early with the configured name when a file cannot be located.
if cleaned_path is None:
    raise FileNotFoundError(f"Could not find cleaned CSV: {CLEANED_CSV}")
if daily_path is None:
    raise FileNotFoundError(f"Could not find daily CSV: {DAILY_CSV}")

# Timestamp columns are parsed while reading so later cells can use `.dt`.
daily_summary = pd.read_csv(daily_path, parse_dates=["date"])
cleaned = pd.read_csv(cleaned_path, parse_dates=["start_time", "end_time"])

print(f"Loaded cleaned: {cleaned.shape} from {cleaned_path}")
print(f"Loaded daily: {daily_summary.shape} from {daily_path}")
cleaned.head()


## Delay-Weather Relationship


In [None]:
def _plot_delay_scatter(column):
    """Show one scatter plot of ``delay_count`` against *column* of ``daily_summary``."""
    plt.figure(figsize=(6, 4))
    if sns is None:
        # Plain matplotlib fallback: axis labels have to be set by hand.
        plt.scatter(daily_summary[column], daily_summary["delay_count"], alpha=0.6)
        plt.xlabel(column)
        plt.ylabel("delay_count")
    else:
        sns.scatterplot(data=daily_summary, x=column, y="delay_count")
    plt.title(f"Delay Count vs {column}")
    plt.tight_layout()
    plt.show()


# Weather columns of interest; only those present in the daily table are used.
target_cols = ["RR", "FF", "TL"]
available_cols = [name for name in target_cols if name in daily_summary.columns]

if available_cols:
    for col in available_cols:
        _plot_delay_scatter(col)

    # Pairwise correlations between the delay count and the available weather columns.
    corr_cols = ["delay_count", *available_cols]
    print(daily_summary[corr_cols].corr(numeric_only=True))
else:
    print("No RR/FF/TL columns found in daily_summary.")


In [None]:
# Compare the distribution of daily delay counts on rainy vs dry days.
if "RR" in daily_summary.columns:
    rain_threshold = 0.0
    daily_summary["rainy_day"] = daily_summary["RR"] > rain_threshold
    # A missing RR value compares as False above and would be plotted as a
    # dry day; restrict the comparison to days that actually have a reading.
    rain_known = daily_summary[daily_summary["RR"].notna()]
    plt.figure(figsize=(6, 4))
    if sns is not None:
        sns.boxplot(data=rain_known, x="rainy_day", y="delay_count")
    else:
        groups = [
            rain_known.loc[rain_known["rainy_day"], "delay_count"].dropna(),
            rain_known.loc[~rain_known["rainy_day"], "delay_count"].dropna(),
        ]
        plt.boxplot(groups)
        # Label the boxes through xticks: the `labels=` keyword of boxplot is
        # deprecated since Matplotlib 3.9, while its replacement does not
        # exist in older releases. Boxes sit at positions 1..N by default.
        plt.xticks([1, 2], ["rainy", "dry"])
        plt.ylabel("delay_count")
    plt.title("Delay Count: Rainy vs Dry Days")
    plt.tight_layout()
    plt.show()
else:
    print("RR column not found; skipping rainy vs dry boxplot.")


## Time Series Analytics


In [None]:
# Date-ordered copy of the daily table plus a 7-day rolling mean of the delay
# count (min_periods=1, so the first rows average over fewer than 7 days).
ts = daily_summary.sort_values("date").assign(
    rolling_7d=lambda frame: frame["delay_count"].rolling(7, min_periods=1).mean()
)

# Raw daily counts and the smoothed series on one figure.
plt.figure(figsize=(10, 4))
plt.plot(ts["date"], ts["delay_count"], label="daily")
plt.plot(ts["date"], ts["rolling_7d"], label="7-day mean")
plt.xlabel("date")
plt.ylabel("delay_count")
plt.title("Daily Delay Count with 7-day Rolling Mean")
plt.legend()
plt.tight_layout()
plt.show()

# Mean delay count per weekday, reindexed into Monday..Sunday order.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
ts["weekday"] = ts["date"].dt.day_name()
weekday_summary = ts["delay_count"].groupby(ts["weekday"]).mean().reindex(weekday_order)

plt.figure(figsize=(8, 4))
weekday_summary.plot.bar()
plt.xlabel("weekday")
plt.ylabel("avg delay_count")
plt.title("Average Delay Count by Weekday")
plt.tight_layout()
plt.show()

# Days at or above the 99th percentile of delay_count, shown with whichever
# weather columns exist; the last expression is the cell's displayed output.
threshold = ts["delay_count"].quantile(0.99)
event_days = ts.loc[ts["delay_count"] >= threshold]
weather_cols = [name for name in ("RR", "TL", "P", "FF", "SO", "S0", "RF") if name in ts.columns]
display_cols = ["date", "delay_count", *weather_cols]
event_days[display_cols].sort_values("delay_count", ascending=False).head(10)


## Duration Analysis


In [None]:
# Delay duration in minutes, for rows that have both timestamps.
dur = cleaned.dropna(subset=["start_time", "end_time"]).copy()
dur["duration_min"] = (dur["end_time"] - dur["start_time"]).dt.total_seconds() / 60.0
# Discard negative durations. The explicit copy makes `dur` an independent
# frame, so adding `wind_bin` below does not raise SettingWithCopyWarning.
dur = dur[dur["duration_min"] >= 0].copy()

plt.figure(figsize=(6, 4))
plt.hist(dur["duration_min"].dropna(), bins=30)
plt.title("Distribution of Delay Duration (minutes)")
plt.xlabel("minutes")
plt.ylabel("count")
plt.tight_layout()
plt.show()

if "FF" in dur.columns:
    # NaN > 10 evaluates to False, so rows without an FF value would land in
    # "wind<=10"; give them no bin at all and keep them out of the plot.
    wind_label = np.where(dur["FF"] > 10, "wind>10", "wind<=10")
    dur["wind_bin"] = pd.Series(wind_label, index=dur.index).where(dur["FF"].notna())
    binned = dur.dropna(subset=["wind_bin"])
    plt.figure(figsize=(6, 4))
    if sns is not None:
        sns.boxplot(data=binned, x="wind_bin", y="duration_min")
    else:
        groups = [
            binned.loc[binned["wind_bin"] == "wind>10", "duration_min"].dropna(),
            binned.loc[binned["wind_bin"] == "wind<=10", "duration_min"].dropna(),
        ]
        plt.boxplot(groups)
        # Label through xticks: boxplot's `labels=` keyword is deprecated
        # since Matplotlib 3.9. Boxes sit at positions 1..N by default.
        plt.xticks([1, 2], ["wind>10", "wind<=10"])
        plt.ylabel("duration_min")
    plt.title("Duration by Wind Bin (FF > 10)")
    plt.tight_layout()
    plt.show()
else:
    print("FF column not found; skipping wind bin analysis.")


## Predictive Model (High Delay Day)


In [None]:
# Logistic regression that flags "high delay" days from weather and calendar
# features. Only the imports sit inside the try block, so an ImportError
# raised by anything else is not mistaken for a missing scikit-learn.
try:
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import classification_report, roc_auc_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
except ImportError as exc:
    print("scikit-learn is not available. Install it to run the model.")
    print(exc)
else:
    model_df = daily_summary.dropna(subset=["date", "delay_count"]).sort_values("date").copy()
    model_df["weekday"] = model_df["date"].dt.weekday
    model_df["month"] = model_df["date"].dt.month

    # Chronological split: the first 80% of days train, the rest evaluate.
    split_idx = int(len(model_df) * 0.8)

    # The "high delay" cut-off is the 90th percentile of the TRAINING days
    # only, so the label definition does not use the evaluation period.
    high_delay_threshold = model_df["delay_count"].iloc[:split_idx].quantile(0.90)
    target = model_df["delay_count"] >= high_delay_threshold
    model_df["high_delay"] = target.astype(int)

    feature_candidates = ["RR", "TL", "P", "FF", "SO", "S0", "RF", "weekday", "month"]
    feature_cols = [c for c in feature_candidates if c in model_df.columns]
    if not feature_cols:
        raise ValueError("No feature columns available for modeling.")

    X = model_df[feature_cols].copy()
    # Impute gaps with medians computed on the training rows only, for the
    # same no-leakage reason as the threshold above.
    train_medians = X.iloc[:split_idx].median(numeric_only=True)
    X = X.fillna(train_medians)
    y = model_df["high_delay"]

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    if y_train.nunique() < 2:
        # LogisticRegression cannot be fitted on a single class (or on an
        # empty training split); report it instead of failing the cell.
        print("Training split does not contain both classes; skipping model fit.")
    else:
        model = Pipeline([
            ("scale", StandardScaler()),
            ("clf", LogisticRegression(max_iter=1000)),
        ])
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        print(classification_report(y_test, y_pred, digits=3))
        try:
            print("ROC AUC:", roc_auc_score(y_test, y_proba))
        except ValueError as exc:
            # Raised e.g. when the evaluation split holds a single class.
            print("ROC AUC not available:", exc)


## Text Analytics (Keywords and Weather Context)


In [None]:
import json

from pymongo import MongoClient

# MongoDB connection settings; each falls back to the default shown when the
# corresponding environment variable is unset.
MONGO_URI = os.environ.get("MONGO_URI", "mongodb://mongodb:27017")
DB_NAME = os.environ.get("MONGO_DB", "big_data_austria")
WL_COLLECTION = os.environ.get("WL_HIST_COLLECTION", "wienerlinien_historical")
HW_COLLECTION = os.environ.get("HW_COLLECTION", "Station_HW_25")

def _parse_data(value):
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return {}
    return {}

def _normalize_wl(df):
    """Expand the ``data`` column of *df* into flat top-level columns.

    Every ``data`` value is passed through ``_parse_data`` and flattened with
    ``pd.json_normalize`` (nested keys joined by ``_``). The result has the
    ``data`` column removed and the flattened columns appended. A frame
    without a ``data`` column is returned unchanged.
    """
    if "data" not in df.columns:
        return df
    records = df["data"].apply(_parse_data).tolist()
    data_df = pd.json_normalize(records, sep="_")
    # json_normalize numbers the rows of a list input 0..n-1. Reuse the
    # caller's index so the column-wise concat pairs rows one-to-one even
    # when `df` does not carry a default RangeIndex.
    data_df.index = df.index
    return pd.concat([df.drop(columns=["data"]), data_df], axis=1)

def _filter_stoerunglang(df):
    if "PartitionKey" in df.columns:
        df = df[~df["PartitionKey"].fillna("").str.lower().eq("stoerunglang")]
    if "category" in df.columns:
        df = df[~df["category"].fillna("").str.lower().eq("stoerunglang")]
    return df

# Load both collections inside a context manager so the client is closed as
# soon as the documents are in memory.
with MongoClient(MONGO_URI) as client:
    db = client[DB_NAME]
    wl_docs = list(db[WL_COLLECTION].find({}, {"_id": 0}))
    hw_docs = list(db[HW_COLLECTION].find({}, {"_id": 0}))

wl_df = pd.DataFrame(wl_docs)
hw_df = pd.DataFrame(hw_docs)

wl_df = _normalize_wl(wl_df)
# The filter returns a row subset; copy it so adding the timestamp column
# below does not raise pandas' SettingWithCopyWarning.
wl_df = _filter_stoerunglang(wl_df).copy()

if "time_start" not in wl_df.columns:
    raise ValueError("Expected 'time_start' in Wiener Linien data for text analytics.")

# Unparseable timestamps become NaT and are dropped; merge_asof needs both
# sides sorted by the join key.
wl_df["timestamp"] = pd.to_datetime(wl_df["time_start"], errors="coerce", utc=True)
wl_df = wl_df.dropna(subset=["timestamp"]).sort_values("timestamp")

if "timestamp" not in hw_df.columns:
    raise ValueError("HW 25 data has no 'timestamp' column.")

hw_df["timestamp"] = pd.to_datetime(hw_df["timestamp"], errors="coerce", utc=True)
hw_df = hw_df.dropna(subset=["timestamp"]).sort_values("timestamp")

# Attach to every event the most recent weather row at or before its start.
wl_weather = pd.merge_asof(
    wl_df,
    hw_df,
    on="timestamp",
    direction="backward",
)

# Use the first text-bearing column that exists, in order of preference.
text_col = next(
    (
        name
        for name in ("description", "data_description", "title", "data_title")
        if name in wl_weather.columns
    ),
    None,
)

if text_col is None:
    raise ValueError("No text column found for keyword analysis.")

wl_weather["text"] = wl_weather[text_col].fillna("").str.lower()

# NOTE(review): keywords are spelled without umlauts; if the source text
# writes e.g. "störung", the "stoerung" keyword will not match - confirm.
keywords = [
    "bauarbeiten",
    "unfall",
    "stoerung",
    "signal",
    "weiche",
    "fahrzeug",
    "polizei",
    "defekt",
]

for kw in keywords:
    # na=False keeps the flag columns strictly boolean even when an entry of
    # the text column is not a string.
    wl_weather[f"kw_{kw}"] = wl_weather["text"].str.contains(kw, regex=False, na=False)

if "RR" in wl_weather.columns:
    wl_weather["rainy"] = wl_weather["RR"] > 0
    keyword_cols = [f"kw_{kw}" for kw in keywords]
    # Events without a matched RR value (NaN) would otherwise count as dry.
    rr_known = wl_weather[wl_weather["RR"].notna()]
    keyword_rates = (
        rr_known.groupby("rainy")[keyword_cols].mean()
        # Guarantee both groups exist so sorting by "rainy" cannot fail when
        # the data holds only dry (or only rainy) events.
        .reindex([False, True])
        .T
        .rename(columns={True: "rainy", False: "dry"})
    )
    # Print explicitly: a bare expression inside an if-block is not shown as
    # notebook cell output.
    print(keyword_rates.sort_values(by="rainy", ascending=False).head(10))
else:
    print("RR column not available for rainy vs dry comparison.")
