In [4]:


# --- 1. Setup ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt

# --- 2. Load Data ---
df = pd.read_csv("/Users/amelijaancupane/Desktop/TU_DELFT/Year 2/Advanced Data Analysis/NS_Project/Group2_NS-1/data/data_NS_filtered.csv", sep=";")

# 2. Data Cleaning
# Convert delay from string (HH:MM:SS) to numeric minutes
df["delay"] = pd.to_timedelta(df["delay"]).dt.total_seconds() / 60

# Drop rows where target is missing
df = df.dropna(subset=["REALISATIE"])

# Fill categorical NaNs with "Unknown"
for col in ["DAGDEELTREIN", "TREINSERIEBASIS"]:
    df[col] = df[col].fillna("Unknown")

# Flag missing delay (NaN for cancelled trains)
df["delay_missing"] = df["delay"].isna().astype(int)

# Fill delay NaN with 0 so it's numeric
df["delay"] = df["delay"].fillna(0)

# Fill in missing values for PROGNOSE_REIZEN (NaN if trains are extra)
df["prognose_missing"] = df["PROGNOSE_REIZEN"].isna().astype(int)
df["PROGNOSE_REIZEN"] = df["PROGNOSE_REIZEN"].fillna(0)



In [None]:
# Target
y = df["REALISATIE"]


# Features to use (excluding AFWIJKING, BEWEGINGNUMMER)
features = ["WEEK_DAG_NR", "TRAJECT", "DAGDEELTREIN", "TREINSERIEBASIS",
            "PROGNOSE_REIZEN", "station1", "station2", 
            "Cancelled", "ExtraTrain", "delay", "delay_missing", "prognose_missing"]

X = df[features]

# Convert categoricals with one-hot encoding
categorical_cols = ["TRAJECT", "DAGDEELTREIN", "TREINSERIEBASIS", "station1", "station2"]
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Chronological split by DAGNR
cutoff_day = df["DAGNR"].max() - 7
X_train = X[df["DAGNR"] <= cutoff_day]
y_train = y[df["DAGNR"] <= cutoff_day]

X_test = X[df["DAGNR"] > cutoff_day]
y_test = y[df["DAGNR"] > cutoff_day]

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (75153, 133) Test: (23459, 133)


In [7]:

# --- 6. Model: Gradient Boosting ---
gbr = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", gbr)
])

# --- 7. Train ---
pipeline.fit(X_train, y_train)

# --- 8. Evaluate ---
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Gradient Boosted Trees Results on Test Set:")
print("MAE:", round(mae, 2))
print("RMSE:", round(rmse, 2))
print("R²:", round(r2, 3))

# Operator baseline
op_mae = mean_absolute_error(y_test, X_test["PROGNOSE_REIZEN"])
op_rmse = np.sqrt(mean_squared_error(y_test, X_test["PROGNOSE_REIZEN"]))
op_r2 = r2_score(y_test, X_test["PROGNOSE_REIZEN"])

print("\nOperator Forecast (PROGNOSE_REIZEN) Results:")
print("MAE:", round(op_mae, 2))
print("RMSE:", round(op_rmse, 2))
print("R²:", round(op_r2, 3))

# --- 9. Feature Importance ---
importances = pipeline.named_steps["model"].feature_importances_
feature_names = (
    numeric_features 
    + list(pipeline.named_steps["preprocessor"].transformers_[1][1].get_feature_names_out(categorical_features))
)

fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
fi.head(15).plot(kind="barh", figsize=(8,6))
plt.title("Top Feature Importances - Gradient Boosted Trees")
plt.show()

ValueError: A given column is not a column of the dataframe