Building a prediction model is the third step of the Tardis Project. The goal is to use the values in our cleaned dataset to predict future delays.

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import make_scorer, mean_squared_error, root_mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer

In [41]:
def hms_to_minutes(hms):
    """Convert a dot-separated ``hours.minutes.seconds`` value to minutes.

    Parameters
    ----------
    hms : object
        Value to convert. It is rendered with ``str()`` and split on ``'.'``,
        so it must consist of three integer fields, e.g. ``"1.30.30"``.

    Returns
    -------
    float or None
        ``hours * 60 + minutes + seconds / 60``. ``None`` is returned when the
        value is missing (``pd.isna``), does not have exactly three fields, or
        contains a field that is not an integer.
    """
    if pd.isna(hms):
        return None

    parts = str(hms).split('.')
    if len(parts) != 3:
        # NOTE(review): a plain decimal such as "12.5" has only two fields and
        # therefore also ends up here - confirm the CSV really stores h.m.s.
        return None

    try:
        hours, minutes, seconds = (int(part) for part in parts)
    except ValueError:
        # A non-integer field is treated like any other malformed value
        # instead of aborting the whole ``Series.apply`` call.
        return None

    return hours * 60 + minutes + seconds / 60

First, we load our cleaned dataset with pandas and convert the columns listed in `columns_to_convert` to minutes.

In [42]:
# Load the cleaned CSV; bad lines are skipped with a warning instead of
# aborting the whole read.
df = pd.read_csv('cleaned_dataset.csv', delimiter=',', on_bad_lines='warn')

# Columns that are passed through hms_to_minutes right after loading.
columns_to_convert = [
    "Average journey time",
    "Average delay of late trains at departure",
    "Average delay of all trains at departure",
    "Average delay of late trains at arrival",
    "Average delay of all trains at arrival",
    "Average delay of trains > 15min (if competing with flights)",
    "Pct delay due to external causes",
    "Pct delay due to infrastructure",
    "Pct delay due to traffic management",
    "Pct delay due to rolling stock",
    "Pct delay due to station management and equipment reuse",
    "Pct delay due to passenger handling (crowding, disabled persons, connections)",
]

# Only convert the columns that are actually present in this CSV.
present_columns = [name for name in columns_to_convert if name in df.columns]
for col in present_columns:
    df[col] = df[col].apply(hms_to_minutes)

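Then we decide which columns to use as features (`X`) and which to predict as targets (`y`), and split the data into training and test sets.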

In [43]:
# Features: route identifiers, journey time and the train counts.
# .copy() makes X an independent frame so the imputation below can assign
# columns without pandas warning about writing into a slice of df.
X = df[
    ['Date',
     'Service',
     'Departure station',
     'Arrival station',
     'Average journey time',
     'Number of scheduled trains',
     'Number of cancelled trains',
     'Number of trains delayed at departure',
     'Number of trains delayed at arrival',
     'Number of trains delayed > 15min']
].copy()

# Targets: the five average-delay columns, predicted jointly.
y = df[
    ['Average delay of late trains at departure',
     'Average delay of all trains at departure',
     'Average delay of late trains at arrival',
     'Average delay of all trains at arrival',
     'Average delay of trains > 15min (if competing with flights)',
     ]
]

# Candidate extra targets, maybe add these later:
#   'Pct delay due to external causes'
#   'Pct delay due to infrastructure'
#   'Pct delay due to traffic management'
#   'Pct delay due to rolling stock'
#   'Pct delay due to station management and equipment reuse'
#   'Pct delay due to passenger handling (crowding, disabled persons, connections)'

# Impute missing feature values: median for numeric columns, most frequent
# value otherwise. fillna returns a new Series, so the result must be
# assigned back - calling it on its own changes nothing.
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        X[col] = X[col].fillna(X[col].median())
    else:
        most_frequent = X[col].mode()
        # mode() is empty when every value in the column is missing.
        if not most_frequent.empty:
            X[col] = X[col].fillna(most_frequent.iloc[0])

# Missing targets are replaced by 0.
# NOTE(review): this treats "unknown delay" as "no delay"; dropping those rows
# may be the better choice - confirm with the data owner.
y = y.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
def build_pipeline(numeric_features, categorical_features, estimator=None):
    """Build the preprocessing + multi-output regression pipeline.

    Parameters
    ----------
    numeric_features : list of str
        Columns that are mean-imputed and then standardised.
    categorical_features : list of str
        Columns that are one-hot encoded. Categories that were not seen
        during ``fit`` are ignored (``handle_unknown='ignore'``).
    estimator : scikit-learn regressor, optional
        Single-target regressor that ``MultiOutputRegressor`` fits once per
        target column. Defaults to ``RandomForestRegressor(random_state=42)``,
        so existing two-argument calls behave as before.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and
        ``'regressor'``. Hyper-parameters of the wrapped estimator are
        addressed as ``regressor__estimator__<param>``.
    """
    if estimator is None:
        # Built here rather than in the signature so every pipeline gets its
        # own estimator instance.
        estimator = RandomForestRegressor(random_state=42)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Each transformer only sees the columns named in its feature list.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', MultiOutputRegressor(estimator))
    ])

    return pipe

In [45]:
def multioutput_rmse(y_true, y_pred):
    """Return the square root of the MSE averaged uniformly over all targets."""
    averaged_mse = mean_squared_error(y_true, y_pred, multioutput='uniform_average')
    return np.sqrt(averaged_mse)


# Feature groups handed to the column transformer inside the pipeline.
numeric_features = [
    'Average journey time',
    'Number of scheduled trains',
    'Number of cancelled trains',
    'Number of trains delayed at departure',
    'Number of trains delayed at arrival',
    'Number of trains delayed > 15min',
]
categorical_features = ["Date", "Service", "Departure station", "Arrival station"]

pipeline = build_pipeline(numeric_features, categorical_features)

# greater_is_better=False makes the scorer return the negated RMSE, so the
# grid search (which maximises the score) picks the lowest error.
rmse_scorer = make_scorer(multioutput_rmse, greater_is_better=False)

# Hyper-parameters of the estimator wrapped by the 'regressor' step.
param_grid = {
    'regressor__estimator__n_estimators': [50, 100],
    'regressor__estimator__max_depth': [5, 10, None],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred = grid_search.predict(X_test)

Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3/dist-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3/dist-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_11009/2579112059.py", line 13, in multioutput_rmse
  File "/usr/lib/python3/dist-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  Fi

KeyboardInterrupt: 

In [37]:
# Report the test-set RMSE separately for every target column.
for i, col in enumerate(y_test.columns):
    actual = y_test.iloc[:, i]
    predicted = y_pred[:, i]
    col_rmse = np.sqrt(mean_squared_error(actual, predicted))
    print(f"Test RMSE for {col}: {col_rmse:.2f}")

# Hyper-parameter combination selected by the grid search.
print("Best parameters found:", grid_search.best_params_)

Test RMSE for Average delay of late trains at departure: 26.08
Test RMSE for Average delay of all trains at departure: 0.00
Test RMSE for Average delay of late trains at arrival: 70.20
Test RMSE for Average delay of all trains at arrival: 13.07
Test RMSE for Average delay of trains > 15min (if competing with flights): 72.38
Best parameters found: {'regressor__estimator__max_depth': 5, 'regressor__estimator__n_estimators': 100}


**TODO**

- add multi output
- automate tests for different models
- add visualisation