Building a prediction model is the third step of the Tardis Project. The goal is to use the values in our cleaned dataset to predict future delays.

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error

In [67]:
def hms_to_minutes(hms):
    """Convert a dot-separated ``hours.minutes.seconds`` value to minutes.

    Parameters
    ----------
    hms : scalar
        Value whose string form is expected to be three integer components
        joined by ``'.'`` (for example ``"1.30.30"``).

    Returns
    -------
    float or None
        ``hours * 60 + minutes + seconds / 60`` when the value parses, or
        ``None`` when the value is missing, does not split into exactly three
        parts, or has a part that is not an integer.
    """
    if pd.isna(hms):
        return None

    parts = str(hms).split('.')
    if len(parts) != 3:
        return None

    try:
        hours, minutes, seconds = map(int, parts)
    except ValueError:
        # A non-integer component (e.g. "a.b.c" or "1..3") is just another
        # malformed value: report it as missing instead of aborting the
        # whole column conversion.
        return None

    return hours * 60 + minutes + seconds / 60

First, we load our cleaned dataset with pandas and convert the duration columns to minutes.

In [68]:
# Columns whose values are passed through hms_to_minutes after loading.
# NOTE(review): the "Pct delay ..." columns are run through the same
# hours.minutes.seconds parser as the duration columns -- confirm they are
# really stored in that format, otherwise they will come back as missing.
columns_to_convert = [
    "Average journey time",
    "Average delay of late trains at departure",
    "Average delay of all trains at departure",
    "Average delay of late trains at arrival",
    "Average delay of all trains at arrival",
    "Average delay of trains > 15min (if competing with flights)",
    "Pct delay due to external causes",
    "Pct delay due to infrastructure",
    "Pct delay due to traffic management",
    "Pct delay due to rolling stock",
    "Pct delay due to station management and equipment reuse",
    "Pct delay due to passenger handling (crowding, disabled persons, connections)",
]

# Malformed CSV lines are reported as warnings rather than stopping the load.
df = pd.read_csv('cleaned_dataset.csv', delimiter=',', on_bad_lines='warn')

# Convert each listed column in place; names absent from the file are skipped.
for col in columns_to_convert:
    if col not in df.columns:
        continue
    df[col] = df[col].apply(hms_to_minutes)

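Then we decide which columns to use as input features (`X`) and which column to predict (`y`), and split the data into training and test sets.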

In [69]:
# Input features: four descriptive columns plus six numeric columns.
X = df[
    ['Date',
     'Service',
     'Departure station',
     'Arrival station',
     'Average journey time',
     'Number of scheduled trains',
     'Number of cancelled trains',
     'Number of trains delayed at departure',
     'Number of trains delayed at arrival',
     'Number of trains delayed > 15min']
]

# Target: a single column for now. A multi-column target was considered and
# is kept here for reference only:
#     'Average delay of late trains at departure',
#     'Average delay of all trains at departure',
#     'Average delay of late trains at arrival',
#     'Average delay of all trains at arrival',
#     'Average delay of trains > 15min (if competing with flights)',
y = df['Average delay of all trains at arrival']

# TODO: maybe add these columns as features later:
#     'Pct delay due to external causes',
#     'Pct delay due to infrastructure',
#     'Pct delay due to traffic management',
#     'Pct delay due to rolling stock',
#     'Pct delay due to station management and equipment reuse',
#     'Pct delay due to passenger handling (crowding, disabled persons, connections)',

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

1. grid
2. pipeline
3. scale + model selection
4. test different models

In [70]:
def build_pipeline(numeric_features, categorical_features):
    """Assemble the preprocessing + random-forest regression pipeline.

    Parameters
    ----------
    numeric_features : list of str
        Columns sent through the ``'num'`` branch (``StandardScaler``).
    categorical_features : list of str
        Columns sent through the ``'cat'`` branch (``OneHotEncoder`` with
        ``handle_unknown='ignore'``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``'preprocessor'`` (a ``ColumnTransformer``)
        followed by ``'regressor'`` (``RandomForestRegressor`` with
        ``random_state=42``).
    """
    # Each branch stays wrapped in its own single-step Pipeline so the nested
    # step names ('scaler', 'onehot') remain addressable.
    column_branches = [
        ('num', Pipeline(steps=[('scaler', StandardScaler())]), numeric_features),
        ('cat',
         Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]),
         categorical_features),
    ]

    return Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_branches)),
        ('regressor', RandomForestRegressor(random_state=42)),
    ])

In [71]:
# Columns routed to the 'num' branch of build_pipeline.
numeric_features = ['Average journey time',
     'Number of scheduled trains',
     'Number of cancelled trains',
     'Number of trains delayed at departure',
     'Number of trains delayed at arrival',
     'Number of trains delayed > 15min']

# Columns routed to the 'cat' branch of build_pipeline.
categorical_features = ["Date", "Service", "Departure station", "Arrival station"]

pipeline = build_pipeline(numeric_features, categorical_features)

# Built-in scorer: negated RMSE, so "greater is better" holds for the search.
# It replaces make_scorer(mean_squared_error, greater_is_better=False,
# squared=False), which relies on the `squared` keyword that newer
# scikit-learn releases deprecated and then removed.
rmse_scorer = 'neg_root_mean_squared_error'

# Keys use the '<step>__<param>' form to reach the pipeline's 'regressor' step.
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [5, 10, None]
}

# 5-fold cross-validated search over param_grid, using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring=rmse_scorer,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Predictions on the held-out rows, made through the fitted search object.
y_pred = grid_search.predict(X_test)


[DEBUG] X_train head:
         Date   Service  Departure station       Arrival station  \
2436  2020-02  National         PARIS LYON                 NIMES   
746   2018-08  National     LYON PART DIEU  MARSEILLE ST CHARLES   
6307  2023-10  National         PARIS LYON                 NIMES   
2095      NaN  National  LA ROCHELLE VILLE    PARIS MONTPARNASSE   
4541  2022-02  National             RENNES    PARIS MONTPARNASSE   

      Average journey time  Number of scheduled trains  \
2436            180.000000                         178   
746             105.000000                         556   
6307           1702.583333                         201   
2095            165.000000                         193   
4541             98.000000                         758   

      Number of cancelled trains  Number of trains delayed at departure  \
2436                           0                                     76   
746                            2                                    3

In [72]:
# Hold-out error: root of the mean squared error between the test targets
# and the grid search's predictions.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f"Test RMSE: {rmse:.2f}")

# Hyper-parameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best parameters found:", best_params)

Test RMSE: 13.07
Best parameters found: {'regressor__max_depth': 5, 'regressor__n_estimators': 100}
