In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, make_scorer

# Read the cleaned dataset into a DataFrame. The path is relative to the
# current working directory; the file is comma-separated, which is the
# delimiter pandas uses by default.
df = pd.read_csv("cleaned_dataset.csv")

Select relevant features for training 

In [None]:
# Predictor columns, written out in three groups. The groups are concatenated
# in this exact order, which is therefore the column order of X.
# NOTE(review): confirm that none of these columns is computed from the
# arrival-delay target (e.g. the "> 15min" counts or "pct_delayed"), otherwise
# the models would be trained on leaked information.

# Train counts and departure-delay measurements
count_and_delay_cols = [
    "Number of scheduled trains",
    "Number of cancelled trains",
    "Number of trains delayed at departure",
    "Average delay of late trains at departure",
    "Average delay of all trains at departure",
    "Number of trains delayed > 15min",
    "Number of trains delayed > 30min",
    "Number of trains delayed > 60min",
]

# Share of delay attributed to each cause
delay_cause_cols = [
    "Pct delay due to external causes",
    "Pct delay due to infrastructure",
    "Pct delay due to traffic management",
    "Pct delay due to rolling stock",
    "Pct delay due to station management and equipment reuse",
    "Pct delay due to passenger handling (crowding, disabled persons, connections)",
]

# Overall percentage columns
overall_pct_cols = [
    "pct_delayed",
    "pct_cancelled",
]

features = count_and_delay_cols + delay_cause_cols + overall_pct_cols

# Feature matrix X (one column per entry of `features`) and regression target y
X = df.loc[:, features]
y = df.loc[:, 'Number of trains delayed at arrival']

Train a simple machine learning model using scikit-learn (Linear Regression)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training rows. fit() returns the estimator
# itself, so construction and fitting are chained into one statement.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

Evaluate performance using metrics like RMSE, R², or accuracy

In [None]:
# Fitted parameters of the linear model (one coefficient per feature column,
# in the order of `features`, plus the intercept).
print("Coefficients :", model.coef_)
print("Intercept :", model.intercept_)

# Error metrics of the linear model's predictions on the held-out test rows
lr_rmse = root_mean_squared_error(y_test, y_pred)
lr_r2 = r2_score(y_test, y_pred)
print("RMSE :", lr_rmse)
print("R² :", lr_r2)

Compare different models and justify the selection of the best one (Random Forest prediction)

In [None]:

# Rebuild the feature matrix and target, then redo the 80/20 split with
# seed 42 so this cell only needs `df` and `features` to run.
X, y = df[features], df['Number of trains delayed at arrival']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest with 200 unrestricted-depth trees, trained on all available cores
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Predictions for the held-out rows (this replaces any earlier `y_pred`)
y_pred = rf_model.predict(X_test)

# Held-out error metrics for the random forest
rf_rmse = root_mean_squared_error(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)
print("RMSE :", rf_rmse)
print("R² :", rf_r2)

# Feature importances, labelled by column name and ranked from most to least important
importances_by_column = pd.Series(data=rf_model.feature_importances_, index=features)
impact = importances_by_column.sort_values(ascending=False)
print("\nColumns impact:\n", impact)


Tune hyperparameters to improve model performance


In [None]:
# Base estimator for the search.
# n_jobs is deliberately left at its default (a single job) here: GridSearchCV
# below already runs candidates/folds in parallel with n_jobs=-1, and asking
# for all cores at both levels can oversubscribe the CPU. With a fixed
# random_state the fitted forests should not depend on n_jobs.
rf = RandomForestRegressor(random_state=42)

# Values that GridSearchCV should test.
# 3 * 3 * 3 * 3 * 2 = 162 candidates, each fitted on 5 folds (810 fits),
# plus one final refit of the best candidate on the whole training set.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt']
}

# Metric used to rank candidates: R² computed on each validation fold
scorer = make_scorer(r2_score)

# Configure and launch the search (5-fold cross-validation on the training set only)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=0
)

grid_search.fit(X_train, y_train)

# Search results. best_score_ is the mean cross-validated R² of the best
# candidate, so it is not directly comparable with test-set scores.
print("Best parameters :", grid_search.best_params_)
print("Best R² :", grid_search.best_score_)

# Score the tuned model on the held-out test set, using the same metrics as
# the untuned models so the effect of tuning can actually be compared.
# best_estimator_ is the best candidate refit on the full training set
# (GridSearchCV's default refit=True).
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)
print("Test RMSE (tuned) :", root_mean_squared_error(y_test, y_pred_tuned))
print("Test R² (tuned) :", r2_score(y_test, y_pred_tuned))