In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, fbeta_score, precision_score, recall_score
import joblib

In [2]:
# Load dataset
data = pd.read_csv("../dataset/trains_refined.csv")

# Feature extraction
X = data[['train_ID', 'departure_time', 'arrival_time', 'train_type']]
y = data['delay']

In [3]:
# Split datastes in trining and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Random forest with 50 trees
random_forest_model = RandomForestClassifier(n_estimators=50)
random_forest_model.fit(X_train, y_train)

# Model predictions
y_pred = random_forest_model.predict(X_test)

In [5]:
# Results
print("--------------------- Random forest ---------------------\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F2 score:", fbeta_score(y_test, y_pred, beta=2, average="micro"), "micro", fbeta_score(y_test, y_pred, beta=2, average="macro"), "macro", fbeta_score(y_test, y_pred, beta=2, average="weighted"), "weighted")
print("ROC AUC score:", roc_auc_score(y_test, y_pred))
print("\n---------------------------------------------------------")

--------------------- Random forest ---------------------

Accuracy: 0.926213304339231
Precision: 0.9445171421093954
Recall: 0.8710027754313986
F2 score: 0.9262133043392309 micro 0.9194911856355366 macro 0.9257706533373032 weighted
ROC AUC score: 0.9177563471197339

---------------------------------------------------------


In [6]:
cross_validate(random_forest_model, X, y, cv=10, scoring=['f1', 'roc_auc'])

{'fit_time': array([7.15498781, 7.0705092 , 7.10279179, 7.2271359 , 7.25710344,
        7.3177886 , 7.46990728, 7.12944627, 7.11811709, 7.06035233]),
 'score_time': array([0.19364977, 0.19104266, 0.19787264, 0.20814705, 0.20047569,
        0.1946044 , 0.20747805, 0.20225525, 0.20365357, 0.20084119]),
 'test_f1': array([0.90962817, 0.92155388, 0.90918126, 0.9067954 , 0.92268886,
        0.90456749, 0.93090909, 0.90782828, 0.88567149, 0.88886204]),
 'test_roc_auc': array([0.97602919, 0.98322473, 0.9792014 , 0.97384672, 0.97650491,
        0.95752732, 0.98698317, 0.97642937, 0.97007243, 0.96770631])}

In [7]:
# Save model
joblib.dump(random_forest_model, 'train_delay_clf_v2.pkl')

['train_delay_clf_v2.pkl']