In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, fbeta_score, precision_score, recall_score, confusion_matrix
import joblib

In [2]:
# Load dataset
data = pd.read_csv("../dataset/trains_refined_classification.csv")

# Feature extraction
X = data[['train_ID', 'departure_time', 'arrival_time', 'train_type']]
y = data['delay']

In [3]:
# Split datastes in trining and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Random forest with 50 trees
random_forest_model = RandomForestClassifier(n_estimators=50)
random_forest_model.fit(X_train, y_train)

# Model predictions
y_pred = random_forest_model.predict(X_test)

In [5]:
# Results
print("--------------------- Random forest ---------------------\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 score:", fbeta_score(y_test, y_pred, beta=1))
print("ROC AUC score:", roc_auc_score(y_test, y_pred))
print("\n---------------------------------------------------------")

--------------------- Random forest ---------------------

Accuracy: 0.926213304339231
Precision: 0.9450995807127882
Recall: 0.8703994207795342
F1 score: 0.906212701802877
ROC AUC score: 0.9176639273479994

---------------------------------------------------------


In [6]:
# Confusion matrx
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

[[11528   419]
 [ 1074  7213]]


In [7]:
cross_validate(random_forest_model, X, y, cv=10, scoring=['f1', 'roc_auc'])

{'fit_time': array([10.03253269, 14.55844307, 16.50323009, 13.99523783,  9.87469673,
         9.12974143, 11.86954451,  8.75477767,  8.69973993,  9.09843254]),
 'score_time': array([0.38843155, 0.46927428, 0.45242572, 0.26992774, 0.25316906,
        0.22610116, 0.24831724, 0.23620391, 0.2379353 , 0.23342681]),
 'test_f1': array([0.90935961, 0.92130326, 0.90805311, 0.90683845, 0.92224323,
        0.9051636 , 0.93012772, 0.90872416, 0.88625888, 0.89020556]),
 'test_roc_auc': array([0.97562446, 0.98367385, 0.97918703, 0.97454989, 0.97646157,
        0.95838151, 0.9872735 , 0.97643318, 0.97053726, 0.96595438])}

In [8]:
# Save model
joblib.dump(random_forest_model, 'train_delay_clf_v2.pkl')

['train_delay_clf_v2.pkl']