In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, fbeta_score, precision_score, recall_score, confusion_matrix
import joblib

In [10]:
# Load dataset
data = pd.read_csv("../dataset/trains_refined.csv")

# Feature extraction
X = data[['train_ID', 'departure_time', 'arrival_time', 'train_type']]
y = data['delay']

In [11]:
# Split datastes in trining and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Random forest with 50 trees
random_forest_model = RandomForestClassifier(n_estimators=50)
random_forest_model.fit(X_train, y_train)

# Model predictions
y_pred = random_forest_model.predict(X_test)

In [13]:
# Results
print("--------------------- Random forest ---------------------\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F2 score:", fbeta_score(y_test, y_pred, beta=2, average="micro"), "micro", fbeta_score(y_test, y_pred, beta=2, average="macro"), "macro", fbeta_score(y_test, y_pred, beta=2, average="weighted"), "weighted")
print("ROC AUC score:", roc_auc_score(y_test, y_pred))
print("\n---------------------------------------------------------")

--------------------- Random forest ---------------------

Accuracy: 0.9242858554907581
Precision: 0.9441157133464826
Recall: 0.8664172800772294
F2 score: 0.9242858554907581 micro 0.9172281545019856 macro 0.9237962956784775 weighted
ROC AUC score: 0.9154217479318096

---------------------------------------------------------


In [14]:
# Confusion matrx
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

[[11522   425]
 [ 1107  7180]]


In [15]:
cross_validate(random_forest_model, X, y, cv=10, scoring=['f1', 'roc_auc'])

{'fit_time': array([7.30623531, 6.97741127, 7.02354503, 6.92211175, 7.18421555,
        6.89667821, 7.05396128, 7.04911041, 6.78463674, 6.82973433]),
 'score_time': array([0.1942842 , 0.20638943, 0.20322347, 0.22738576, 0.20645547,
        0.20028424, 0.2013073 , 0.19528723, 0.20455098, 0.19861484]),
 'test_f1': array([0.91090327, 0.92176743, 0.90633916, 0.90794201, 0.92179617,
        0.90410959, 0.93037816, 0.9077584 , 0.88522954, 0.88770828]),
 'test_roc_auc': array([0.97606461, 0.9825439 , 0.97957948, 0.97416094, 0.97731485,
        0.95835998, 0.98672215, 0.97586463, 0.96866326, 0.96632491])}

In [16]:
# Save model
joblib.dump(random_forest_model, 'train_delay_clf_v2.pkl')

['train_delay_clf_v2.pkl']