In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score, fbeta_score, precision_score, recall_score, confusion_matrix
import joblib

In [2]:
# Load dataset
data = pd.read_csv("../dataset/trains_refined_classification.csv")

# Feature extraction
X = data[['train_ID', 'departure_time', 'arrival_time', 'train_type']]
y = data['delay']

In [3]:
# Split datastes in trining and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Random forest with 50 trees
random_forest_model = RandomForestClassifier(n_estimators=50)
random_forest_model.fit(X_train, y_train)

# Model predictions
y_pred = random_forest_model.predict(X_test)

In [5]:
# Results
print("--------------------- Random forest ---------------------\n")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F2 score:", fbeta_score(y_test, y_pred, beta=2, average="micro"), "micro", fbeta_score(y_test, y_pred, beta=2, average="macro"), "macro", fbeta_score(y_test, y_pred, beta=2, average="weighted"), "weighted")
print("ROC AUC score:", roc_auc_score(y_test, y_pred))
print("\n---------------------------------------------------------")

--------------------- Random forest ---------------------

Accuracy: 0.9255213996243946
Precision: 0.9449986873195064
Recall: 0.8687100277543139
F2 score: 0.9255213996243946 micro 0.9186038191427358 macro 0.9250500826529892 weighted
ROC AUC score: 0.9168192308353893

---------------------------------------------------------


In [6]:
# Confusion matrx
confusion = confusion_matrix(y_test, y_pred)
print(confusion)

[[11528   419]
 [ 1088  7199]]


In [7]:
cross_validate(random_forest_model, X, y, cv=10, scoring=['f1', 'roc_auc'])

{'fit_time': array([6.93908572, 6.90597916, 6.94295144, 6.98451948, 6.9846313 ,
        7.07496667, 7.28123546, 7.04949903, 7.26603961, 7.12750983]),
 'score_time': array([0.19523478, 0.19174576, 0.19562292, 0.19735718, 0.19716215,
        0.19341469, 0.20243382, 0.1992712 , 0.20359635, 0.19560719]),
 'test_f1': array([0.90732547, 0.91954312, 0.90710111, 0.90574484, 0.92150429,
        0.90653728, 0.93106468, 0.90443126, 0.88847352, 0.88969346]),
 'test_roc_auc': array([0.97570834, 0.98274887, 0.97906408, 0.97367173, 0.97657383,
        0.95809535, 0.98642389, 0.97617969, 0.96977215, 0.96742599])}

In [8]:
# Save model
joblib.dump(random_forest_model, 'train_delay_clf_v2.pkl')

['train_delay_clf_v2.pkl']