In [45]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import joblib

In [46]:
# Read the feature matrices for both splits from the processed-data folder.
X_train = pd.read_csv("data/processed/X_train_res.csv")
X_test = pd.read_csv("data/processed/X_test.csv")

# Read each target file, squeeze it (a one-column frame collapses to a Series),
# shift the labels down by one so they become 0, 1, 2, and cast them to int.
y_train = (pd.read_csv("data/processed/y_train_res.csv").squeeze() - 1).astype(int)
y_test = (pd.read_csv("data/processed/y_test.csv").squeeze() - 1).astype(int)

# Quick sanity check of the training split: shapes and per-class counts.
print("Shapes:", X_train.shape, y_train.shape)
print("Training class distribution:\n", y_train.value_counts())

# Select the test columns by the training column names so both feature
# matrices share the same column order.
X_test = X_test[X_train.columns]


Shapes: (3978, 21) (3978,)
Training class distribution:
 NSP
2    1326
0    1326
1    1326
Name: count, dtype: int64


Random Forest

In [47]:
# Show the sorted distinct labels of each split so a class that is missing
# from either one is visible before any metric is computed.
for caption, labels in (("Train classes:", y_train), ("Test classes:", y_test)):
    print(caption, sorted(labels.unique()))

Train classes: [np.int64(0), np.int64(1), np.int64(2)]
Test classes: [np.int64(0), np.int64(1), np.int64(2)]


In [48]:
# Evaluate the random forest on the test split.
#
# `best_rf` is not created by any cell of this notebook, so this cell used to
# work only when the name was still alive in the kernel. Load the persisted
# estimator instead so the cell also runs after Restart Kernel -> Run All.
# NOTE(review): assumes models/random_forest.joblib is the estimator that was
# previously held in `best_rf` -- confirm against the training notebook.
# joblib.load unpickles the file: only load artifacts from a trusted source.
best_rf = joblib.load("models/random_forest.joblib")

# Hard class predictions and per-class probabilities for the test rows.
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision, Recall, F1 (macro: unweighted mean over the classes)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# ROC AUC (multiclass, one-vs-rest)
# y_test is one-hot encoded first; each indicator column is then scored
# against the matching probability column and the per-class AUCs are
# macro-averaged. The binarizer orders its columns by sorted label value.
# NOTE(review): this relies on predict_proba using the same class order -- confirm.
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)
roc_auc = roc_auc_score(y_test_bin, y_prob, average='macro', multi_class='ovr')

# Print results
print("Random Forest Performance Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"ROC AUC:   {roc_auc:.4f}")

Random Forest Performance Metrics:
Accuracy:  0.9437
Precision: 0.8992
Recall:    0.8992
F1-score:  0.8992
ROC AUC:   0.9871


XGBoost

In [49]:
# Evaluate the XGBoost classifier on the test split.
#
# `best_xgb` is not created by any cell of this notebook, so this cell used to
# work only when the name was still alive in the kernel. Load the persisted
# estimator instead so the cell also runs after Restart Kernel -> Run All.
# NOTE(review): assumes models/xgboost.joblib is the estimator that was
# previously held in `best_xgb` -- confirm against the training notebook.
# joblib.load unpickles the file: only load artifacts from a trusted source.
best_xgb = joblib.load("models/xgboost.joblib")

# Hard class predictions and per-class probabilities for the test rows.
y_pred = best_xgb.predict(X_test)
y_prob = best_xgb.predict_proba(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision, Recall, F1 (macro: unweighted mean over the classes)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# ROC AUC (multiclass, one-vs-rest)
# y_test is one-hot encoded first; each indicator column is then scored
# against the matching probability column and the per-class AUCs are
# macro-averaged. The binarizer orders its columns by sorted label value.
# NOTE(review): this relies on predict_proba using the same class order -- confirm.
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)
roc_auc = roc_auc_score(y_test_bin, y_prob, average='macro', multi_class='ovr')

# Print results
print("XGBoost Performance Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"ROC AUC:   {roc_auc:.4f}")

XGBoost Performance Metrics:
Accuracy:  0.9366
Precision: 0.8865
Recall:    0.8823
F1-score:  0.8836
ROC AUC:   0.9856


In [50]:
# Compare the persisted tree models on the test split and save the summary.
# (joblib is already imported in the notebook's import cell.)

# Load the saved models.
# joblib.load unpickles the files: only load artifacts from a trusted source.
rf_model = joblib.load("models/random_forest.joblib")
xgb_model = joblib.load("models/xgboost.joblib")

loaded_models = {
    "Random Forest": rf_model,
    "XGBoost": xgb_model,
}

# Score every loaded model. The loop has to iterate `loaded_models`: it used
# to read `models`, a name this notebook never defines, so it either raised
# NameError on a fresh kernel or silently scored stale in-memory objects
# instead of the artifacts loaded above.
results = {}
for name, model in loaded_models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    results[name] = {"Accuracy": acc, "F1_macro": f1}

print("\nModel Comparison Summary:")
for name, res in results.items():
    print(f"{name}: Accuracy = {res['Accuracy']:.3f}, F1_macro = {res['F1_macro']:.3f}")

# Transpose so the CSV has one row per model and one column per metric.
pd.DataFrame(results).T.to_csv("results/tree_model_comparison.csv")


Model Comparison Summary:
Random Forest: Accuracy = 0.944, F1_macro = 0.899
XGBoost: Accuracy = 0.937, F1_macro = 0.884
