In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

In [None]:
# Load the pre-split feature matrices and label vectors from data/processed/.
# NOTE(review): the "_res" suffix and the perfectly balanced class counts
# suggest the training split was resampled upstream — confirm against the
# preprocessing notebook.
X_train = pd.read_csv("data/processed/X_train_res.csv")
X_test = pd.read_csv("data/processed/X_test.csv")

# squeeze(axis="columns") always yields a Series for a single-column file;
# a bare .squeeze() would collapse a one-row file to a scalar.
y_train = pd.read_csv("data/processed/y_train_res.csv").squeeze(axis="columns")
y_test = pd.read_csv("data/processed/y_test.csv").squeeze(axis="columns")

# Shift labels from 1,2,3 to 0,1,2 and store them as integers so the
# classifiers and reports see classes 0/1/2 rather than 0.0/1.0/2.0.
y_train = (y_train - 1).astype(int)
y_test = (y_test - 1).astype(int)

# Fail early if the files on disk are out of sync with each other.
assert len(X_train) == len(y_train), "train features and labels differ in length"
assert len(X_test) == len(y_test), "test features and labels differ in length"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print("Training class distribution:\n", y_train.value_counts())

(3978, 21) (3978,) (426, 21) (426,)
NSP
2.0    1326
0.0    1326
1.0    1326
Name: count, dtype: int64


Random Forest

In [None]:
# --- Baseline: default hyper-parameters, fixed seed for reproducibility ---
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Baseline Random Forest performance:")
print(classification_report(y_test, y_pred_rf, digits=3))

# --- Tuning: exhaustive search over 3 x 3 x 3 = 27 combinations ---
rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
}

# GridSearchCV clones `rf` for every fit, so the baseline model fitted above
# is left untouched. Candidates are ranked by macro-averaged F1 across 3 folds.
# NOTE(review): if X_train was oversampled before this point, CV folds may
# contain synthetic neighbours of validation rows and the CV scores will be
# optimistic — confirm against the preprocessing step.
rf_grid = GridSearchCV(
    rf, rf_param_grid,
    cv=3, scoring="f1_macro",
    n_jobs=-1, verbose=1
)
rf_grid.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is retrained on all of X_train.
best_rf = rf_grid.best_estimator_

print("Best Random Forest params:", rf_grid.best_params_)
y_pred_best_rf = best_rf.predict(X_test)
print("Tuned Random Forest performance:")
print(classification_report(y_test, y_pred_best_rf, digits=3))

# Create the output directory first so the dump does not fail on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, "models/random_forest.joblib")

Random Forest Results:
              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       332
         1.0       0.82      0.83      0.82        59
         2.0       0.94      0.91      0.93        35

    accuracy                           0.95       426
   macro avg       0.91      0.90      0.91       426
weighted avg       0.95      0.95      0.95       426

Best RF params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}


XGBoost

In [32]:
# --- Baseline: default hyper-parameters, multi-class log loss as eval metric ---
xgb = XGBClassifier(random_state=42, eval_metric="mlogloss")
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print("Baseline XGBoost performance:")
print(classification_report(y_test, y_pred_xgb, digits=3))

# --- Tuning: the full grid has 3^5 = 243 combinations, so sample it ---
xgb_param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

# 15 randomly drawn candidates x 3 folds = 45 fits, ranked by macro F1.
# random_state pins which candidates are drawn so the search is repeatable.
# NOTE(review): if X_train was oversampled before this point, the CV scores
# used for selection may be optimistic — confirm against the preprocessing step.
xgb_search = RandomizedSearchCV(
    xgb, xgb_param_dist,
    cv=3, n_iter=15,
    scoring="f1_macro",
    n_jobs=-1, verbose=1,
    random_state=42
)
xgb_search.fit(X_train, y_train)
# With the default refit=True, best_estimator_ is retrained on all of X_train.
best_xgb = xgb_search.best_estimator_

print("Best XGBoost params:", xgb_search.best_params_)
y_pred_best_xgb = best_xgb.predict(X_test)
print("Tuned XGBoost performance:")
print(classification_report(y_test, y_pred_best_xgb, digits=3))

# Create the output directory first so the dump does not fail on a fresh checkout.
Path("models").mkdir(parents=True, exist_ok=True)
joblib.dump(best_xgb, "models/xgboost.joblib")

Baseline XGB performance:
              precision    recall  f1-score   support

         0.0      0.967     0.973     0.970       332
         1.0      0.855     0.797     0.825        59
         2.0      0.865     0.914     0.889        35

    accuracy                          0.944       426
   macro avg      0.895     0.895     0.894       426
weighted avg      0.943     0.944     0.943       426

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best XGB params: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Tuned XGB performance:
              precision    recall  f1-score   support

         0.0      0.961     0.970     0.966       332
         1.0      0.833     0.763     0.796        59
         2.0      0.865     0.914     0.889        35

    accuracy                          0.937       426
   macro avg      0.886     0.882     0.884       426
weighted avg      0.936     0.937     0.936       426



['models/best_xgboost.joblib']

In [33]:
# Tuned estimators to compare side by side on the held-out test set.
models = {
    "Random Forest": best_rf,
    "XGBoost": best_xgb
}

# Score every model with the same two metrics; macro F1 weights the three
# classes equally regardless of how many test rows each has.
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")
    results[name] = {"Accuracy": acc, "F1_macro": f1}

print("\nModel Comparison Summary:")
for name, res in results.items():
    print(f"{name}: Accuracy = {res['Accuracy']:.3f}, F1_macro = {res['F1_macro']:.3f}")

# Persist one row per model; create the output directory first so the write
# does not fail on a fresh checkout.
Path("results").mkdir(parents=True, exist_ok=True)
pd.DataFrame(results).T.to_csv("results/tree_model_comparison.csv")


Model Comparison Summary:
Random Forest: Accuracy = 0.944, F1_macro = 0.899
XGBoost: Accuracy = 0.937, F1_macro = 0.884
