# 06 – Final XGB & Hybrid Ensemble

In [None]:
pip install xgboost scikit-learn tqdm matplotlib torch

In [None]:
import gc
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import torch
import xgboost

from xgboost import XGBClassifier, plot_importance
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
)
from scipy.sparse import issparse, csr_matrix, vstack

%matplotlib inline

In [None]:
# Record the installed XGBoost version in the notebook output.
# NOTE(review): later cells pass `predictor`, `use_label_encoder` and a fit-time
# `early_stopping_rounds`; support for these options differs between XGBoost
# releases, so check this version first if one of those cells fails.
print(f"XGBoost version: {xgboost.__version__}")

In [None]:
# Paths
# Directory holding the pickled inputs and receiving the final model.
# The REVIEW_SCORE_OUTPUT_DIR environment variable, when set, overrides the
# original hard-coded location so the notebook can run on another machine.
BASE_DIR = os.environ.get(
    "REVIEW_SCORE_OUTPUT_DIR",
    "C:/Users/indur/OneDrive - University of Westminster/GitHub/FYP_Project/Models/Review_Score/OutPuts",
)

# Input artefacts read by the "Load Data" cell. These previously interpolated an
# undefined name `BASE`, which raised NameError on a fresh kernel.
XY_P = f"{BASE_DIR}/Xy_data.pkl"
BEST_XGB_P = f"{BASE_DIR}/best_xgb_params.pkl"
TEST_PROB_P = f"{BASE_DIR}/test_probs.pkl"

# Output artefact written after the final model is trained.
MODEL_P = os.path.join(BASE_DIR, "xgb_hybrid_final.pkl")

os.makedirs(BASE_DIR, exist_ok=True)

In [None]:
# Load Data
# NOTE(review): joblib.load unpickles arbitrary objects, so these paths must
# only ever point at files produced by trusted code.
X_train, y_train, X_val, y_val, X_test, y_test = joblib.load(XY_P)
best_params = joblib.load(BEST_XGB_P)
test_probs = joblib.load(TEST_PROB_P)

# Normalise every split to CSR on its own. The previous check looked only at
# X_train, so a dense (or non-CSR sparse) X_val / X_test could slip through,
# and the batched row slicing of X_test in the ensemble cell needs a
# row-sliceable sparse format. csr_matrix() on input that is already CSR
# should not copy the underlying data (copy defaults to False).
X_train, X_val, X_test = (
    csr_matrix(split) for split in (X_train, X_val, X_test)
)

# Train + validation stacked row-wise; used to fit the final model.
X_full = vstack([X_train, X_val])
y_full = np.concatenate([y_train, y_val])

print(f"Training samples: {X_full.shape[0]}, Features: {X_full.shape[1]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"X_train is {'sparse' if issparse(X_train) else 'dense'}")

gc.collect()

In [None]:
# Check for GPU
# CUDA visibility (as reported by torch) selects the tree-construction and
# prediction backends handed to XGBClassifier in the training cells.
gpu_available = torch.cuda.is_available()
if gpu_available:
    tree_method, predictor = "gpu_hist", "gpu_predictor"
else:
    tree_method, predictor = "hist", "cpu_predictor"
print(f"Using {'GPU' if gpu_available else 'CPU'}, tree_method={tree_method}, predictor={predictor}")

In [None]:
# Early Stopping Training
# Fit on the training split only and monitor the validation split, so the
# boosting round with the best validation score can be read back afterwards.

# Settings fixed for this probe run; they are passed alongside the tuned
# hyper-parameters loaded into `best_params`.
probe_settings = dict(
    objective="multi:softprob",
    num_class=len(np.unique(y_train)),
    use_label_encoder=False,
    eval_metric=["mlogloss", "merror"],  # both curves are plotted in the next cell
    tree_method=tree_method,
    predictor=predictor,
    seed=42,
    n_estimators=100,  # upper bound on boosting rounds for the probe
)
early_stopping_model = XGBClassifier(**best_params, **probe_settings)

validation_split = [(X_val, y_val)]
early_stopping_model.fit(
    X_train,
    y_train,
    eval_set=validation_split,
    early_stopping_rounds=10,
    verbose=True,
)

# Keep the per-round validation history and the best round for later cells.
results = early_stopping_model.evals_result_
best_iteration = early_stopping_model.best_iteration
print(f"Best iteration: {best_iteration}")

In [None]:
# Plot Validation Metrics
# One curve per metric recorded on the validation split, with a vertical
# marker at the round selected by early stopping.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key, curve_label in (
    ('mlogloss', 'Validation Log Loss'),
    ('merror', 'Validation Error'),
):
    ax.plot(results['validation_0'][metric_key], label=curve_label)
ax.axvline(x=best_iteration, color='r', linestyle='--', label=f'Best Iteration ({best_iteration})')
ax.set_xlabel('Iteration')
ax.set_ylabel('Metric')
ax.set_title('Validation Metrics')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

# The probe model and its history are no longer needed; release them before
# the final model is trained.
del early_stopping_model, results
gc.collect()

In [None]:
# Train Final Model
# Refit on train + validation with the number of boosting rounds chosen by the
# early-stopping run.
#
# `best_iteration` is a 0-based round index, so the number of trees that
# produced the best validation score is best_iteration + 1. Passing the raw
# index trained one round too few (and zero rounds when the best round was
# the very first one).
final_n_estimators = best_iteration + 1

final_model = XGBClassifier(
    **best_params,
    objective="multi:softprob",
    num_class=len(np.unique(y_full)),
    use_label_encoder=False,
    eval_metric="mlogloss",
    tree_method=tree_method,
    predictor=predictor,
    seed=42,
    n_estimators=final_n_estimators
)

final_model.fit(X_full, y_full, verbose=False)
joblib.dump(final_model, MODEL_P)
print(f"Final model saved to: {MODEL_P}")
print(f"Boosted rounds: {final_model.get_booster().num_boosted_rounds()}")

# The stacked training data is not used after this point.
del X_full, y_full
gc.collect()

In [None]:
# Ensemble Prediction
# Predict in row batches and write each batch into a pre-allocated buffer.
batch_size = 1000
n_test = X_test.shape[0]
# Size the buffer from the classes the model was trained on. Using the labels
# present in y_test breaks whenever a class is absent from the test split,
# because predict_proba always returns one column per trained class.
n_classes = len(final_model.classes_)
xgb_probs = np.zeros((n_test, n_classes))
for i in range(0, n_test, batch_size):
    end = min(i + batch_size, n_test)
    xgb_probs[i:end] = final_model.predict_proba(X_test[i:end])

# Weighted blend: `alpha` weights the pre-computed `test_probs`, the remainder
# goes to XGBoost.
# NOTE(review): test_probs is presumably the per-class test probabilities of
# the other model in the hybrid, with columns in the same class order as
# final_model.classes_ - confirm against the notebook that produced it.
alpha = 0.6
other_probs = np.asarray(test_probs)
if other_probs.shape != xgb_probs.shape:
    # Fail loudly instead of letting NumPy broadcast a mismatched array.
    raise ValueError(
        f"test_probs has shape {other_probs.shape}, expected {xgb_probs.shape}"
    )
final_probs = alpha * other_probs + (1 - alpha) * xgb_probs
y_pred = final_probs.argmax(axis=1)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score (weighted): {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

In [None]:
# ROC AUC of the blended probabilities.
# The models are trained with a multi-class objective, so scoring only column 1
# is valid solely in the two-class case; with more classes roc_auc_score needs
# the full probability matrix and an explicit multi_class strategy.
if final_probs.shape[1] == 2:
    roc_auc = roc_auc_score(y_test, final_probs[:, 1])
else:
    # roc_auc_score requires each row to sum to 1 for multi-class input;
    # renormalise to absorb floating-point drift from the blend.
    row_normalised = final_probs / final_probs.sum(axis=1, keepdims=True)
    roc_auc = roc_auc_score(
        y_test,
        row_normalised,
        multi_class="ovr",
        average="weighted",  # same averaging as the weighted F1 reported above
        labels=final_model.classes_,  # ties probability columns to class labels
    )
print(f"Test ROC AUC: {roc_auc:.4f}")

In [None]:
# Confusion Matrix
# Rows and columns follow the class order stored on the fitted model.
class_labels = final_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title("Confusion Matrix")
disp.figure_.tight_layout()
plt.show()