In [14]:
# --- 1. Import Libraries ---


import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier # <-- Import XGBoost
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, classification_report
import pandas as pd

# --- 2. Load Preprocessed Data ---
print("Loading preprocessed data...")

# Read the saved train/test CSVs, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_processed.csv", "../data/test_processed.csv")
)

# Separate each frame into its feature matrix (X) and the "at_risk" target (y).
X_train, y_train = train_df.drop(columns=["at_risk"]), train_df["at_risk"]
X_test, y_test = test_df.drop(columns=["at_risk"]), test_df["at_risk"]

print("Data loaded and split successfully!")




# --- 3. Prepare Data for BOTH Methods ---

# Method 1: Create SMOTE'd data.
# Only the training split is resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


# Method 2: Calculate scale_pos_weight for imbalanced data
# scale_pos_weight = count(negative_class) / count(positive_class)
counts = y_train.value_counts()

# Count each class by explicit comparison rather than counts[0] / counts[1]:
# an integer key on a value_counts() result is only a label lookup when the
# index is integer-typed, and a missing class would surface as a bare KeyError.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_negative == 0 or n_positive == 0:
    raise ValueError(
        "y_train must contain both classes 0 and 1 to compute scale_pos_weight; "
        f"got class counts {counts.to_dict()}"
    )

scale_pos_weight = n_negative / n_positive
print(f"Calculated scale_pos_weight: {scale_pos_weight:.2f}")

# A ratio of ~1.0 means the training set is already balanced (presumably it was
# resampled during preprocessing -- TODO confirm). In that case SMOTE has
# nothing to add and scale_pos_weight is effectively the XGBoost default, so
# the two XGBoost runs below are trained on (almost) the same problem.
if abs(scale_pos_weight - 1.0) < 0.05:
    print(
        "WARNING: training data is already (nearly) balanced; SMOTE and "
        "scale_pos_weight will have little or no effect, so the SMOTE vs "
        "scale_pos_weight comparison is not meaningful on this data."
    )


# --- 4. Train and Evaluate Models ---

# Model 1: Your Best Random Forest (for comparison)
# (Assuming 'best_rf_model' is your tuned RF from GridSearchCV)
# best_rf_model.fit(X_train_smote, y_train_smote)
# y_pred_rf = best_rf_model.predict(X_test)
# print("\n--- Champion RF Model (SMOTE + Tuned) ---")
# print(classification_report(y_test, y_pred_rf, target_names=["Passed", "At-Risk"]))


# Model 2: XGBoost fitted on the SMOTE-resampled training data, mirroring the
# (commented-out) Random Forest baseline that is fitted on the same data.
print("\n--- Training XGBoost with SMOTE data... ---")
xgb_smote = XGBClassifier(random_state=42, n_jobs=-1).fit(
    X_train_smote, y_train_smote
)

# Predictions are made on the original test split.
y_pred_xgb_smote = xgb_smote.predict(X_test)

print("\n--- XGBoost Model (SMOTE) ---")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_xgb_smote,
        target_names=["Passed", "At-Risk"],
    )
)


# Model 3: XGBoost with class re-weighting instead of resampling.
# scale_pos_weight is passed to the classifier, which is then fitted on the
# original (non-SMOTE) training split.
print("\n--- Training XGBoost with scale_pos_weight on original data... ---")
xgb_scaled = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions are made on the same test split used for the SMOTE model.
y_pred_xgb_scaled = xgb_scaled.predict(X_test)

print("\n--- XGBoost Model (scale_pos_weight) ---")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_xgb_scaled,
        target_names=["Passed", "At-Risk"],
    )
)

Loading preprocessed data...
Data loaded and split successfully!
Calculated scale_pos_weight: 1.00

--- Training XGBoost with SMOTE data... ---

--- XGBoost Model (SMOTE) ---
              precision    recall  f1-score   support

      Passed       0.88      0.88      0.88       165
     At-Risk       0.33      0.33      0.33        30

    accuracy                           0.79       195
   macro avg       0.61      0.61      0.61       195
weighted avg       0.79      0.79      0.79       195


--- Training XGBoost with scale_pos_weight on original data... ---

--- XGBoost Model (scale_pos_weight) ---
              precision    recall  f1-score   support

      Passed       0.88      0.88      0.88       165
     At-Risk       0.33      0.33      0.33        30

    accuracy                           0.79       195
   macro avg       0.61      0.61      0.61       195
weighted avg       0.79      0.79      0.79       195

