<a href="https://colab.research.google.com/github/leahhkim/final_project_analyticsII/blob/main/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# XGBoost

import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Load the pre-cleaned / pre-encoded train, validation and test splits from the project repo.
DATA_URLS = {
    "train": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_train.csv",
    "val": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_val.csv",
    "test": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_test.csv",
}

# Files are read in train -> val -> test order; low_memory=False makes pandas
# read each file in one pass instead of chunk-by-chunk dtype inference.
train_df, val_df, test_df = (
    pd.read_csv(DATA_URLS[split_name], low_memory=False)
    for split_name in ("train", "val", "test")
)

# Separate each split into a feature frame (X) and an integer label array (y).
# infant_death is the prediction target.
def _split_xy(frame, target="infant_death"):
    """Return (features, labels) for one split.

    features: `frame` without the `target` column (a new DataFrame).
    labels:   the `target` column cast to int, as a NumPy array.
    """
    labels = frame[target].astype(int).to_numpy()
    features = frame.drop(columns=[target], errors="ignore")
    return features, labels


X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)


#Class imbalance handling
#infant death is rare, so the positive class is weighted more heavily
#scale_pos_weight = (# negative rows) / (# positive rows) is a common baseline for imbalanced classification
pos = np.sum(y_train == 1)
neg = np.sum(y_train == 0)
if pos == 0:
    #without this guard, neg / pos on numpy integers only emits a RuntimeWarning and
    #yields inf (or nan), which would then be handed to the classifier as a weight
    raise ValueError(
        "y_train contains no positive (infant_death == 1) rows; "
        "cannot compute scale_pos_weight"
    )
scale_pos_weight = neg / pos

#Pipeline
#Wrapping the model in a Pipeline means any preprocessing step added later is fit inside each CV fold,
#which prevents data leakage. For now the classifier is the only step, because the splits are loaded
#already cleaned and encoded.
xgb_clf = XGBClassifier(
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=scale_pos_weight,   # up-weight the rare positive class
    eval_metric="logloss",
    device="cuda",          # change to "cpu" if you don't have a GPU runtime
    tree_method="hist",
)

pipe = Pipeline(steps=[("xgb", xgb_clf)])

# Cross validation and hyperparameter tuning
# StratifiedKFold preserves the infant_death rate in each fold.
# RandomizedSearchCV samples n_iter hyperparameter combinations and keeps the one
# with the best mean cross-validated F1 score.

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Hyperparameter search space
# n_estimators: number of boosting rounds/trees
# max_depth: max tree depth (higher = more complex, risk of overfitting)
# learning_rate: step size (lower often needs more trees)
# subsample/colsample_bytree: row/column sampling to reduce overfitting
param_dist = {
    "xgb__n_estimators": [200, 300, 500],
    "xgb__max_depth": [3, 5, 7],
    "xgb__learning_rate": [0.03, 0.05, 0.1],
    "xgb__subsample": [0.8, 1.0],
    "xgb__colsample_bytree": [0.8, 1.0],
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=10,
    scoring="f1",
    cv=cv,
    # Evaluate candidates one at a time. The XGBoost step is already configured
    # with n_jobs=-1 and device="cuda", so each individual fit is parallel on its
    # own; running several fits concurrently on top of that oversubscribes the CPU
    # threads and makes every worker compete for the same GPU with its own copy
    # of the training data.
    n_jobs=1,
    verbose=1,
    random_state=42,
)

# Fit the search on training data only; validation and test rows are never seen here.
search.fit(X_train, y_train)
print("\nBest params:", search.best_params_)

# With the default refit=True, best_estimator_ is the winning pipeline refit on all of X_train.
best_model = search.best_estimator_

def _report_split(title, accuracy_label, model, X, y_true):
    """Predict on one split, print its metrics, and return the predictions.

    Prints, in order: a blank line plus `title`, the confusion matrix, the
    classification report (precision/recall/F1 to 4 digits), and the accuracy
    prefixed with "<accuracy_label> accuracy:".

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    accuracy_label : str
        Prefix used on the accuracy line (e.g. "VAL").
    model : fitted estimator exposing .predict(X)
    X : feature table for the split
    y_true : true labels for the split

    Returns
    -------
    Predicted labels as returned by model.predict(X).
    """
    # NOTE(review): the captured output shows an XGBoost device-mismatch hint at
    # predict time, presumably because the model is on "cuda" while X is a
    # CPU DataFrame -- confirm; it is a warning, not an error.
    y_pred = model.predict(X)
    print(f"\n{title}")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4))
    print(f"{accuracy_label} accuracy:", accuracy_score(y_true, y_pred))
    return y_pred


#evaluate on validation set
#we report multiple metrics: confusion matrix, precision/recall/F1 and accuracy
#accuracy alone can be misleading for rare events, so read it alongside the per-class metrics
val_pred = _report_split("Validation", "VAL", best_model, X_val, y_val)

#evaluate on test set
#the test set is only used once at the end, after model selection
test_pred = _report_split("Test", "TEST", best_model, X_test, y_test)


Fitting 3 folds for each of 10 candidates, totalling 30 fits

Best params: {'xgb__subsample': 0.8, 'xgb__n_estimators': 200, 'xgb__max_depth': 7, 'xgb__learning_rate': 0.05, 'xgb__colsample_bytree': 0.8}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



Validation
[[661716  46989]
 [   968   3211]]
              precision    recall  f1-score   support

           0     0.9985    0.9337    0.9650    708705
           1     0.0640    0.7684    0.1181      4179

    accuracy                         0.9327    712884
   macro avg     0.5313    0.8510    0.5416    712884
weighted avg     0.9931    0.9327    0.9601    712884

VAL accuracy: 0.9327281857917978

Test
[[662229  46476]
 [   969   3210]]
              precision    recall  f1-score   support

           0     0.9985    0.9344    0.9654    708705
           1     0.0646    0.7681    0.1192      4179

    accuracy                         0.9334    712884
   macro avg     0.5316    0.8513    0.5423    712884
weighted avg     0.9931    0.9334    0.9605    712884

TEST accuracy: 0.9334463952059522
