In [4]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import fbeta_score, make_scorer

# Load the training and test splits. The training file names indicate the
# split was SMOTE-resampled upstream; the test files carry no such suffix.
X_train = pd.read_csv('../data/processed/X_train_smote.csv')
y_train = pd.read_csv('../data/processed/y_train_smote.csv').values.ravel()
X_test = pd.read_csv('../data/processed/X_test.csv')
y_test = pd.read_csv('../data/processed/y_test.csv').values.ravel()
print('X_train shape:', X_train.shape, 'y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape, 'y_test shape:', y_test.shape)

# Create the XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not use it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
)

# Define the hyperparameter search space.
# NOTE(review): if the training data is already class-balanced by SMOTE, the
# large `scale_pos_weight` values are probably redundant -- confirm.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 4, 5, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'scale_pos_weight': [1, 50, 100]
}

# Randomized search over 30 sampled configurations, scored by F2 (recall is
# weighted higher than precision) with 5-fold stratified cross-validation.
# NOTE(review): cross-validating on data that was resampled before splitting
# puts synthetic samples in the validation folds, so the CV score is likely
# optimistic; the test-set scores below are the ones to trust -- confirm.
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=30,
    scoring=make_scorer(fbeta_score, beta=2),
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print('Best hyperparameters:', random_search.best_params_)

# Predict on the test set once and reuse the result for every metric instead
# of re-running inference for each score.
y_pred = random_search.predict(X_test)
xgb_f2 = fbeta_score(y_test, y_pred, beta=2)
xgb_f1 = fbeta_score(y_test, y_pred, beta=1)
print('Best F2 score on test set:', xgb_f2)
print('Best F1 score on test set:', xgb_f1)

# Comparing baseline with XGBoost from 04_modeling.ipynb
# TODO: 0.45 is a placeholder; the verdict printed below is only meaningful
# once it is replaced with the baseline's measured F2 score.
baseline_f2 = 0.45  # Replace with actual baseline F2 score
print(f'Baseline F2 Score: {baseline_f2:.4f}')
print(f'XGBoost F2 Score: {xgb_f2:.4f}')
if xgb_f2 > baseline_f2:
    print('XGBoost outperforms the baseline model.')
else:
    print('Baseline model outperforms XGBoost.')

# Saving the tuned model (best estimator refit by the search)
joblib.dump(random_search.best_estimator_, '../models/xgb_tuned_model.pkl')




X_train shape: (453204, 33) y_train shape: (453204,)
X_test shape: (56746, 33) y_test shape: (56746,)
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters: {'subsample': 1.0, 'scale_pos_weight': 1, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.2, 'colsample_bytree': 0.6}
Best F2 score on test set: 0.8061002178649237
Best F1 score on test set: 0.8505747126436781
Baseline F2 Score: 0.4500
XGBoost F2 Score: 0.8061
XGBoost outperforms the baseline model.


['../models/xgb_tuned_model.pkl']