## XGBoost Model

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.tree import plot_tree
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from pickle import dump

In [19]:
train_data = pd.read_csv("data/clean/clean_train.csv")
test_data = pd.read_csv("data/clean/clean_test.csv")

x_train = train_data.drop(["Outcome"], axis = 1)
y_train = train_data["Outcome"]
x_test = test_data.drop(["Outcome"], axis = 1)
y_test = test_data["Outcome"]

x_train.head(10)

Unnamed: 0,Age,BMI,Glucose,Insulin,DiabetesPedigreeFunction
0,26,23.1,109,120,0.407
1,21,30.8,100,50,0.597
2,25,42.0,180,0,1.893
3,23,35.1,129,270,0.231
4,21,30.4,84,76,0.968
5,32,30.0,125,120,0.464
6,46,46.1,144,180,0.335
7,30,29.7,117,0,0.38
8,55,21.1,128,0,0.268
9,27,32.3,125,0,0.536


In [20]:
model = XGBClassifier(random_state = 42)
model.fit(x_train, y_train)

In [21]:
# Make predictions on the test set
y_pred = model.predict(x_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
report = classification_report(y_test, y_pred)
print(report)

#SAVING THE MODEL
dump(model, open("models/boosting_classifier_default_42.sav", "wb"))

Accuracy: 0.73
              precision    recall  f1-score   support

           0       0.77      0.78      0.78        91
           1       0.66      0.65      0.66        60

    accuracy                           0.73       151
   macro avg       0.72      0.72      0.72       151
weighted avg       0.73      0.73      0.73       151



## Optimization

In [22]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],       # Number of trees
    'learning_rate': [0.01, 0.5, 0.8],    # Step size shrinkage
    'max_depth': [3, 5],               # Maximum tree depth
    'subsample': [0.6, 1.0],         # Fraction of samples used per tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # Fraction of features used per tree
    'gamma': [0, 0.1, 0.2],               # Minimum loss reduction
    'min_child_weight': [3, 5]         # Minimum sum of instance weight for child nodes
}


grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)
grid_search.fit(x_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated score: {grid_search.best_score_}")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


Best parameters: {'colsample_bytree': 1.0, 'gamma': 0.2, 'learning_rate': 0.5, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 1.0}
Best cross-validated score: 0.7703994490358126


In [23]:
best_xgb = grid_search.best_estimator_
test_accuracy = best_xgb.score(x_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.7350993377483444


In [29]:
model = XGBClassifier(n_estimators= grid_search.best_params_['n_estimators'], max_depth = grid_search.best_params_['max_depth'],\
                                subsample = grid_search.best_params_['subsample'],\
                                      colsample_bytree = grid_search.best_params_['colsample_bytree'],\
                                            gamma= grid_search.best_params_['gamma'],\
                                                min_child_weight= grid_search.best_params_['min_child_weight'], random_state = 42)
model.fit(x_train, y_train)

In [30]:
y_pred = model.predict(x_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

# Detailed classification report
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.80      0.78      0.79        91
           1       0.68      0.70      0.69        60

    accuracy                           0.75       151
   macro avg       0.74      0.74      0.74       151
weighted avg       0.75      0.75      0.75       151



In [31]:
best_hyperparams_string = f"_n_estimators-{grid_search.best_params_['n_estimators']}_max_depth-{grid_search.best_params_['max_depth']}_subsample-{grid_search.best_params_['subsample']}_colsample_bytree-{grid_search.best_params_['colsample_bytree']}_gamma-{grid_search.best_params_['gamma']}_min_child_weight-{grid_search.best_params_['min_child_weight']}"


dump(model, open(f"models/xgboost_classifier_{best_hyperparams_string}_42.sav", "wb"))