# Explore here

In [27]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import *
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint
import time
import joblib


In [2]:
# We load the data we previously cleaned.
test_data = pd.read_csv(r"D:\01A-TRABAJO\PYTHON\DATASCIENCE\EJERCICIOS\DECISIONTREE\DecisionTreeProject\data\processed\clean_test.csv")
train_data = pd.read_csv(r"D:\01A-TRABAJO\PYTHON\DATASCIENCE\EJERCICIOS\DECISIONTREE\DecisionTreeProject\data\processed\clean_train.csv")


In [3]:
test_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,98.0,58.0,190.0,58.0,0.43,43.0,0
1,2.0,112.0,75.0,0.0,75.0,0.148,21.0,0
2,2.0,108.0,64.0,0.0,64.0,0.158,21.0,0
3,8.0,107.0,80.0,0.0,80.0,0.856,34.0,0
4,7.0,136.0,90.0,0.0,90.0,0.21,50.0,0


In [4]:
# We specify the X and the y for the model
X_train = train_data.drop('Outcome', axis=1)
y_train = train_data['Outcome']
X_test = test_data.drop('Outcome', axis=1)
y_test = test_data['Outcome']

We will perform two filters, a wider randomized search and then a more concentrated grid search to aim to get the best results for our model.

In [5]:
# We define a wide parameter distribution for RandomizedSearchCV
param_dist = {
    'max_depth': randint(1, 20),
    'learning_rate': uniform(0.01, 0.5),
    'n_estimators': randint(50, 1000),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 5),
    'reg_alpha': uniform(0, 10),
    'reg_lambda': uniform(0, 10)
}

In [6]:
xgb = XGBClassifier(random_state=42)
random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)

In [7]:
start_time = time.time()
random_search.fit(X_train, y_train)
print(f"RandomizedSearchCV took {time.time() - start_time:.2f} seconds")
print("-"*40)
print("Best RandomizedSearchCV parameters:")
print(random_search.best_params_)
print(f"Best RandomizedSearchCV accuracy: {random_search.best_score_:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
RandomizedSearchCV took 9.31 seconds
----------------------------------------
Best RandomizedSearchCV parameters:
{'colsample_bytree': np.float64(0.9512764533397833), 'gamma': np.float64(2.5262618622392856), 'learning_rate': np.float64(0.4232287330538708), 'max_depth': 13, 'min_child_weight': 3, 'n_estimators': 226, 'reg_alpha': np.float64(0.534852967473689), 'reg_lambda': np.float64(9.585414968831984), 'subsample': np.float64(0.9235715720477948)}
Best RandomizedSearchCV accuracy: 0.7638


In [8]:
best_params = random_search.best_params_
param_grid = {
    'max_depth': [max(1, best_params['max_depth'] - 1), best_params['max_depth'], min(20, best_params['max_depth'] + 1)],
    'learning_rate': [max(0.01, best_params['learning_rate'] - 0.01), best_params['learning_rate'], min(0.5, best_params['learning_rate'] + 0.01)],
    'n_estimators': [max(50, best_params['n_estimators'] - 50), best_params['n_estimators'], min(1000, best_params['n_estimators'] + 50)],
    'min_child_weight': [max(1, best_params['min_child_weight'] - 1), best_params['min_child_weight'], min(10, best_params['min_child_weight'] + 1)],
    'subsample': [max(0.5, best_params['subsample'] - 0.1), best_params['subsample'], min(1, best_params['subsample'] + 0.1)],
    'colsample_bytree': [max(0.5, best_params['colsample_bytree'] - 0.1), best_params['colsample_bytree'], min(1, best_params['colsample_bytree'] + 0.1)],
    'gamma': [max(0, best_params['gamma'] - 0.1), best_params['gamma'], min(5, best_params['gamma'] + 0.1)],
    'reg_alpha': [max(0, best_params['reg_alpha'] - 1), best_params['reg_alpha'], min(10, best_params['reg_alpha'] + 1)],
    'reg_lambda': [max(0, best_params['reg_lambda'] - 1), best_params['reg_lambda'], min(10, best_params['reg_lambda'] + 1)]
}

In [9]:
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

start_time = time.time()
grid_search.fit(X_train, y_train)
print(f"GridSearchCV took {time.time() - start_time:.2f} seconds")

print("\nBest GridSearchCV parameters:")
print(grid_search.best_params_)
print(f"Best GridSearchCV accuracy: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 19683 candidates, totalling 98415 fits
GridSearchCV took 514.53 seconds

Best GridSearchCV parameters:
{'colsample_bytree': 1, 'gamma': np.float64(2.5262618622392856), 'learning_rate': np.float64(0.4132287330538708), 'max_depth': 12, 'min_child_weight': 3, 'n_estimators': 176, 'reg_alpha': np.float64(0.534852967473689), 'reg_lambda': np.float64(9.585414968831984), 'subsample': np.float64(0.8235715720477949)}
Best GridSearchCV accuracy: 0.7785


In [10]:
# Final model evaluation
best_model = grid_search.best_estimator_
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = best_model.score(X_test, y_test)

print(f"\nFinal model training accuracy: {train_accuracy:.4f}")
print(f"Final model test accuracy: {test_accuracy:.4f}")


Final model training accuracy: 0.7964
Final model test accuracy: 0.7727


In this exercise we were asked to look for the best accuracy, we will proceed by generating the XGB classifier and perform a grid search.

In [22]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp)

In [23]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test):
    # Calcular métricas para el conjunto de entrenamiento
    train_accuracy = accuracy_score(y_train, y_pred_train)
    train_f1 = f1_score(y_train, y_pred_train)
    train_auc = roc_auc_score(y_train, y_pred_train)
    train_precision = precision_score(y_train, y_pred_train)
    train_recall = recall_score(y_train, y_pred_train)
    train_specificity = specificity_score(y_train, y_pred_train)

    # Calcular métricas para el conjunto de prueba
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_auc = roc_auc_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_specificity = specificity_score(y_test, y_pred_test)

    # Calcular la diferencia entre métricas de entrenamiento y prueba
    diff_accuracy = train_accuracy - test_accuracy
    diff_f1 = train_f1 - test_f1
    diff_auc = train_auc - test_auc
    diff_precision = train_precision - test_precision
    diff_recall = train_recall - test_recall
    diff_specificity = train_specificity - test_specificity

    # Crear un DataFrame con los resultados
    metrics_df = pd.DataFrame([[train_accuracy, train_f1, train_auc, train_precision, train_recall, train_specificity],[test_accuracy, test_f1, test_auc, test_precision, test_recall, test_specificity],[diff_accuracy, diff_f1, diff_auc, diff_precision, diff_recall, diff_specificity]],
                              columns = ['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
                              index = ['Train','Test', 'Diferencia'])

    return metrics_df

In [24]:
test_pred = best_model.predict(X_test)
train_pred = best_model.predict(X_train)

In [25]:
get_metrics(y_train, y_test, train_pred, test_pred)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.796417,0.680307,0.756097,0.747191,0.624413,0.887781
Test,0.772727,0.666667,0.742424,0.7,0.636364,0.848485
Diferencia,0.02369,0.01364,0.013673,0.047191,-0.01195,0.039296


We will proceed by saving the model

In [28]:
models_dir = os.path.join('..', 'models')

model_path = os.path.join(models_dir, 'xgboost_model.joblib')


joblib.dump(best_model, model_path)

['..\\models\\xgboost_model.joblib']