# Explore here

In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer, SimpleImputer
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from imblearn.metrics import specificity_score
from sklearn.tree import DecisionTreeClassifier,plot_tree,export_text
import missingno as msno
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
from pathlib import Path  # imported here so this cell stays self-contained

# Ignore convergence warnings.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Remote source of the dataset and the local path where a raw copy is kept.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
RAW_DATA_PATH = Path('/workspaces/Boosting_joeld/data/raw/diabetes.csv')

data = pd.read_csv(DATA_URL, sep=',')

# Create the destination directory first: to_csv fails when it does not exist.
RAW_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(RAW_DATA_PATH, index=False)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [105]:
# Treat a value of 0 in these measurement columns as missing: replace it with NaN.
zero_as_missing_cols = ['Glucose', 'Insulin', 'BloodPressure', 'SkinThickness', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(to_replace=0, value=np.nan)


In [106]:
# Fill the missing values of these four columns with a 3-nearest-neighbours imputer.
# Glucose is not part of this list, so its NaN values are left in place here.
# NOTE(review): the imputer is fitted on the whole dataset, before the
# train/test split performed in a later cell - confirm this is intended.
knn_impute_cols = ['SkinThickness', 'BloodPressure', 'BMI', 'Insulin']
imputador_knn = KNNImputer(n_neighbors=3)
data[knn_impute_cols] = imputador_knn.fit_transform(data[knn_impute_cols])

In [107]:
# Drop every row that still contains at least one missing value.
data.dropna(axis=0, how='any', inplace=True)

In [108]:
# Count the remaining missing values in each column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [109]:
# Separate the target from the predictors, then hold out 20% of the rows for testing.
y = data['Outcome']
X = data.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [110]:
# Baseline gradient-boosting classifier with 5 estimators and a fixed seed.
model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=5,
)
model.fit(X_train, y_train)

In [111]:
# Predict class labels for both splits with the fitted model.
train_pred, test_pred = model.predict(X_train), model.predict(X_test)

In [112]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test,
                y_score_train=None, y_score_test=None):
    """Build a table of classification metrics for the train and test splits.

    Parameters
    ----------
    y_train, y_test
        True labels of the training and test splits.
    y_pred_train, y_pred_test
        Predicted class labels for the training and test splits.
    y_score_train, y_score_test : optional
        Scores passed to ``roc_auc_score`` in place of the predicted labels
        (for example the positive-class column of ``predict_proba``).
        When omitted, the AUC is computed from the predicted labels, which
        keeps the behaviour of calls that only pass the first four arguments.

    Returns
    -------
    pandas.DataFrame
        Rows ``'Train'``, ``'Test'`` and ``'Diferencia'`` (train minus test);
        columns ``'Accuracy'``, ``'F1'``, ``'AUC'``, ``'Precision'``,
        ``'Recall'`` and ``'Specificity'``.
    """

    def _split_metrics(y_true, y_pred, y_score):
        # One row of the table, in the same order as the column labels below.
        auc_input = y_pred if y_score is None else y_score
        return [
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            roc_auc_score(y_true, auc_input),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            specificity_score(y_true, y_pred),
        ]

    train_row = _split_metrics(y_train, y_pred_train, y_score_train)
    test_row = _split_metrics(y_test, y_pred_test, y_score_test)

    # Positive values mean the model scores better on train than on test.
    diff_row = [train_value - test_value
                for train_value, test_value in zip(train_row, test_row)]

    metrics_df = pd.DataFrame(
        [train_row, test_row, diff_row],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Diferencia'],
    )

    return metrics_df

In [113]:
# Train/test metric table for the model fitted above.
get_metrics(
    y_train=y_train,
    y_test=y_test,
    y_pred_train=train_pred,
    y_pred_test=test_pred,
)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.739344,0.453608,0.642448,0.90411,0.302752,0.982143
Test,0.771242,0.477612,0.652381,0.842105,0.333333,0.971429
Diferencia,-0.031898,-0.024004,-0.009933,0.062004,-0.030581,0.010714


En este caso, lo más llamativo es que el recall es muy bajo (≈0.30 en train y ≈0.33 en test) mientras que la precisión es alta: el modelo acierta en la mayoría de los casos que clasifica como positivos, pero deja sin detectar una parte significativa de los positivos reales.

In [114]:
# Hyperparameter grid to search, looking for a setting that keeps the model from memorising the training data.
hyperparams = dict(
    n_estimators=[5, 10, 15],
    learning_rate=[0.001, 0.01, 0.1],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid with 5-fold cross-validation, ranked by precision.
# NOTE(review): candidates are ranked by precision although the remark above
# highlights low recall - confirm this is the intended selection metric.
grid = GridSearchCV(estimator=model, param_grid=hyperparams, scoring="precision", cv=5)

In [115]:
# Run the cross-validated search on the training split.
grid.fit(X_train, y=y_train)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [116]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 10}

In [118]:
# Build the classifier from the hyperparameters selected by the grid search
# rather than hand-copying them, so this cell cannot drift from the search result.
# The grid does not include random_state, so it is passed separately.
model = GradientBoostingClassifier(**grid.best_params_, random_state=42)

model.fit(X_train, y_train)

In [119]:
# Refresh the label predictions for both splits with the refitted model.
train_pred, test_pred = (model.predict(split) for split in (X_train, X_test))

In [120]:
# Train/test metric table for the refitted model.
get_metrics(y_train=y_train, y_test=y_test, y_pred_train=train_pred, y_pred_test=test_pred)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.891803,0.829016,0.856768,0.952381,0.733945,0.979592
Test,0.771242,0.597701,0.708929,0.666667,0.541667,0.87619
Diferencia,0.120561,0.231314,0.14784,0.285714,0.192278,0.103401


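Ahora el modelo tiene mejor pinta (en test, el recall sube de ≈0.33 a ≈0.54 y el F1 de ≈0.48 a ≈0.60), aunque todavía no es óptimo: la accuracy en test no cambia (≈0.77) y la diferencia entre train y test ha aumentado, lo que apunta a cierto sobreajuste.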