# Explore here

In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from imblearn.metrics import specificity_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,plot_tree,export_text
# Download the diabetes dataset and keep an untouched copy of it on disk.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"
RAW_DATA_PATH = Path('/workspaces/Forest_true_joel/data/raw/diabetes.csv')

data = pd.read_csv(DATA_URL, sep=',')

# to_csv does not create missing folders, so make sure the target directory
# exists before writing; otherwise the cell fails on a fresh checkout.
RAW_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(RAW_DATA_PATH, index=False)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


En este caso ya habíamos hecho el EDA en el ejercicio anterior, así que aquí voy a imputar todos los datos para ver la diferencia con el otro modelo, aunque ya sabemos que en un ejercicio normal estos datos no nos servirían para el modelo.

In [3]:
# Treat a value of 0 in these columns as missing by converting it to NaN.
zero_as_nan_columns = ['Glucose','Insulin','BloodPressure','SkinThickness','BMI']
data[zero_as_nan_columns] = data[zero_as_nan_columns].replace(0, np.nan)


In [4]:
# Percentage of missing values in each column.
data.isna().mean().mul(100)


Pregnancies                  0.000000
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                      0.000000
dtype: float64

In [5]:
# Fill the missing values of four measurement columns with a
# 3-nearest-neighbours imputer (Glucose is not included in this step).
# NOTE(review): the imputer is fitted on every row, before the train/test
# split performed later in the notebook, so held-out rows influence the values
# imputed for training rows. Consider fitting it on the training rows only.
knn_columns = ['SkinThickness','BloodPressure','BMI','Insulin']
imputador_knn = KNNImputer(n_neighbors=3)
data[knn_columns] = imputador_knn.fit_transform(data[knn_columns])


In [6]:
# Drop the rows that still contain a missing value after the imputation step
# (Glucose was converted to NaN earlier but was not one of the imputed columns).
# Rebinding the name instead of using inplace=True follows the recommended
# pandas style and leaves the previous frame object untouched.
data = data.dropna()

In [7]:
# Number of missing values left in each column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
# Separate the target from the predictors and hold out 20% of the rows for testing.
y = data['Outcome']
X = data.drop(columns=['Outcome'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [9]:
# Baseline random forest with 60 trees, fitted on the training split.
model = RandomForestClassifier(random_state=42, n_estimators=60).fit(X_train, y_train)
model

In [10]:
# Predict the class labels for both splits.
train_pred, test_pred = [model.predict(features) for features in (X_train, X_test)]

In [11]:
def get_metrics(y_train, y_test, y_pred_train, y_pred_test,
                y_score_train=None, y_score_test=None):
    """Compare classification metrics between the train and test splits.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the training and test splits.
    y_pred_train, y_pred_test : array-like
        Predicted class labels for the same splits.
    y_score_train, y_score_test : array-like, optional
        Continuous scores (for example the positive-class probability) used
        only for the AUC column. When omitted, AUC is computed from the hard
        predictions, exactly as before. Note that for binary labels an AUC
        computed from hard predictions reduces to the mean of recall and
        specificity, so pass scores to obtain a genuine ranking-based AUC.

    Returns
    -------
    pandas.DataFrame
        Rows 'Train', 'Test' and 'Diferencia' (train minus test); columns
        'Accuracy', 'F1', 'AUC', 'Precision', 'Recall' and 'Specificity'.
    """
    def _split_metrics(y_true, y_pred, y_score):
        # One row of metrics for a single split, in the output column order.
        auc_input = y_pred if y_score is None else y_score
        return [
            accuracy_score(y_true, y_pred),
            f1_score(y_true, y_pred),
            roc_auc_score(y_true, auc_input),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            specificity_score(y_true, y_pred),
        ]

    train_row = _split_metrics(y_train, y_pred_train, y_score_train)
    test_row = _split_metrics(y_test, y_pred_test, y_score_test)

    # A large positive gap means the model does much better on data it has seen.
    diff_row = [train_value - test_value
                for train_value, test_value in zip(train_row, test_row)]

    metrics_df = pd.DataFrame(
        [train_row, test_row, diff_row],
        columns=['Accuracy', 'F1', 'AUC', 'Precision', 'Recall', 'Specificity'],
        index=['Train', 'Test', 'Diferencia'],
    )

    return metrics_df

In [12]:
# Train vs. test metrics for the baseline forest.
get_metrics(y_train=y_train, y_test=y_test, y_pred_train=train_pred, y_pred_test=test_pred)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,1.0,1.0,1.0,1.0,1.0,1.0
Test,0.771242,0.623656,0.725893,0.644444,0.604167,0.847619
Diferencia,0.228758,0.376344,0.274107,0.355556,0.395833,0.152381


In [13]:
# Search for the hyperparameters that keep the model from memorising the training data.
hyperparams = dict(
    n_estimators=[10, 30, 50, 70, 90],
    criterion=["gini", "entropy"],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid = GridSearchCV(estimator=model, param_grid=hyperparams, scoring="precision", cv=10)

In [14]:
# Run the cross-validated search on the training split only
# (360 parameter combinations, each evaluated with 10 folds).
grid.fit(X=X_train, y=y_train)

In [15]:
# Hyperparameter combination selected by the grid search.
grid.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [16]:
# Rebuild the model with the hyperparameters selected by the grid search.
# They are read from grid.best_params_ instead of being copied by hand from
# the output, so this cell stays in sync if the grid or the data changes.
model = RandomForestClassifier(**grid.best_params_, random_state=42)
model.fit(X_train, y_train)

In [17]:
# Predict the class labels for both splits with the tuned model.
train_pred, test_pred = map(model.predict, (X_train, X_test))

In [18]:
# Train vs. test metrics for the tuned forest.
get_metrics(y_train=y_train, y_test=y_test, y_pred_train=train_pred, y_pred_test=test_pred)

Unnamed: 0,Accuracy,F1,AUC,Precision,Recall,Specificity
Train,0.836066,0.748744,0.802202,0.827778,0.683486,0.920918
Test,0.777778,0.645833,0.741964,0.645833,0.645833,0.838095
Diferencia,0.058288,0.10291,0.060238,0.181944,0.037653,0.082823
