# Explore here

In [53]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


# Load the pre-cleaned train and test splits in one pass. The tuple keeps the
# (train, test) order so the unpacking on the left lines up with the files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/clean_train.csv",
        "../data/processed/clean_test.csv",
    )
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,Pregnancies,Glucose,BMI,Age,Outcome
0,3.0,111.0,30.1,30.0,0
1,2.0,98.0,34.7,22.0,0
2,4.0,131.0,33.1,28.0,0
3,8.0,120.0,25.0,64.0,0
4,10.0,108.0,32.4,42.0,1


In [54]:
# Separate predictors from the "Outcome" target for both splits.
#
# The loaded frames carry an "Unnamed: 0" column (visible in the
# train_data.head() preview, holding 0, 1, 2, ...). That is the name pandas
# gives a header-less first column -- apparently the row index written out
# when the CSVs were saved. It is not a measurement, so it must not be handed
# to the model as a feature.
TARGET_COLUMN = "Outcome"
INDEX_ARTIFACT_COLUMN = "Unnamed: 0"


def split_features_target(frame):
    """Return ``(X, y)`` for ``frame``.

    ``y`` is the ``TARGET_COLUMN`` series. ``X`` is every other column except
    ``INDEX_ARTIFACT_COLUMN``, which is dropped only when it is present.

    Raises:
        KeyError: if ``frame`` has no ``TARGET_COLUMN`` column.
    """
    features = frame.drop(columns=[TARGET_COLUMN])
    # errors="ignore" keeps this working if the CSVs are later re-exported
    # without the index column.
    features = features.drop(columns=[INDEX_ARTIFACT_COLUMN], errors="ignore")
    return features, frame[TARGET_COLUMN]


X_train, y_train = split_features_target(train_data)
X_test, y_test = split_features_target(test_data)

In [55]:
# Baseline: XGBoost classifier with library-default hyperparameters.
# fit() hands back the fitted estimator, so construction and training chain
# into a single assignment.
model = XGBClassifier().fit(X_train, y_train)
# Bare last expression so the notebook still displays the fitted estimator.
model

In [56]:
# Predict class labels for the test split with the baseline model.
y_pred = model.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0])

In [57]:
# Accuracy of the baseline predictions against the true test labels.
accuracy_score(y_test, y_pred)

0.7702702702702703

In [58]:
# Persist the baseline model. The context manager closes the file handle as
# soon as the dump finishes; the bare open() used before left closing (and
# the final flush to disk) to the garbage collector.
with open("../models/model_orig.sav", "wb") as model_file:
    pickle.dump(model, model_file)

# Optimización

In [59]:
# Hyperparameter grid explored by the cross-validated search below
# (3 * 2 * 2 * 2 * 2 = 48 candidate combinations).
xg_grid = {"n_estimators": [250, 350, 400],
           "max_depth": [10, 12],
           "learning_rate": [0.01, 0.1],
           "gamma": [0, 0.1],
           "alpha": [0.01, 0.1]}

# Cross-validated grid search for XGBClassifier.
# Candidates are ranked by accuracy: this is a classification problem and
# accuracy is the metric the notebook reports for the fitted models, whereas
# the previous 'neg_root_mean_squared_error' is a regression scorer applied
# to 0/1 labels.
xg_model_grid = GridSearchCV(XGBClassifier(),
                             param_grid=xg_grid,
                             scoring="accuracy",
                             cv=5,
                             n_jobs=-1,
                             verbose=True)

xg_model_grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [60]:
# Hyperparameter combination selected by the grid search.
xg_model_grid.best_params_

{'alpha': 0.1,
 'gamma': 0.1,
 'learning_rate': 0.01,
 'max_depth': 10,
 'n_estimators': 250}

In [61]:
# Final model: built directly from the grid-search winner instead of values
# copied by hand from a printed output, so this cell cannot drift out of sync
# when the search is re-run with different data, grid or scoring.
xgb_ideal_model_g = XGBClassifier(**xg_model_grid.best_params_)
xgb_ideal_model_g.fit(X_train, y_train)

In [66]:
# Predict class labels for the test split with the tuned model.
y_pred_gd = xgb_ideal_model_g.predict(X_test)
# Bare last expression so the notebook displays the predicted labels.
y_pred_gd

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0])

In [67]:
# Test-split metrics for the tuned model: compute each value once, then
# report. The printed text is unchanged.
mse_test = mean_squared_error(y_test, y_pred_gd)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_gd)
acc_test = accuracy_score(y_test, y_pred_gd)

print(f"Error cuadrático medio: {mse_test}")
print(f"Raíz del Error cuadrático medio: {rmse_test}")
print(f"Coeficiente de determinación: {r2_test}")
print(acc_test)

Error cuadrático medio: 0.22972972972972974
Raíz del Error cuadrático medio: 0.4793012932694108
Coeficiente de determinación: -0.037311894454751915
0.7702702702702703


In [64]:
# Predict on the training split itself with the tuned model, for comparison
# against the test-split metrics.
y_pred_train = xgb_ideal_model_g.predict(X_train)

In [65]:
# Training-split metrics for the tuned model: compute each value once, then
# report. The printed text is unchanged.
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

print(f"Error cuadrático medio: {mse_train}")
print(f"Raíz del Error cuadrático medio: {rmse_train}")
print(f"Coeficiente de determinación: {r2_train}")

Error cuadrático medio: 0.06756756756756757
Raíz del Error cuadrático medio: 0.25993762245501817
Coeficiente de determinación: 0.7028671811280507
