In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from scipy.stats import randint as sp_randint

seed = np.random.seed(2017)

df = pd.read_csv('datasets/diabetes.csv')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [3]:
X = df.drop(columns='Outcome') # Variables independientes
y = df['Outcome'].values # Variable dependiente

In [4]:
X = StandardScaler().fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
kfold = StratifiedKFold(n_splits = 5, random_state=seed)

In [6]:
modelo_rf = RandomForestClassifier(random_state=seed).fit(X_train, y_train)
rf_parametros = {
    'n_estimators': [100, 250, 500, 750, 1000],
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
    'max_depth': [1, 3, 5, 7, 9]
}

In [7]:
#Búsqueda de mejores hiperparámetros óptimos usando GridSearch

grid = GridSearchCV(modelo_rf, rf_parametros, scoring='roc_auc', cv=kfold, verbose=10, n_jobs=-1)

In [8]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 113 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 189 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 233 tasks      | elapsed:  

[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=500, score=0.739, total=   0.8s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=250, score=0.837, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=750, score=0.830, total=   1.1s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500, score=0.776, total=   0.7s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000, score=0.780, total=   1.4s
[CV] criterion=gini, max_depth=3, max_features=None, n_estimators=250 
[CV]  criterion=gini, max_depth=3, max_features=None, n_estimators=250, score

[Parallel(n_jobs=-1)]: Done 653 tasks      | elapsed:   53.7s


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=250, score=0.744, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=100, score=0.839, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=100, score=0.782, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=100, score=0.822, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=250, score=0.779, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=750, score=0

[Parallel(n_jobs=-1)]: Done 690 tasks      | elapsed:   56.4s


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=100, score=0.870, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=750, score=0.720, total=   1.3s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=750, score=0.823, total=   1.2s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500, score=0.762, total=   0.7s
[CV] criterion=gini, max_depth=1, max_features=log2, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=log2, n_estimators=100, score=0.849, total=   0.1s
[CV] criterion=gini, max_depth=1, max_features=log2, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=log2, n_estimators=250, score=0

[Parallel(n_jobs=-1)]: Done 729 tasks      | elapsed:   58.7s


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=500, score=0.840, total=   0.8s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=100, score=0.771, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=250, score=0.744, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=750, score=0.757, total=   1.1s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=750, score=0.838, total=   1.1s
[CV] criterion=gini, max_depth=1, max_features=log2, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=log2, n_estimators=500, score=0

[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.0min


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=250, score=0.839, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=1000, score=0.738, total=   1.7s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=1000, score=0.786, total=   1.4s
[CV] criterion=gini, max_depth=1, max_features=log2, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=log2, n_estimators=250, score=0.783, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=log2, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=log2, n_estimators=1000, score=0.823, total=   1.4s
[CV] criterion=gini, max_depth=3, max_features=None, n_estimators=1000 
[CV]  criterion=gini, max_depth=3, max_features=None, n_estimators=1000,

[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  1.1min


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=500, score=0.707, total=   0.8s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=250, score=0.751, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=500, score=0.782, total=   0.8s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=100, score=0.748, total=   0.1s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=500, score=0.827, total=   0.7s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000, score

[Parallel(n_jobs=-1)]: Done 850 tasks      | elapsed:  1.1min


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=100, score=0.684, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=1000, score=0.840, total=   1.7s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=1000, score=0.763, total=   1.5s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=1000, score=0.760, total=   1.4s
[CV] criterion=gini, max_depth=3, max_features=None, n_estimators=250 
[CV]  criterion=gini, max_depth=3, max_features=None, n_estimators=250, score=0.756, total=   0.4s
[CV] criterion=gini, max_depth=3, max_features=None, n_estimators=1000 
[CV]  criterion=gini, max_depth=3, max_features=None, n_estimators=1000,

[Parallel(n_jobs=-1)]: Done 893 tasks      | elapsed:  1.2min


[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=100, score=0.841, total=   0.2s
[CV] criterion=gini, max_depth=1, max_features=None, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=None, n_estimators=750, score=0.725, total=   1.3s
[CV] criterion=gini, max_depth=1, max_features=auto, n_estimators=500 
[CV]  criterion=gini, max_depth=1, max_features=auto, n_estimators=500, score=0.766, total=   0.8s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=100 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=100, score=0.809, total=   0.1s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=250 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=250, score=0.771, total=   0.4s
[CV] criterion=gini, max_depth=1, max_features=sqrt, n_estimators=750 
[CV]  criterion=gini, max_depth=1, max_features=sqrt, n_estimators=750, score=0

[Parallel(n_jobs=-1)]: Done 936 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  1.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 3, 5, 7, 9],
                         'max_features': [None, 'auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 250, 500, 750, 1000]},
             scoring='roc_auc', verbose=10)

In [9]:
print('Mejores parametros: ',grid.best_params_)

Mejores parametros:  {'criterion': 'entropy', 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 100}


In [10]:
resultados = cross_val_score(grid.best_estimator_, X_train, y_train, cv=kfold)

In [11]:
print("Precision - Train CV: ", resultados.mean())
print("Precision - Train : ", accuracy_score(grid.best_estimator_.predict(X_train), y_train))

Precision - Train CV:  0.7522672204915195
Precision - Train :  0.9236499068901304


In [13]:
print("Precision - Train : ", accuracy_score(grid.best_estimator_.predict(X_test), y_test))

Precision - Train :  0.7835497835497836


In [14]:
#Búsqueda de mejores hiperparámetros óptimos usando RandomSearch

param_dist = {'n_estimators': sp_randint(100,1000),
              'criterion': ['gini', 'entropy'],
              'max_features': [None, 'auto', 'sqrt', 'log2'],
              'max_depth': [None, 1, 3, 5, 7, 9]
             }

In [15]:
n_iter_search = 20
random_search = RandomizedSearchCV(modelo_rf, param_distributions=param_dist, cv=kfold, n_iter=n_iter_search, verbose=10, n_jobs=-1)

In [16]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  80 out of 100 | elapsed:    6.1s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  91 out of 100 | elapsed:    7.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.3s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=RandomForestClassifier(), n_iter=20, n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [None, 1, 3, 5, 7, 9],
                                        'max_features': [None, 'auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f062984c340>},
                   verbose=10)

In [17]:
print('Mejores parametros: ',random_search.best_params_)

Mejores parametros:  {'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 213}


In [18]:
resultados = cross_val_score(random_search.best_estimator_, X_train, y_train, cv=kfold)

In [19]:
print("Precision - Train CV: ", resultados.mean())
print("Precision - Train : ", accuracy_score(random_search.best_estimator_.predict(X_train), y_train))
print("Precision - Train : ", accuracy_score(random_search.best_estimator_.predict(X_test), y_test))

Precision - Train CV:  0.7597092419522327
Precision - Train :  0.9869646182495344
Precision - Train :  0.8008658008658008
