In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("diabetes_feature.csv")
df.shape

(768, 16)

In [2]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [4]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness', 
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high', 'Insulin_nan', 'low_glu_insulin']]

y = df['Outcome']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42)

X_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Pregnancies_high,Insulin_nan,low_glu_insulin
60,84,0,0,0.0,0.304,21,False,102.5,True
618,112,82,24,28.2,1.282,50,True,196.5,False
346,139,46,19,28.7,0.654,22,False,83.0,False
294,161,50,0,21.9,0.254,65,False,102.5,False
231,134,80,37,46.2,0.238,46,False,370.0,False
...,...,...,...,...,...,...,...,...,...
71,139,64,35,28.6,0.411,26,False,140.0,False
106,96,122,0,22.4,0.207,27,False,102.5,True
270,101,86,37,45.6,1.136,38,True,196.5,False
435,141,0,0,42.4,0.205,29,False,196.5,False


In [5]:
X_train.shape, y_train.shape

((614, 9), (614,))

In [6]:
X_test.shape, y_test.shape

((154, 9), (154,))

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

estimators = [DecisionTreeClassifier(random_state=42), 
             RandomForestClassifier(random_state=42),
             GradientBoostingClassifier(random_state=42)]
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [26]:
max_depth = np.random.randint(2, 20, 10)
max_depth

array([15, 10, 11,  3,  4, 14, 12,  4, 13, 14])

In [27]:
max_features = np.random.uniform(0.3, 1.0, 10)
max_features

array([0.34404898, 0.6716441 , 0.47820492, 0.76844985, 0.66399051,
       0.78053916, 0.94507673, 0.98520307, 0.46733419, 0.40148657])

In [31]:
results = []

for estimator in estimators:
    result = []
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [44]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {'max_depth': max_depth, 'max_features': max_features}

results = []
for estimator in estimators:
    result = []
    if estimator.__class__.__name__ != 'DecisionTreeClassifier':
        param_distributions['n_estimators'] = np.random.randint(100, 200, 10)
    clf = RandomizedSearchCV(estimator, param_distributions, n_iter=100, scoring='accuracy', n_jobs=-1, cv=5, verbose=2)

    clf.fit(X_train, y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test,y_test))
    result.append(clf.cv_results_)
    results.append(result)
results

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END .....max_depth=15, max_features=0.34404897777236454; total time=   0.0s
[CV] END .....max_depth=15, max_features=0.34404897777236454; total time=   0.0s
[CV] END .....max_depth=15, max_features=0.34404897777236454; total time=   0.0s
[CV] END .....max_depth=15, max_features=0.34404897777236454; total time=   0.0s
[CV] END .....max_depth=15, max_features=0.34404897777236454; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.6716440968082537; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.6716440968082537; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.6716440968082537; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.6716440968082537; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.6716440968082537; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.4782049166978379; total time=   0.0s
[CV] END ......max_depth=15, max_features=0.47

[['DecisionTreeClassifier',
  {'max_features': 0.7805391583585548, 'max_depth': 4},
  0.8795281887245103,
  0.8766233766233766,
  {'mean_fit_time': array([0.00184455, 0.00188832, 0.00161018, 0.00172276, 0.00195255,
          0.00200748, 0.0020946 , 0.00206952, 0.00168304, 0.00148377,
          0.00134664, 0.00171151, 0.00191746, 0.00197458, 0.00165982,
          0.00235486, 0.0023634 , 0.00211639, 0.00168209, 0.00194736,
          0.00151248, 0.0020287 , 0.0016099 , 0.00141459, 0.0013464 ,
          0.00224948, 0.00154066, 0.00160489, 0.00546522, 0.00169015,
          0.00102139, 0.00138292, 0.00406699, 0.00136085, 0.00096436,
          0.00107083, 0.00262661, 0.00145879, 0.00090032, 0.00083208,
          0.00270157, 0.00164413, 0.00098453, 0.00124888, 0.00102377,
          0.00208621, 0.00278139, 0.00375857, 0.01132884, 0.00134749,
          0.00273962, 0.00583138, 0.00389776, 0.00258574, 0.00583143,
          0.00245075, 0.00193982, 0.00454917, 0.00976405, 0.00634742,
          0.001

In [54]:
df = pd.DataFrame(results, columns=['estimator', 'best_params', 'train_score', 'test_score', 'cv_result'])

In [56]:
pd.DataFrame(df.loc[1, 'cv_result']).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
21,0.108211,0.007987,0.003830,0.002003,107,0.478205,11,"{'n_estimators': 107, 'max_features': 0.478204...",0.861789,0.959350,0.878049,0.902439,0.942623,0.908850,0.037136,1
38,0.127434,0.004441,0.004553,0.001307,146,0.401487,15,"{'n_estimators': 146, 'max_features': 0.401486...",0.869919,0.943089,0.878049,0.910569,0.934426,0.907210,0.029259,2
32,0.150875,0.012409,0.007575,0.003603,184,0.344049,13,"{'n_estimators': 184, 'max_features': 0.344048...",0.869919,0.943089,0.878049,0.902439,0.934426,0.905584,0.029253,3
45,0.100781,0.005610,0.002834,0.000023,107,0.478205,14,"{'n_estimators': 107, 'max_features': 0.478204...",0.861789,0.959350,0.869919,0.902439,0.934426,0.905584,0.037209,3
76,0.152068,0.017793,0.004055,0.000556,146,0.401487,14,"{'n_estimators': 146, 'max_features': 0.401486...",0.869919,0.934959,0.878049,0.910569,0.934426,0.905584,0.027386,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,0.107414,0.019663,0.003743,0.000905,138,0.76845,3,"{'n_estimators': 138, 'max_features': 0.768449...",0.821138,0.934959,0.845528,0.869919,0.934426,0.881194,0.046325,93
50,0.117081,0.007093,0.008744,0.004774,146,0.344049,3,"{'n_estimators': 146, 'max_features': 0.344048...",0.829268,0.910569,0.861789,0.878049,0.926230,0.881181,0.034565,97
72,0.172530,0.009886,0.005328,0.001875,184,0.945077,3,"{'n_estimators': 184, 'max_features': 0.945076...",0.829268,0.926829,0.829268,0.878049,0.934426,0.879568,0.045399,98
8,0.172435,0.014047,0.007230,0.003516,174,0.478205,3,"{'n_estimators': 174, 'max_features': 0.478204...",0.821138,0.902439,0.861789,0.869919,0.926230,0.876303,0.035973,99


In [53]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,False,False,True,False,196.5,5.285739,False
1,1,85,66,29,0,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,0,23.3,0.672,32,1,True,False,True,False,196.5,5.285739,False
3,1,89,66,23,94,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False
