In [10]:
import pandas as pd
df = pd.read_csv('diabetes.csv')

In [11]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [12]:
import numpy as np
df['Glucose'] = np.where(df['Glucose']==0, df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0, df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0, df['SkinThickness'].median(),df['SkinThickness'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [13]:
## Independent and depentent features
X = df.drop('Outcome',axis=1)
y = df['Outcome']

In [14]:
pd.DataFrame(X,columns=df.columns[:-1])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76,48.0,180.0,32.9,0.171,63
764,2,122.0,70,27.0,30.5,36.8,0.340,27
765,5,121.0,72,23.0,112.0,26.2,0.245,30
766,1,126.0,60,23.0,30.5,30.1,0.349,47


In [19]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=33)

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction=rf_classifier.predict(X_test)

In [21]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [29]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[87 12]
 [32 23]]
0.7142857142857143
              precision    recall  f1-score   support

           0       0.73      0.88      0.80        99
           1       0.66      0.42      0.51        55

    accuracy                           0.71       154
   macro avg       0.69      0.65      0.65       154
weighted avg       0.70      0.71      0.70       154



### Randomized Search cv

In [43]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
#number of tree in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num =10)]
#number of Features to consider at every split 
max_features = ['auto','sqrt','log2']
#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,1000,10)]
#minimum number of samples required to split a node
min_samples_split = [2,3,7,5,8,9]
#minimum number of samples required at each leaf node
min_samples_leaf = [1,2,3,5,9]
# Create the random grid 
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 3, 7, 5, 8, 9], 'min_samples_leaf': [1, 2, 3, 5, 9], 'criterion': ['entropy', 'gini']}


In [44]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=100,n_jobs=-1)

#fit the randomized model
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [10, 120, 230, 340, 450,
                                                      560, 670, 780, 890,
                                                      1000],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 5, 9],
                                        'min_samples_split': [2, 3, 7, 5, 8, 9],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=100, verbose=2)

In [45]:
rf_randomcv.best_params_

{'n_estimators': 1800,
 'min_samples_split': 8,
 'min_samples_leaf': 9,
 'max_features': 'log2',
 'max_depth': 450,
 'criterion': 'gini'}

In [46]:
rf_randomcv.best_estimator_

RandomForestClassifier(max_depth=450, max_features='log2', min_samples_leaf=9,
                       min_samples_split=8, n_estimators=1800)

In [47]:
best_random_grid=rf_randomcv.best_estimator_

In [48]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print('Accuracy Score {}'.format(accuracy_score(y_test,y_pred)))
print('Classification report: {}'.format(classification_report(y_test,y_pred)))

[[87 12]
 [27 28]]
Accuracy Score 0.7467532467532467
Classification report:               precision    recall  f1-score   support

           0       0.76      0.88      0.82        99
           1       0.70      0.51      0.59        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.74       154



## Gridsearch cv

In [49]:
rf_randomcv.best_params_

{'n_estimators': 1800,
 'min_samples_split': 8,
 'min_samples_leaf': 9,
 'max_features': 'log2',
 'max_depth': 450,
 'criterion': 'gini'}

In [50]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': [rf_randomcv.best_params_['criterion']],
    'max_depth': [rf_randomcv.best_params_['max_depth']],
    'max_features': [rf_randomcv.best_params_['max_features']],
    'min_samples_leaf': [rf_randomcv.best_params_['min_samples_leaf'], 
                         rf_randomcv.best_params_['min_samples_leaf']+2, 
                         rf_randomcv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_randomcv.best_params_['min_samples_split'] - 2,
                          rf_randomcv.best_params_['min_samples_split'] - 1,
                          rf_randomcv.best_params_['min_samples_split'], 
                          rf_randomcv.best_params_['min_samples_split'] +1,
                          rf_randomcv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators'] - 100, 
                     rf_randomcv.best_params_['n_estimators'], 
                     rf_randomcv.best_params_['n_estimators'] + 100, rf_randomcv.best_params_['n_estimators'] + 200]
}

print(param_grid)


{'criterion': ['gini'], 'max_depth': [450], 'max_features': ['log2'], 'min_samples_leaf': [9, 11, 13], 'min_samples_split': [6, 7, 8, 9, 10], 'n_estimators': [1600, 1700, 1800, 1900, 2000]}


In [55]:
# fit the grid searchcv of the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini'], 'max_depth': [450],
                         'max_features': ['log2'],
                         'min_samples_leaf': [9, 11, 13],
                         'min_samples_split': [6, 7, 8, 9, 10],
                         'n_estimators': [1600, 1700, 1800, 1900, 2000]},
             verbose=2)

In [56]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=450, max_features='log2', min_samples_leaf=9,
                       min_samples_split=6, n_estimators=1800)

In [57]:
best_grid=grid_search.best_estimator_

In [58]:
best_grid

RandomForestClassifier(max_depth=450, max_features='log2', min_samples_leaf=9,
                       min_samples_split=6, n_estimators=1800)

In [59]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[87 12]
 [27 28]]
Accuracy Score 0.7467532467532467
Classification report:               precision    recall  f1-score   support

           0       0.76      0.88      0.82        99
           1       0.70      0.51      0.59        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.74       154

