In [47]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [48]:
df = pd.read_csv('diabetes.csv')

In [49]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [50]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [51]:
x = df.drop(columns='Outcome')
y = df['Outcome']

In [52]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

x = scale.fit_transform(x)

In [53]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.2,random_state=42)

In [54]:
feature_cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin',
                'BMI','DiabetesPedigreeFunction','Age']

x_train = pd.DataFrame(x_train, columns=feature_cols)
x_test = pd.DataFrame(x_test, columns=feature_cols)

from sklearn.impute import SimpleImputer
cols_with_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']

for col in cols_with_zero:
    x_train[col] = x_train[col].replace(0, np.nan)
    x_test[col] = x_test[col].replace(0, np.nan)



imp = SimpleImputer(strategy='median')
x_train = pd.DataFrame(imp.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(imp.transform(x_test), columns=x_test.columns)


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

m = LogisticRegression()

m.fit(x_train,y_train)

yy = m.predict(x_test)

accuracy_score(y_test,yy)

0.7532467532467533

In [56]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix

rf = RandomForestClassifier(random_state=42)

param = {
    'n_estimators' : np.arange(20,300,30),
    'max_depth' : np.arange(3,40,30),
    'criterion' : ['gini','entropy']
}

search  = RandomizedSearchCV(
    rf,
    param,
    scoring ='f1',
    n_iter=40,
    n_jobs=1,
    random_state=42
)

In [57]:
search.fit(x_train,y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_distributions,"{'criterion': ['gini', 'entropy'], 'max_depth': array([ 3, 33]), 'n_estimators': array([ 20, ...30, 260, 290])}"
,n_iter,40
,scoring,'f1'
,n_jobs,1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,n_estimators,np.int64(290)
,criterion,'gini'
,max_depth,np.int64(33)
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [58]:
search.best_score_

np.float64(0.6617674035606524)

In [59]:
model = search.best_estimator_

In [60]:
y_pred = model.predict(x_test)

In [61]:
accuracy_score(y_test,y_pred)

0.7402597402597403

In [62]:
from sklearn.svm import SVC

svm = SVC(kernel='linear')

svm.fit(x_train,y_train)

y_pre = svm.predict(x_test)

accuracy_score(y_test,y_pre)

0.7597402597402597

In [63]:
from sklearn.ensemble import AdaBoostClassifier

ada  = AdaBoostClassifier()

ada.fit(x_train,y_train)

y_pree = ada.predict(x_test)

accuracy_score(y_test,y_pree)


0.7792207792207793

In [65]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

ada  = AdaBoostClassifier()

parameters = {
    'n_estimators': np.arange(75,200,15),
    'learning_rate': np.arange(.1,2,.3)
}

grid = GridSearchCV(
    ada,
    parameters,
    n_jobs=1,
    scoring='accuracy',
    cv=5
)

grid.fit(x_train,y_train)

0,1,2
,estimator,AdaBoostClassifier()
,param_grid,"{'learning_rate': array([0.1, 0....3, 1.6, 1.9]), 'n_estimators': array([ 75, ...65, 180, 195])}"
,scoring,'accuracy'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,estimator,
,n_estimators,np.int64(180)
,learning_rate,np.float64(1.3000000000000003)
,algorithm,'deprecated'
,random_state,


In [66]:
display(f'Best Training Score: {grid.best_score_}')
b_model = grid.best_estimator_
yb = b_model.predict(x_test)
display(f'Test Score: {accuracy_score(y_test,yb)}')

'Best Training Score: 0.7833933093429295'

'Test Score: 0.7597402597402597'

In [67]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.5, 1.0, 1.5],
}

grid = GridSearchCV(
    AdaBoostClassifier(),
    params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

grid.fit(x_train, y_train)

print(grid.best_params_)
print(grid.best_score_)


{'learning_rate': 1.0, 'n_estimators': 100}
0.7817939490870318
