In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingCVClassifier
from sklearn.naive_bayes import GaussianNB
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings 
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv(r'C:\Users\jayac\Downloads\Dataset\diabetes_data.csv')

In [4]:
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
X=df.drop('diabetes',1)
y=df['diabetes']

In [11]:
x_train,x_test,y_train,y_test=train_test_split(X,y,random_state=100
                                              ,test_size=0.2,stratify=y)

In [12]:
df['diabetes'].value_counts()

0    500
1    268
Name: diabetes, dtype: int64

## Model

In [14]:
knn=KNeighborsClassifier()
params_knn={'n_neighbors':np.arange(1,25)}
knn_gs=GridSearchCV(knn,params_knn,cv=5)
knn_gs.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])})

In [15]:
knn_best=knn_gs.best_estimator_
knn_best

KNeighborsClassifier(n_neighbors=12)

In [19]:
rf= RandomForestClassifier()
params={'n_estimators':np.arange(5,25),
      'min_samples_split':np.arange(3,8),
       'min_samples_leaf':np.arange(3,6),
       'max_features':np.arange(0.1,1.1,0.1),
       'oob_score':[True]
       }
rfgs=GridSearchCV(rf,params,cv=5)
rfgs.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_features': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'min_samples_leaf': array([3, 4, 5]),
                         'min_samples_split': array([3, 4, 5, 6, 7]),
                         'n_estimators': array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24]),
                         'oob_score': [True]})

In [26]:
rf_best=rfgs.best_estimator_

In [38]:
rf_best

RandomForestClassifier(max_features=1.0, min_samples_leaf=5,
                       min_samples_split=7, n_estimators=15, oob_score=True)

In [23]:
lr=LogisticRegression()
lr.fit(x_train,y_train)

LogisticRegression()

## Test Accuracy

In [27]:
print('knn: {}'.format(knn_best.score(x_test,y_test)))
print('rf: {}'.format(rf_best.score(x_test,y_test)))
print('lr: {}'.format(lr.score(x_test,y_test)))

knn: 0.7467532467532467
rf: 0.8051948051948052
lr: 0.7792207792207793


In [30]:
estimators=[('knn',knn_best),('rf',rf_best),('lr',lr)]


In [36]:
#Voting CLassifiers
vc=VotingClassifier(estimators,voting='hard')
vc.fit(x_train,y_train)
vc.score(x_test,y_test)

0.7792207792207793

## Stacking

In [41]:
clf1= KNeighborsClassifier(n_neighbors=10)
clf2=RandomForestClassifier()
clf3= GaussianNB()
lr=LogisticRegression()

In [44]:
sclf=StackingCVClassifier(classifiers=[clf1,clf2,clf3],
                          meta_classifier=lr,
                          random_state=100
                         )
sclf.fit(x_train,y_train)
sclf.score(x_test,y_test)

0.8051948051948052

In [45]:
sclf=StackingCVClassifier(classifiers=[clf1,clf2,clf3],
                          meta_classifier=lr,
                          random_state=100,
                      use_probas=True
                         )
sclf.fit(x_train,y_train)
sclf.score(x_test,y_test)

0.7792207792207793

In [48]:
variance_inflation_factor(x_train.values,0)

3.3146383986776966

In [49]:
for i in range(len(x_train.columns)):
    print(variance_inflation_factor(x_train.values,i))

3.3146383986776966
16.41243550011059
14.161357815011796
4.070779240453858
2.0411098623558077
18.0534181270277
3.2310534164156697
13.603183611184328
