# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

## Importing the dataset

In [2]:
data= pd.read_csv('pima-indians-diabetes.data',names=['preg','glu','bp','sft','ins','bmi','dpf','age','outcome'])

In [3]:
x = data.drop('outcome',axis=1)
y = data.outcome

# Train Test Split

In [4]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size =0.30,random_state=0)

# Modelling

## Linear Regression

In [5]:
LR = LogisticRegression(solver='liblinear')

In [6]:
LR.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
metrics.accuracy_score(ytest,LR.predict(xtest))

0.7835497835497836

In [8]:
cm_LR = metrics.confusion_matrix(ytest,LR.predict(xtest))
cm_LR

array([[142,  15],
       [ 35,  39]], dtype=int64)

## Naives Bayes

In [9]:
gnb = GaussianNB()

In [10]:
gnb.fit(xtrain,ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [11]:
metrics.accuracy_score(ytest,gnb.predict(xtest))

0.7619047619047619

In [12]:
cm_nb = metrics.confusion_matrix(ytest,gnb.predict(xtest))
cm_nb

array([[138,  19],
       [ 36,  38]], dtype=int64)

## Decision Tree 

In [13]:
dt = DecisionTreeClassifier()

In [14]:
parameter ={'max_depth':np.arange(1,10),'criterion':['entropy','gini']}
GS=GridSearchCV(dt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'criterion': ['entropy', 'gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [16]:
dt = DecisionTreeClassifier(criterion='entropy',max_depth=4)

In [17]:
dt.fit(xtrain,ytrain)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
metrics.accuracy_score(ytest,dt.predict(xtest))

0.7359307359307359

In [19]:
cm_dt=metrics.confusion_matrix(ytest,dt.predict(xtest))
cm_dt

array([[128,  29],
       [ 32,  42]], dtype=int64)

## Random Forest ( Bagging by nature)

In [20]:
rt = RandomForestClassifier(random_state=0)

In [21]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10),'criterion':['gini','entropy']}
GS=GridSearchCV(rt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 7}

In [23]:
rf = RandomForestClassifier(n_estimators=7,criterion='entropy',max_depth=5,random_state=0)

In [24]:
rf.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=7, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [25]:
cm_rf = metrics.confusion_matrix(ytest,rf.predict(xtest))
cm_rf

array([[145,  12],
       [ 42,  32]], dtype=int64)

# Gradient Boost

In [26]:
from sklearn.ensemble import BaggingClassifier,GradientBoostingClassifier

In [27]:
bg_LR = BaggingClassifier(base_estimator=LR,n_estimators=10,random_state=0)
bt_LR = AdaBoostClassifier(base_estimator=LR,n_estimators=150,random_state=0)

In [28]:
bg_gnb = BaggingClassifier(base_estimator=gnb,n_estimators=10,random_state=0)
bt_gnb = AdaBoostClassifier(base_estimator=gnb,n_estimators=150,random_state=0)

In [29]:
bt_dt = AdaBoostClassifier(base_estimator=dt,n_estimators=50,random_state=0)
gb_dt = GradientBoostingClassifier(n_estimators=100,random_state=0)

In [30]:
bt_rf = AdaBoostClassifier(base_estimator=rf,n_estimators=150,random_state=0)

In [31]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve,auc
kf = KFold(n_splits=5,shuffle=True,random_state=2)
for model,name in zip([LR,bg_LR,bt_LR,gnb,bg_gnb,bt_gnb,dt,bt_dt,gb_dt,rf,bt_rf],
                      ['LR','Bagg_LR','Boost_LR','gnb','Bagg_gnb','Boost_gnb','dt','Boost_dt','Grad_Boost_dt','rf','Boost_rf']):
    roc_auc=[]
    for train,test in kf.split(x,y):
        xtrain,xtest=x.iloc[train,:],x.iloc[test,:]
        ytrain,ytest=y[train],y[test]
        model.fit(xtrain,ytrain)
        y_predict = model.predict(xtest)
        cm=metrics.confusion_matrix(ytest,y_predict)
        fpr,tpr,_=roc_curve(ytest,y_predict)
        roc_auc.append(auc(fpr,tpr))
    print(' B.E: %0.02f /t/ V.E: (+/- %0.5f) [%s]' %(np.mean(roc_auc),np.var(roc_auc,ddof=1),name))

 B.E: 0.71 /t/ V.E: (+/- 0.00138) [LR]
 B.E: 0.71 /t/ V.E: (+/- 0.00178) [Bagg_LR]
 B.E: 0.70 /t/ V.E: (+/- 0.00229) [Boost_LR]
 B.E: 0.72 /t/ V.E: (+/- 0.00219) [gnb]
 B.E: 0.71 /t/ V.E: (+/- 0.00269) [Bagg_gnb]
 B.E: 0.53 /t/ V.E: (+/- 0.00399) [Boost_gnb]
 B.E: 0.73 /t/ V.E: (+/- 0.00063) [dt]
 B.E: 0.72 /t/ V.E: (+/- 0.00233) [Boost_dt]
 B.E: 0.72 /t/ V.E: (+/- 0.00198) [Grad_Boost_dt]
 B.E: 0.69 /t/ V.E: (+/- 0.00103) [rf]
 B.E: 0.69 /t/ V.E: (+/- 0.00043) [Boost_rf]


# Inference :
    Gradient Boost is only applicable for Decision tree 
    In this case it has reduced the variance error and the bias error remains the same.
    Furthur reduction in the errors can be done by scaling and feature selection.