# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset

In [2]:
data= pd.read_csv('wine.xls',names=['cultivator','alcohol','malic_acid','ash',
                                    'alcalinity','magnesium','total_phenols','falvanoids',
                                    'nonflavanoid_phenols','proanthocyanins','color',
                                    'hue','od280','proline'])

In [3]:
data.head()

Unnamed: 0,cultivator,alcohol,malic_acid,ash,alcalinity,magnesium,total_phenols,falvanoids,nonflavanoid_phenols,proanthocyanins,color,hue,od280,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
x = data.drop('cultivator',axis=1)
y = data.cultivator

# Train Test Split

In [5]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size =0.30,random_state=0)

# Modelling

## Linear Regression

In [6]:
LR = LogisticRegression(solver='liblinear')

In [7]:
LR.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
metrics.accuracy_score(ytest,LR.predict(xtest))

0.9444444444444444

In [9]:
cm_LR = metrics.confusion_matrix(ytest,LR.predict(xtest))
cm_LR

array([[18,  1,  0],
       [ 1, 20,  1],
       [ 0,  0, 13]], dtype=int64)

In [10]:
print(metrics.classification_report(ytest,LR.predict(xtest)))

              precision    recall  f1-score   support

           1       0.95      0.95      0.95        19
           2       0.95      0.91      0.93        22
           3       0.93      1.00      0.96        13

   micro avg       0.94      0.94      0.94        54
   macro avg       0.94      0.95      0.95        54
weighted avg       0.94      0.94      0.94        54



## KNN Model

In [11]:
knn = KNeighborsClassifier()

In [12]:
parameter = {'n_neighbors':np.arange(1,15),'weights':['uniform','distance']}
GS=GridSearchCV(knn,parameter,cv=5)
GS.fit(x,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
GS.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [14]:
knn = KNeighborsClassifier(n_neighbors=1,weights='uniform',metric='euclidean')

In [15]:
knn.fit(xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [16]:
metrics.accuracy_score(ytest,knn.predict(xtest))

0.7592592592592593

In [17]:
cm_knn = metrics.confusion_matrix(ytest,knn.predict(xtest))
cm_knn

array([[16,  1,  2],
       [ 1, 18,  3],
       [ 1,  5,  7]], dtype=int64)

In [18]:
print(metrics.classification_report(ytest,knn.predict(xtest)))

              precision    recall  f1-score   support

           1       0.89      0.84      0.86        19
           2       0.75      0.82      0.78        22
           3       0.58      0.54      0.56        13

   micro avg       0.76      0.76      0.76        54
   macro avg       0.74      0.73      0.74        54
weighted avg       0.76      0.76      0.76        54



## Naives Bayes

In [19]:
gnb = GaussianNB()

In [20]:
gnb.fit(xtrain,ytrain)

GaussianNB(priors=None, var_smoothing=1e-09)

In [21]:
metrics.accuracy_score(ytest,gnb.predict(xtest))

0.9444444444444444

In [22]:
cm_nb = metrics.confusion_matrix(ytest,gnb.predict(xtest))
cm_nb

array([[19,  0,  0],
       [ 2, 19,  1],
       [ 0,  0, 13]], dtype=int64)

In [23]:
print(metrics.classification_report(ytest,gnb.predict(xtest)))

              precision    recall  f1-score   support

           1       0.90      1.00      0.95        19
           2       1.00      0.86      0.93        22
           3       0.93      1.00      0.96        13

   micro avg       0.94      0.94      0.94        54
   macro avg       0.94      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54



## Decision Tree

In [24]:
dt = DecisionTreeClassifier()

In [25]:
parameter ={'max_depth':np.arange(1,10),'criterion':['entropy','gini']}
GS=GridSearchCV(dt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'criterion': ['entropy', 'gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
GS.best_params_

{'criterion': 'gini', 'max_depth': 3}

In [27]:
dt = DecisionTreeClassifier(criterion='gini',max_depth=3)

In [28]:
dt.fit(xtrain,ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [29]:
metrics.accuracy_score(ytest,dt.predict(xtest))

0.9444444444444444

In [30]:
cm_dt=metrics.confusion_matrix(ytest,dt.predict(xtest))
cm_dt

array([[17,  2,  0],
       [ 0, 21,  1],
       [ 0,  0, 13]], dtype=int64)

In [31]:
print(metrics.classification_report(ytest,dt.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      0.89      0.94        19
           2       0.91      0.95      0.93        22
           3       0.93      1.00      0.96        13

   micro avg       0.94      0.94      0.94        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54



## Random Forest 

In [32]:
rt = RandomForestClassifier(random_state=0)

In [33]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10),'criterion':['gini','entropy']}
GS=GridSearchCV(rt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 8}

In [35]:
rf = RandomForestClassifier(n_estimators=8,criterion='entropy',max_depth=5,random_state=0)

In [36]:
rf.fit(xtrain,ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [37]:
metrics.accuracy_score(ytest,rf.predict(xtest))

1.0

In [38]:
cm_rf = metrics.confusion_matrix(ytest,rf.predict(xtest))
cm_rf

array([[19,  0,  0],
       [ 0, 22,  0],
       [ 0,  0, 13]], dtype=int64)

In [39]:
print(metrics.classification_report(ytest,rf.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        19
           2       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        13

   micro avg       1.00      1.00      1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



# Bagging Classifier

In [40]:
from sklearn.ensemble import BaggingClassifier

### Bagging Logistic Regression

In [41]:
bg_LR = BaggingClassifier(base_estimator=LR,n_estimators=10,random_state=0)
bg_LR.fit(xtrain,ytrain)
print(metrics.classification_report(ytest,bg_LR.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      0.95      0.97        19
           2       0.95      0.95      0.95        22
           3       0.93      1.00      0.96        13

   micro avg       0.96      0.96      0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.96      0.96      0.96        54



### Bagging K Nearest Neighbour

In [42]:
bg_knn = BaggingClassifier(base_estimator=knn,n_estimators=10,random_state=0)
bg_knn.fit(xtrain,ytrain)
print(metrics.classification_report(ytest,bg_knn.predict(xtest)))

              precision    recall  f1-score   support

           1       0.89      0.84      0.86        19
           2       0.69      0.82      0.75        22
           3       0.50      0.38      0.43        13

   micro avg       0.72      0.72      0.72        54
   macro avg       0.69      0.68      0.68        54
weighted avg       0.72      0.72      0.71        54



### Bagging Gaussian Bayes

In [43]:
bg_gnb = BaggingClassifier(base_estimator=gnb,n_estimators=10,random_state=0)
bg_gnb.fit(xtrain,ytrain)
print(metrics.classification_report(ytest,bg_gnb.predict(xtest)))

              precision    recall  f1-score   support

           1       0.95      1.00      0.97        19
           2       1.00      0.95      0.98        22
           3       1.00      1.00      1.00        13

   micro avg       0.98      0.98      0.98        54
   macro avg       0.98      0.98      0.98        54
weighted avg       0.98      0.98      0.98        54



### Bagging Decision Tree

In [44]:
bg_dt = BaggingClassifier(base_estimator=dt,n_estimators=10,random_state=0)
bg_dt.fit(xtrain,ytrain)
print(metrics.classification_report(ytest,bg_dt.predict(xtest)))

              precision    recall  f1-score   support

           1       1.00      0.95      0.97        19
           2       0.95      0.95      0.95        22
           3       0.93      1.00      0.96        13

   micro avg       0.96      0.96      0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.96      0.96      0.96        54



# Checking the f1 weighted scores of all the models

In [45]:
models = []
models.append(('Base LR',LR))
models.append(('Bagged LR',bg_LR))
models.append(('Base knn',knn))
models.append(('Bagged knn',bg_knn))
models.append(('Base gnb',gnb))
models.append(('Bagged gnb',bg_gnb))
models.append(('Base dt',dt))
models.append(('Bagged dt',bg_dt))
models.append(('Base rf',rf))

## K-Fold Cross Validation

In [46]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3,shuffle=True,random_state=2)
for model,name in zip([LR,bg_LR,knn,bg_knn,gnb,bg_gnb,dt,bg_dt,rf],
                      ['LR','Bagg_LR','knn','Bagg_knn','gnb','Bagg_gnb','dt','Bagg_dt','rf']):
    k=0
    recall=np.zeros((3,3))
    prec  =np.zeros((3,3))
    fscore=np.zeros((3,3))
    for train,test in kf.split(x,y):
        xtrain,xtest=x.iloc[train,:],x.iloc[test,:]
        ytrain,ytest=y[train],y[test]
        model.fit(xtrain,ytrain)
        y_predict = model.predict(xtest)
        cm=metrics.confusion_matrix(ytest,y_predict)
        for i in np.arange(0,3):
            recall[i,k]=cm[i,i]/cm[i,:].sum()
            prec[i,k]=cm[i,i]/cm[:,i].sum()
        k=k+1
    for row in np.arange(0,3):
        for col in np.arange(0,3):
            fscore[row,col]=2*(recall[row,col]*prec[row,col])/(recall[row,col]+prec[row,col])
    mean =(np.mean(fscore[0,:])+np.mean(fscore[1,:])+np.mean(fscore[2,:]))/3
    var  =((np.var(fscore[0,:],ddof=1)+np.var(fscore[1,:],ddof=1)+np.var(fscore[2,:],ddof=1))/3)
    print('f1_weighted Score: %0.02f (+/- %0.5f) [%s]' %(mean,var,name))
    

f1_weighted Score: 0.95 (+/- 0.00033) [LR]
f1_weighted Score: 0.94 (+/- 0.00156) [Bagg_LR]
f1_weighted Score: 0.72 (+/- 0.00096) [knn]
f1_weighted Score: 0.73 (+/- 0.00075) [Bagg_knn]
f1_weighted Score: 0.97 (+/- 0.00112) [gnb]
f1_weighted Score: 0.96 (+/- 0.00067) [Bagg_gnb]
f1_weighted Score: 0.87 (+/- 0.00201) [dt]
f1_weighted Score: 0.93 (+/- 0.00150) [Bagg_dt]
f1_weighted Score: 0.93 (+/- 0.00184) [rf]


# Inference :
    From the above scores we can say that Bagging the models has decreased their variance error for some models.
    Out of all models Naives Bayes model has shown good improvement in variance error with a trade off bias error