<a href="https://colab.research.google.com/github/Laura-Neff/VotingClassifier/blob/main/VotingClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the processed Pima Indians dataset from the working directory.
diabetes_data = pd.read_csv(filepath_or_buffer='PimaIndians_processed.csv')

# Preview the first ten rows to sanity-check the columns and their values.
diabetes_data.head(n=10)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,diabetes,age,test
0,-0.717427,-1.091046,-0.373655,-0.58511,-0.522842,-0.710421,-1.031876,-0.968299,0
1,-1.029213,0.466314,-2.456964,0.557421,0.100631,1.42673,5.115111,0.209585,1
2,-0.093854,-1.447941,-1.655691,0.271788,-0.573394,-0.297238,-0.797126,-0.477514,1
3,-0.40564,2.413014,-0.053146,1.50953,3.260122,-0.368477,-1.05796,2.172726,1
4,-0.717427,2.153454,-0.854419,-0.58511,5.81299,-0.425468,-0.362402,2.761668,1
5,0.529718,1.407219,0.107109,-0.965953,0.159608,-1.038117,0.185349,1.976412,1
6,-1.029213,-0.150141,1.068636,1.699951,0.623,1.811417,0.081015,0.013271,1
7,-0.717427,-0.636816,-3.258237,0.843053,-0.61552,1.455225,-0.985506,0.209585,0
8,-0.717427,-0.247476,-0.053146,0.081366,-0.505991,0.215678,0.017256,0.111428,1
9,-0.093854,0.109419,1.389146,1.128686,0.665127,0.885318,0.524433,-0.379357,0


In [None]:
# Summarize column dtypes and non-null counts to check for missing values.
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pregnant   392 non-null    float64
 1   glucose    392 non-null    float64
 2   diastolic  392 non-null    float64
 3   triceps    392 non-null    float64
 4   insulin    392 non-null    float64
 5   bmi        392 non-null    float64
 6   diabetes   392 non-null    float64
 7   age        392 non-null    float64
 8   test       392 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 27.7 KB


In [None]:
# Features: every column except the label column.
X = diabetes_data.drop(columns=['test'])

# Label: the 'test' column on its own.
Y = diabetes_data.loc[:, 'test']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Fixing random_state makes the split
# (and therefore every accuracy reported afterwards) reproducible across
# Restart-and-Run-All; without it each run is scored on a different test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Confirm the training features and labels have the same number of rows.
x_train.shape, y_train.shape

((313, 8), (313,))

In [None]:
# Confirm the held-out features and labels have the same number of rows.
x_test.shape, y_test.shape

((79, 8), (79,))

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [None]:
# Base estimators that the voting ensembles combine.
log_clf = LogisticRegression(C=1, solver='liblinear')

# gamma only parameterizes the 'rbf', 'poly' and 'sigmoid' kernels; it has no
# effect on a linear kernel, so it is omitted to avoid implying it is tuned.
svc_clf = SVC(C=1, kernel='linear')

naive_clf = GaussianNB()

## Hard voting
A hard voting classifier (`voting='hard'`) aggregates the predictions of each classifier and predicts the class that gets the most votes.

In [None]:
# Hard voting = majority vote: each base estimator casts one vote per sample.
voting_clf_hard = VotingClassifier(
    voting='hard',
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf),
        ('naive', naive_clf),
    ],
)

In [None]:
# Train the hard-voting ensemble on the training split.
voting_clf_hard.fit(X=x_train, y=y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1, solver='liblinear')),
                             ('svc', SVC(C=1, gamma='auto', kernel='linear')),
                             ('naive', GaussianNB())])

In [None]:
# Predict the held-out labels by majority vote of the base estimators.
y_pred = voting_clf_hard.predict(X=x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7848101265822784

In [None]:
# Inspect the class labels exposed by the fitted ensemble.
voting_clf_hard.classes_

array([0, 1])

In [None]:
# Put true and predicted labels side by side for a row-level comparison.
pred_results = pd.DataFrame(
    data={
        'y_test': y_test,
        'y_pred': y_pred,
    }
)

pred_results.head(n=5)

Unnamed: 0,y_test,y_pred
69,0,0
177,0,0
119,0,1
190,0,0
132,0,1


In [None]:
# Score every base estimator and the hard-voting ensemble on the same test split.
for clf_hard in (log_clf, svc_clf, naive_clf, voting_clf_hard):

    clf_hard.fit(x_train, y_train)
    # Use a loop-local name so the ensemble predictions already stored in
    # y_pred are not overwritten by each estimator in turn.
    clf_pred = clf_hard.predict(x_test)

    print(clf_hard.__class__.__name__, accuracy_score(y_test, clf_pred))

# A majority-vote ensemble often matches or beats its best member, but this is
# not guaranteed: compare the printed scores instead of assuming a winner.

LogisticRegression 0.7848101265822784
SVC 0.7848101265822784
GaussianNB 0.759493670886076
VotingClassifier 0.7848101265822784


## Soft voting
A soft voting classifier (`voting='soft'`) predicts the class with the highest predicted probability, averaged over all the individual classifiers.

All of the above classifiers provide probability estimates by default except SVC.
To use SVC in a soft-voting ensemble, it must be constructed with the extra parameter `probability=True`.

In [None]:
# Soft voting averages class probabilities, so SVC is built with
# probability=True. random_state pins the shuffling SVC uses for its
# probability estimates so repeated runs give the same result. gamma is
# omitted because it has no effect on a linear kernel.
svc_clf_soft = SVC(C=1, kernel='linear', probability=True, random_state=42)

In [None]:
# Soft voting with per-estimator weights, listed in the same order as the
# estimators: the SVC's probabilities count twice as much as each of the others.
voting_clf_soft = VotingClassifier(
    voting='soft',
    weights=[0.25, 0.5, 0.25],
    estimators=[
        ('lr', log_clf),
        ('svc', svc_clf_soft),
        ('naive', naive_clf),
    ],
)

In [None]:
# Score the base estimators alongside the weighted soft-voting ensemble.
for clf_soft in [log_clf, svc_clf_soft, naive_clf, voting_clf_soft]:

    # fit() returns the fitted estimator, so predict() can be chained onto it.
    y_pred = clf_soft.fit(x_train, y_train).predict(x_test)

    print(clf_soft.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.7848101265822784
SVC 0.7848101265822784
GaussianNB 0.759493670886076
VotingClassifier 0.7721518987341772
