# Voting Classifier

In [1]:
from sklearn.datasets import load_iris
iris = load_iris()
X,y = iris.data[:,1:3],iris.target

In [2]:
#split the dataset into training and testing data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.30,random_state = 0)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [4]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
gnb_clf = GaussianNB()

In [5]:
voting_clf = VotingClassifier(estimators=[('lr',log_clf),('rf',rnd_clf),('gb',gnb_clf)],
                              voting = 'soft')
voting_clf.fit(X_train,y_train)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo...ne, verbose=0,
            warm_start=False)), ('gb', GaussianNB(priors=None, var_smoothing=1e-09))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [6]:
from sklearn.metrics import accuracy_score
for clf in (log_clf,rnd_clf,gnb_clf,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

LogisticRegression 0.7555555555555555
RandomForestClassifier 0.9333333333333333
GaussianNB 0.9555555555555556
VotingClassifier 0.9333333333333333




Ensemble models work best when the predictors are as independent from one another as possible.One way to get diverse classifes is to train them using very different algorithms.This increases 
the chances that they will make very differnet types of errors,improving the 
ensemble's accuracy.

Thus the voting classifier slighltly outperforms all the individual classifiers,if all the classifiers are able to estimate the class probabilities (they have predict_proba() method)then we can predict the class with the highest class probability,averaged all over the individual classifiers,this is called "SoftVoting".It oftens achieves higher performance than hardvoting because it gives more weigh to highly confident votes

One way to get a diverse set of classifiers is to use very different training algorithms as just discussed.Another approach is to use the same training algorithm for every predictor,but to train them on different random subsets of the training set.When the sampling is performed with replacement,this method is called "Bagging" [Bootstrap Aggregating] When sampling is performd without replacement,it is called as "Pasting".Thus bagging and pasting allows training instances to be sampled several times for the same predictor.

In [9]:
import pandas as pd
pima= pd.read_csv("diabetes.csv")
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [13]:
#Split in features and target variable
feature_cols = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
X= pima[feature_cols]
y = pima.Outcome

In [25]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf1 = BaggingClassifier(DecisionTreeClassifier(),n_estimators = 100,
                            max_samples= 10,bootstrap=True,oob_score = True)
#It trains an ensmeble of 100DecisionTree classifiers each trained with 10 training instances randomly sampled 
#from training set with replacement 
#n_jobs parameter tells scikit-learn the number of CPU cores to use for traning and predictions

In [21]:
#split the dataset into training and testing data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.25,random_state = 0)

In [23]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=100,bootstrap = True,
                           n_jobs = -1,oob_score = True)

In [24]:
bag_clf.fit(X_train,y_train)
bag_clf.oob_score_

0.7309027777777778

In [26]:
bag_clf1.fit(X_train,y_train)
bag_clf1.oob_score_

0.7465277777777778

In [27]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8072916666666666

In [28]:
y_pred = bag_clf1.predict(X_test)
accuracy_score(y_test,y_pred)

0.765625

According to the oob evaluation,the Bagging classifier is likely achieve more accuracy
on the test

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_breast_cancer
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [10]:
breast_cancer = load_breast_cancer()
X = pd.DataFrame(breast_cancer.data,columns = breast_cancer.feature_names)
y = pd.Categorical.from_codes(breast_cancer.target,breast_cancer.target_names)

In [11]:
y

[malignant, malignant, malignant, malignant, malignant, ..., malignant, malignant, malignant, malignant, benign]
Length: 569
Categories (2, object): [malignant, benign]

In [12]:
#Whenever we are working with categorical feature,we must encode it as numbers.
encoder = LabelEncoder()
binary_encoded_y = pd.Series(encoder.fit_transform(y))

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,binary_encoded_y,random_state =1)

In [41]:
classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2),
                                n_estimators = 200)
classifier.fit(X_train,y_train)
#SAMME.R - Stagewise Additive Modelling using a multiclass Exponential Loss function Real

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=2,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

SAMME.R relies on class probabilities rather than the predictions and generally performs better

In [42]:
predictions = classifier.predict(X_test)

In [43]:
confusion_matrix(y_test,predictions)

array([[88,  0],
       [ 4, 51]], dtype=int64)

In [44]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.972027972027972

In [63]:
#GradientBoosting
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

In [64]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

We may need to do some preprocessing

In [65]:
y_train = train_data["Survived"]
train_data.drop(labels ='Survived',axis = 1,inplace = True)

In [66]:
full_data = train_data.append(test_data)

In [67]:
drop_columns = ['Name','Age','SibSp','Ticket','Cabin','Parch','Embarked']
full_data.drop(labels = drop_columns,axis=1,inplace = True)

In [68]:
full_data

Unnamed: 0,PassengerId,Pclass,Sex,Fare
0,1,3,male,7.2500
1,2,1,female,71.2833
2,3,3,female,7.9250
3,4,1,female,53.1000
4,5,3,male,8.0500
...,...,...,...,...
413,1305,3,male,8.0500
414,1306,1,female,108.9000
415,1307,3,male,7.2500
416,1308,3,male,8.0500


In [69]:
full_data = pd.get_dummies(full_data,columns = ["Sex"])
full_data

Unnamed: 0,PassengerId,Pclass,Fare,Sex_female,Sex_male
0,1,3,7.2500,0,1
1,2,1,71.2833,1,0
2,3,3,7.9250,1,0
3,4,1,53.1000,1,0
4,5,3,8.0500,0,1
...,...,...,...,...,...
413,1305,3,8.0500,0,1
414,1306,1,108.9000,1,0
415,1307,3,7.2500,0,1
416,1308,3,8.0500,0,1


In [71]:
full_data.fillna(value = 0.0,inplace = True)

In [72]:
full_data.isnull().sum()

PassengerId    0
Pclass         0
Fare           0
Sex_female     0
Sex_male       0
dtype: int64

In [73]:
#Lets split the data
X_train = full_data.values[0:891]
X_test = full_data.values[891:]

In [76]:
#We have scaled the data into training and testing sets:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size = 0.3,random_state = 12)

We can try by setting different learning rates,so that we can compare the performance of the classifier's at different learning rates

In [80]:
lr_list =[0.05,0.075,0.1,0.25,0.5,0.75,1]
for learning_rate in lr_list:
    gb_clf=GradientBoostingClassifier(n_estimators=20,learning_rate = learning_rate,max_features=2)
    gb_clf.fit(X_train,y_train)
    
    print("Learning rate",learning_rate)
    print("Accuracy score (training):{0:.3f}".format(gb_clf.score(X_train,y_train)))
    print("Accuracy score (validation):{0:.3f}".format(gb_clf.score(X_val,y_val)))

Learning rate 0.05
Accuracy score (training):0.827
Accuracy score (validation):0.750
Learning rate 0.075
Accuracy score (training):0.839
Accuracy score (validation):0.776
Learning rate 0.1
Accuracy score (training):0.831
Accuracy score (validation):0.735
Learning rate 0.25
Accuracy score (training):0.873
Accuracy score (validation):0.776
Learning rate 0.5
Accuracy score (training):0.907
Accuracy score (validation):0.750
Learning rate 0.75
Accuracy score (training):0.904
Accuracy score (validation):0.724
Learning rate 1
Accuracy score (training):0.929
Accuracy score (validation):0.743


In [82]:
from sklearn.metrics import classification_report
gb_clf2 = GradientBoostingClassifier(n_estimators = 20,learning_rate = 0.5,
                                     max_features = 2,max_depth=2)
gb_clf2.fit(X_train,y_train)
predictions = gb_clf2.predict(X_val)
print("Confusion Matrix:")
print(confusion_matrix(y_val,predictions))
print("Classification Report:")
print(classification_report(y_val,predictions))

Confusion Matrix:
[[139  22]
 [ 45  62]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.86      0.81       161
           1       0.74      0.58      0.65       107

    accuracy                           0.75       268
   macro avg       0.75      0.72      0.73       268
weighted avg       0.75      0.75      0.74       268

