### In this lab, we will be using a Voting Classifier in which the ensemble model makes the prediction by majority vote. For example, if we use three models and they predict [1, 0, 1] for the target variable, the final prediction that the ensemble model would make would be 1, since two out of the three models predicted 1.

### We will use three different models to put into our Voting Classifier: Random Forest, Boosting tree and Logistic Regression. We will use the Scikit-learn library in Python to implement these methods and use the diabetes dataset in our example.

In [1]:
import pandas as pd
import numpy as np
#read in the dataset
df = pd.read_csv('Diabetes.csv')
#take a look at the data
df.head()

Unnamed: 0,id,preg,plas,pres,skin,insu,mass,pedi,age,Class
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# define features and target
features = ['preg','plas','pres','skin','insu','mass','pedi','age']
X = df[features]
y = df['Class']

In [3]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [4]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over specified parameter values for an estimator
#GridSearchCV implements a “fit” and a “score” method
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

# Build Models

## use KNN model as the benchmark model

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
#create new a knn model
knn = KNeighborsClassifier()

In [7]:
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}

In [8]:
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=10)

In [9]:
knn_gs.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])})

In [10]:
# print n which generate best accuracy
print(knn_gs.best_params_)

{'n_neighbors': 18}


In [11]:
# using n = 18 to create our knn model
knn_best = KNeighborsClassifier(n_neighbors=18)

In [12]:
# cross validation accuray from best knn model
accuracy_kNN = cross_val_score(knn_best,X,y,cv=10).mean()
accuracy_kNN

0.7554340396445658

## 1. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# a diverse set of classifiers is created by introducing randomness in the classifier construction. 
# The prediction of the ensemble is given as the averaged prediction of the individual classifiers
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier


In [14]:
#create a new random forest classifier
rf = RandomForestClassifier()

### Random Forest Parameter
#### 1. n_estimators : The number of trees in the forest. Default = 10

In [15]:
accuracy = cross_val_score (rf, X, y, cv=10).mean()
accuracy

0.764524948735475

In [16]:
#create a dictionary of all values we want to test for n_estimators, number of trees
params_rf = {'n_estimators': [50, 100, 200]}

In [17]:
#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)

In [18]:
#fit model to training data
rf_gs.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [50, 100, 200]})

In [19]:
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)

{'n_estimators': 100}


In [20]:
# create best model of random forest in term of number of trees
rf_best = RandomForestClassifier(n_estimators = 200)

In [21]:
# fit the random forest
rf_best.fit(X,y)

RandomForestClassifier(n_estimators=200)

In [22]:
accuracy_rf = cross_val_score (rf_best, X, y, cv=10).mean()
accuracy_rf

0.7619275461380723

In [23]:
# Return the feature importances (the higher, the more important the feature)
print(rf_best.feature_importances_)

[0.08319263 0.25894012 0.09157692 0.06783479 0.07239876 0.1668392
 0.12447275 0.13474483]


In [24]:
print(rf_best.estimators_)

[DecisionTreeClassifier(max_features='auto', random_state=1097926426), DecisionTreeClassifier(max_features='auto', random_state=2087696221), DecisionTreeClassifier(max_features='auto', random_state=1269729456), DecisionTreeClassifier(max_features='auto', random_state=583263753), DecisionTreeClassifier(max_features='auto', random_state=114945964), DecisionTreeClassifier(max_features='auto', random_state=1867266078), DecisionTreeClassifier(max_features='auto', random_state=766983807), DecisionTreeClassifier(max_features='auto', random_state=1275429372), DecisionTreeClassifier(max_features='auto', random_state=2029), DecisionTreeClassifier(max_features='auto', random_state=1681690716), DecisionTreeClassifier(max_features='auto', random_state=432535698), DecisionTreeClassifier(max_features='auto', random_state=512653505), DecisionTreeClassifier(max_features='auto', random_state=1095604175), DecisionTreeClassifier(max_features='auto', random_state=203888164), DecisionTreeClassifier(max_feat

## 2. AdaBoost Model

In [27]:
from sklearn.ensemble import AdaBoostClassifier

#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

In [28]:
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)

### AdaBoost Parameter
#### 1.base_estimator: The base estimator from which the boosted ensemble is built, If None, then the base estimator is DecisionTreeClassifier(max_depth=1)
#### 2. n_estimators: The maximum number of estimators at which boosting is terminated
#### 3. learning_rate: Learning rate shrinks the contribution of each classifier by learning_rate. There is a trade-off between learning_rate and n_estimators. 

In [29]:
accuracy_abc = cross_val_score(abc,X,y,cv=10).mean()
accuracy_abc

0.7632604237867395

## Voting Classifier

In [30]:
from sklearn.ensemble import VotingClassifier
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html


In [31]:
#create a dictionary of our models
estimators=[('knn', knn_best), ('rf', rf_best), ('abc', abc)]

In [32]:
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

# If ‘hard’, uses predicted class labels for majority rule voting

In [33]:
#fit model to training data
ensemble.fit(X, y)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=18)),
                             ('rf', RandomForestClassifier(n_estimators=200)),
                             ('abc', AdaBoostClassifier(learning_rate=0.5))])

In [34]:
Accuracy_ensemble = cross_val_score (ensemble,X,y,cv=10).mean()
Accuracy_ensemble

0.7633287764866712