### In this lab, we will be using a Voting Classifier in which the ensemble model makes the prediction by majority vote. For example, if we use three models and they predict [1, 0, 1] for the target variable, the final prediction that the ensemble model would make would be 1, since two out of the three models predicted 1.

### We will use three different models to put into our Voting Classifier: k-Nearest Neighbors (KNN), Random Forest and AdaBoost (a boosted-tree model). We will use the Scikit-learn library in Python to implement these methods and use the diabetes dataset in our example.

In [1]:
import pandas as pd
import numpy as np
# Load the diabetes dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Diabetes.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,id,preg,plas,pres,skin,insu,mass,pedi,age,Class
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Predictor columns used as model inputs; 'Class' is the target to predict
features = ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
X = df.loc[:, features]
y = df.loc[:, 'Class']

In [3]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [4]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over specified parameter values for an estimator
#GridSearchCV implements a “fit” and a “score” method
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

# Build Models

## use KNN model as the benchmark model

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
# Create a new k-nearest-neighbors classifier with default settings;
# n_neighbors is tuned by the grid search below
knn = KNeighborsClassifier()

In [7]:
# Search grid for KNN: try every n_neighbors value from 1 through 24
params_knn = dict(n_neighbors=np.arange(1, 25))

In [8]:
# Set up a 10-fold cross-validated grid search over the n_neighbors candidates
knn_gs = GridSearchCV(estimator=knn, param_grid=params_knn, cv=10)

In [9]:
# Run the cross-validated grid search over n_neighbors on the full dataset (X, y)
knn_gs.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [10]:
# Print the n_neighbors value that achieved the best mean cross-validated score
print(knn_gs.best_params_)

{'n_neighbors': 18}


In [11]:
# Build a fresh, unfitted KNN model from the n_neighbors value selected by the
# grid search (rather than hard-coding it), so this cell stays correct if the
# data or the search grid changes.
knn_best = KNeighborsClassifier(n_neighbors=int(knn_gs.best_params_['n_neighbors']))

In [12]:
# Mean 10-fold cross-validation accuracy of the best KNN model
accuracy_kNN = cross_val_score(estimator=knn_best, X=X, y=y, cv=10).mean()
accuracy_kNN

0.7554340396445658

## 1. Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# a diverse set of classifiers is created by introducing randomness in the classifier construction. 
# The prediction of the ensemble is given as the averaged prediction of the individual classifiers
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier


In [14]:
# Create a random forest classifier with default hyperparameters.
# random_state is fixed so that the randomness used while building the trees --
# and therefore every score computed from this model -- is reproducible
# after a kernel restart.
rf = RandomForestClassifier(random_state=42)

### Random Forest Parameter
#### 1. n_estimators : The number of trees in the forest. Default = 100 (the default was 10 before scikit-learn 0.22)

In [15]:
# Mean 10-fold cross-validation accuracy of the default random forest
accuracy = cross_val_score(estimator=rf, X=X, y=y, cv=10).mean()
accuracy

0.7552973342447028

In [16]:
# Search grid for the random forest: candidate numbers of trees (n_estimators)
params_rf = dict(n_estimators=[50, 100, 200])

In [17]:
# Set up a 5-fold cross-validated grid search over the n_estimators candidates
rf_gs = GridSearchCV(estimator=rf, param_grid=params_rf, cv=5)

In [18]:
# Run the grid search on the full dataset (X, y); no separate training split
# is created in this notebook
rf_gs.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [19]:
# Report the n_estimators value chosen by the grid search
print(rf_gs.best_params_)
# Keep the estimator that the grid search selected as best
rf_best = rf_gs.best_estimator_

{'n_estimators': 200}


In [20]:
# Build a fresh, unfitted random forest from the parameters selected by the
# grid search instead of a hard-coded tree count. The forest is stochastic, so
# the winning n_estimators can differ between runs; reading it from rf_gs keeps
# this cell consistent with the search result.
rf_best = RandomForestClassifier(**rf_gs.best_params_)

In [21]:
# Fit the random forest on the full dataset (X, y) so that its fitted
# attributes (feature_importances_, estimators_) can be inspected below
rf_best.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
# Mean 10-fold cross-validation accuracy of the tuned random forest
accuracy_rf = cross_val_score(estimator=rf_best, X=X, y=y, cv=10).mean()
accuracy_rf

0.7592617908407383

In [23]:
# Print the feature importances (the higher, the more important the feature).
# Values follow the column order of X, i.e. the order of the `features` list.
print(rf_best.feature_importances_)

[0.08758301 0.25752753 0.08911873 0.06818433 0.07204618 0.16664571
 0.12739444 0.13150007]


In [24]:
# The fitted forest stores one DecisionTreeClassifier per tree. Show only the
# first few instead of dumping the repr of every tree in the forest.
print(rf_best.estimators_[:3])

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1885468558, splitter='best'), DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1390933344, splitter='best'), DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=

## 2. AdaBoost Model

In [25]:
from sklearn.ensemble import AdaBoostClassifier

#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

In [26]:
# Create the AdaBoost classifier: at most 50 boosting rounds, with each
# classifier's contribution shrunk by a learning rate of 0.5
abc = AdaBoostClassifier(learning_rate=0.5, n_estimators=50)

### AdaBoost Parameter
#### 1. base_estimator: The base estimator from which the boosted ensemble is built. If None, the base estimator is DecisionTreeClassifier(max_depth=1)
#### 2. n_estimators: The maximum number of estimators at which boosting is terminated
#### 3. learning_rate: Learning rate shrinks the contribution of each classifier by learning_rate. There is a trade-off between learning_rate and n_estimators. 

In [27]:
# Mean 10-fold cross-validation accuracy of the AdaBoost model
accuracy_abc = cross_val_score(estimator=abc, X=X, y=y, cv=10).mean()
accuracy_abc

0.7632604237867395

## Voting Classifier

In [28]:
from sklearn.ensemble import VotingClassifier
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html


In [29]:
# List of (name, estimator) pairs to combine in the voting ensemble
estimators = [
    ('knn', knn_best),
    ('rf', rf_best),
    ('abc', abc),
]

In [30]:
# Combine the individual models into a single voting classifier.
# voting='hard': the final class label is chosen by majority rule over the
# class labels predicted by the member models.
ensemble = VotingClassifier(estimators=estimators, voting='hard')

In [31]:
# Fit the voting ensemble on the full dataset (X, y); no separate training
# split is created in this notebook
ensemble.fit(X, y)

VotingClassifier(estimators=[('knn',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_size=30,
                                                   metric='minkowski',
                                                   metric_params=None,
                                                   n_jobs=None, n_neighbors=18,
                                                   p=2, weights='uniform')),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=Non

In [32]:
# Mean 10-fold cross-validation accuracy of the voting ensemble
Accuracy_ensemble = cross_val_score(estimator=ensemble, X=X, y=y, cv=10).mean()
Accuracy_ensemble

0.7645933014354067