# KAIM 2018

## Ensemble Methods - Bagging and Boosting
### Anand Subramanian

In [1]:
import numpy as np
import pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2
import warnings
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
warnings.filterwarnings("ignore", category=DeprecationWarning) 
np.random.seed(8088)

In [2]:
class Bagging(object):
    """Bootstrap-aggregated (bagged) ensemble of independently trained classifiers.

    Parameters
    ----------
    model : callable
        Zero-argument factory (typically an estimator class) returning an
        object with ``fit(x, y)`` and ``predict(x)`` methods.
    x_data : array-like, 2-D
        Training features; copied into a numpy array.
    y_data : array-like, 1-D
        Training labels; copied into a numpy array.
    """

    def __init__(self, model, x_data, y_data):
        self.x_data = np.array(x_data)
        self.y_data = np.array(y_data)
        self.N = self.x_data.shape[0]
        self.model = model

    def bagging_train(self, Num_learners = 1):
        """Fit ``Num_learners`` fresh models, each on its own bootstrap sample.

        Every learner is trained exactly once on ``N // Num_learners`` rows
        drawn with replacement from the training data (at least one row, so
        asking for more learners than samples no longer fits on empty data).
        The fitted learners are stored in ``self.learners``, replacing any
        previously trained ensemble.
        """
        self.learners = []
        if Num_learners <= 0:
            return

        sample_size = max(1, self.N // Num_learners)
        for _ in range(Num_learners):
            # Bootstrap data: sample row indices with replacement
            idx = np.random.randint(self.N, size=sample_size)

            # A single fit per learner; refitting the same estimator would
            # only overwrite the previous fit.
            learner = self.model()
            learner.fit(self.x_data[idx, :], self.y_data[idx])
            self.learners.append(learner)

    def get_predictions(self, x_test, y_test):
        """Predict by plurality vote and score the result against ``y_test``.

        Parameters
        ----------
        x_test : 2-D array-like accepted by the learners' ``predict``.
        y_test : 1-D array-like of true labels, one per row of ``x_test``.

        Returns
        -------
        preds : list
            Winning integer label per test row; ties go to the smallest label.
        test_accuracy : float
            Percentage of rows where the vote equals ``y_test`` (also stored
            in ``self.test_accuracy``).

        Raises
        ------
        ValueError
            If the ensemble contains no trained learners.
        """
        if not self.learners:
            raise ValueError("No trained learners: call bagging_train with "
                             "Num_learners >= 1 before get_predictions")

        # One column of predictions per learner
        votes = np.hstack([learner.predict(x_test).reshape(-1, 1)
                           for learner in self.learners])

        # Plurality Voting. np.bincount needs non-negative integers, so the
        # labels are shifted by +1 (enough for -1/+1 labels) and, only when
        # that is still negative, by the remaining minimum as well.
        shifted = (votes + 1).astype(int)
        offset = min(shifted.min(), 0)
        preds = [np.argmax(np.bincount(row - offset)) + offset - 1
                 for row in shifted]

        truth = np.asarray(y_test).ravel()
        matches = (truth == np.asarray(preds)).sum()
        self.test_accuracy = 100.0 * matches / truth.shape[0]
        return preds, self.test_accuracy
        
"""==================================================================================================="""
class Adaboost(object):
    """AdaBoost ensemble built from sample-weighted weak learners.

    Parameters
    ----------
    model : callable
        Estimator factory. It is called as
        ``model(max_depth=1, random_state=1)`` and the result must provide
        ``fit(x, y, sample_weight=...)`` and ``predict(x)``.
    x_data : 2-D array-like
        Training features (stored as given).
    y_data : 1-D array-like
        Training labels (stored as given). The final vote takes the sign of
        a weighted sum of predictions, so predictions must be signed numbers.
    """

    # Weighted errors are clipped to [_ERR_CLIP, 1 - _ERR_CLIP] before the
    # log-odds are taken, so a perfect (or perfectly wrong) learner yields a
    # large finite alpha instead of +/-inf and NaN sample weights.
    _ERR_CLIP = 1e-10

    def __init__(self, model, x_data, y_data):
        self.x_data = x_data
        self.y_data = y_data
        self.N = self.x_data.shape[0]
        self.weights = np.ones(self.N)/self.N
        self.eps = []
        self.alpha = []
        self.learners = []
        self.model = model

    def boost(self, Num_learners= 50):
        """Train ``Num_learners`` weak learners sequentially.

        Resets any previous ensemble, then for each round: fits a learner
        with the current sample weights, records its weighted training error
        in ``self.eps`` and its vote weight in ``self.alpha``, and re-weights
        the samples so misclassified rows count more in the next round.
        """
        self.weights = np.ones(self.N)/self.N
        self.eps = []
        self.alpha = []
        self.learners = []

        y_true = np.asarray(self.y_data)
        for _ in range(Num_learners):
            learner = self.model(max_depth = 1, random_state = 1)
            # Train the classifier on the currently weighted samples
            learner.fit(self.x_data, self.y_data, sample_weight=self.weights)

            # Boolean mask of misclassified training rows
            missed = learner.predict(self.x_data) != y_true

            # Weighted training error (raw value is what gets recorded)
            e_k = np.sum(missed * self.weights)
            self.eps.append(e_k)

            # Learner vote weight: half the log-odds of being correct
            e_safe = np.clip(e_k, self._ERR_CLIP, 1.0 - self._ERR_CLIP)
            alpha_k = 0.5*np.log((1 - e_safe) / e_safe)
            self.alpha.append(alpha_k)

            # Update the weights: scale up misses, scale down hits, renormalise
            direction = np.where(missed, 1.0, -1.0)
            self.weights = self.weights * np.exp(alpha_k*direction)
            self.weights = self.weights / np.sum(self.weights)

            # add learner to the list
            self.learners.append(learner)

    def get_predictions(self, x_test, y_test):
        """Predict with the alpha-weighted vote and score against ``y_test``.

        Returns
        -------
        y_pred : numpy.ndarray
            Sign of the alpha-weighted sum of learner predictions
            (0 where the weighted votes cancel exactly).
        test_accuracy : float
            Percentage of rows where ``y_pred`` equals ``y_test`` (also
            stored in ``self.test_accuracy``).
        """
        y_pred = np.zeros(x_test.shape[0])
        for alpha_k, learner in zip(self.alpha, self.learners):
            y_pred += alpha_k*learner.predict(x_test)

        y_pred = np.sign(y_pred)
        # calculate test accuracy
        truth = np.asarray(y_test).ravel()
        self.test_accuracy = 100.0*(truth == y_pred).sum()/truth.shape[0]
        return y_pred, self.test_accuracy

In [3]:
# Build the synthetic dataset and keep features and target in one frame so
# that a single split keeps rows and labels aligned.
x, y = make_hastie_10_2()
df = pd.DataFrame(x)
df['Y'] = y

# Split into training and test set
train, test = train_test_split(df, test_size = 0.2)

# Positional selection: every column but the last is a feature, the last
# column ('Y') is the target. `.iloc` replaces the removed `DataFrame.ix`.
X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

In [None]:
# Bagging: 70 decision trees, each fit on its own bootstrap sample
clf_tree = DecisionTreeClassifier()
ensemble = Bagging(DecisionTreeClassifier, X_train, Y_train)

ensemble.bagging_train(Num_learners=70)
y_pred, test_accuracy = ensemble.get_predictions(X_test, Y_test)
print('Test Accuracy of Bagged Decision Trees : %.4f %% ' % test_accuracy)

In [4]:
# Boosting: AdaBoost over decision trees
clf_tree = DecisionTreeClassifier(max_depth=1, random_state=1)
ensemble = Adaboost(DecisionTreeClassifier, X_train, Y_train)

# Without boosting (a single classifier) as the baseline
ensemble.boost(Num_learners=1)
y_pred, test_accuracy = ensemble.get_predictions(X_test, Y_test)
print('Test Accuracy of Decision Stump : %.4f %% ' % test_accuracy)

# With boosting (multiple classifiers)
ensemble.boost(Num_learners=400)
y_pred, test_accuracy = ensemble.get_predictions(X_test, Y_test)
print('Test Accuracy of Boosted Decision Trees : %.4f %% ' % test_accuracy)

Test Accuracy of Decision Stump : 53.7917 % 
Test Accuracy of Boosted Decision Trees : 90.5000 % 
