# Problem 8.5
# Implemetation of Bagging

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 0. Pre-Processing

In [2]:
def dummies_fit(df):
    dummies_dict = {}
    for column in df.columns:
        if not np.issubdtype(df[column].dtype, np.number):
            dummies_dict[column] = {}
            i = 0
            for value in set(df[column].values):
                dummies_dict[column][value] = i
                i += 1
    return dummies_dict

In [3]:
def dummies_transfer(df, dum_dict):
    for column in df.columns:
        if column in dum_dict.keys():
            for key in dum_dict[column]:
                df.loc[df[column] == key, column] = dum_dict[column][key]

# 1. Accuracy

In [4]:
# accuracy computation
def accuracy(y, pred):
    return np.sum(y == pred) / float(len(y))

# 2. Bagging

In [27]:
def bagging(X, y, num_iter, sample_size, seed):
    """This function returns each h(x) of AdaBoost as well as their weights 
   
    Input: df_train, num_iter
    Outputs: array of trees from DecisionTreeClassifier
             trees_weights array of floats
    Assumes y is {-1, 1}
    """
    trees = []
    
    
    for i in range(num_iter):
        # re-sampling training data with replacement
        index_samples = np.random.choice(X.index, sample_size)
        X_sample, y_sample = X.loc[index_samples], y.loc[index_samples]
        
        h = DecisionTreeClassifier(max_depth=1, random_state=seed) # a decision stamp
        h.fit(X_sample,y_sample) # build the basic learner
        
        trees.append(h) # store basic learner
    
    return trees

In [28]:
def bagging_predict(X, trees):
    """Given X, trees, predict Y
    """
    # X input, y output
    N, _ =  X.shape
    y = np.zeros(N)
    
    for tree in trees:
        y += tree.predict(X)
    y = [1 if a > 0 else -1 for a in y]
        
    return y

# 3. Load Data and Train

In [5]:
data = pd.read_csv('../data/data.txt').drop(['Id'], axis=1)

In [21]:
dummies_dict = dummies_fit(data)
dummies_dict['quality'] = {'bad': -1, 'good': 1}
dummies_transfer(data, dummies_dict)

In [22]:
data.head()

Unnamed: 0,color,root,sound,stripes,umbilical,touch,density,sugar,quality
0,1,1,0,0,1,0,0.697,0.46,1
1,0,1,1,0,1,0,0.744,0.376,1
2,0,1,0,0,1,0,0.634,0.264,1
3,1,1,1,0,1,0,0.608,0.318,1
4,2,1,0,0,1,0,0.556,0.215,1


In [29]:
trees = bagging(data.drop(['quality'],axis=1), data.quality, 3, 10, 0)

In [30]:
bagging_predict(data.drop(['quality'],axis=1), trees)

[1, 1, 1, 1, 1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1]

In [31]:
data.quality

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8    -1
9    -1
10   -1
11   -1
12   -1
13   -1
14   -1
15   -1
16   -1
Name: quality, dtype: int64

In [32]:
accuracy(data.quality, bagging_predict(data.drop(['quality'],axis=1), trees))

0.6470588235294118