# Problem 8.3
# Implemetation of AdaBoost

In [21]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 0. Pre-Processing

In [5]:
def dummies_fit(df):
    dummies_dict = {}
    for column in df.columns:
        if not np.issubdtype(df[column].dtype, np.number):
            dummies_dict[column] = {}
            i = 0
            for value in set(df[column].values):
                dummies_dict[column][value] = i
                i += 1
    return dummies_dict

In [6]:
def dummies_transfer(df, dum_dict):
    for column in df.columns:
        if column in dum_dict.keys():
            for key in dum_dict[column]:
                df.loc[df[column] == key, column] = dum_dict[column][key]

# 1. Accuracy

In [2]:
# accuracy computation
def accuracy(y, pred):
    return np.sum(y == pred) / float(len(y))

# 2. AdaBoost

In [63]:
def adaboost(X, y, num_iter):
    """This function returns each h(x) of AdaBoost as well as their weights 
   
    Input: df_train, num_iter
    Outputs: array of trees from DecisionTreeClassifier
             trees_weights array of floats
    Assumes y is {-1, 1}
    """
    trees = []
    trees_weights = [] 
    N, _ = X.shape
    d = np.ones(N) / N
    
    for i in range(num_iter):
        h = DecisionTreeClassifier(max_depth=200, random_state=0) # a fully-grown tree
        h.fit(X,y, sample_weight = d) # build the basic learner
        error = 1- h.score(X,y,sample_weight=d) # calculate weighted error rate
        if error > 0:
            weight = np.log((1-error)/error) # calculate alpha
        else : weight = np.inf 
        trees.append(h) # store basic learner
        trees_weights.append(weight) # store alpha
        
        # update weights
        prediction = h.predict(X)
        d = [d[i] if prediction[i] == y.iloc[i] else d[i]*np.exp(weight) for i in range(len(d))]
    
    return trees, trees_weights

In [59]:
def adaboost_predict(X, trees, trees_weights):
    """Given X, trees and weights predict Y
    """
    # X input, y output
    N, _ =  X.shape
    y = np.zeros(N)
    
    for i in range(len(trees)):
        tree, weight = trees[i], trees_weights[i]
        y += weight*tree.predict(X)
    y = [1 if a > 0 else -1 for a in y]
        
    return y

# 3. Load Data and Train

In [3]:
data = pd.read_csv('../data/data.txt').drop(['Id'], axis=1)

In [24]:
data_tr = data.iloc[[0,1,2,3,5,6,9,12,13,14,15,16]]

dummies_dict = dummies_fit(data_tr)
dummies_dict['quality'] = {'bad': -1, 'good': 1}
dummies_transfer(data_tr, dummies_dict)

data_test = data.drop(data_tr.index)
dummies_transfer(data_test, dummies_dict)
data_test_x, data_test_y = data_test.drop(['quality'],axis=1), data_test.quality

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [26]:
data_tr.head()

Unnamed: 0,color,root,sound,stripes,umbilical,touch,density,sugar,quality
0,1,1,0,0,1,0,0.697,0.46,1
1,0,1,1,0,1,0,0.744,0.376,1
2,0,1,0,0,1,0,0.634,0.264,1
3,1,1,1,0,1,0,0.608,0.318,1
5,1,0,0,0,2,1,0.403,0.237,1


In [64]:
trees, weights = adaboost(data_tr.drop(['quality'],axis=1), data_tr.quality, 5)

In [65]:
adaboost_predict(data_test_x, trees, weights)

[-1, -1, -1, -1, 1]

In [66]:
data_test_y

4     1
7     1
8    -1
10   -1
11   -1
Name: quality, dtype: int64

In [67]:
accuracy(data_test_y, adaboost_predict(data_test_x, trees, weights))

0.40000000000000002