In [6]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn import svm
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
import chardet
import re
import os
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Compute error rate, alpha and w
def errorcal(y, y_pred, w_i):
    return (sum(w_i * (np.not_equal(y, y_pred)).astype(int)))/sum(w_i)

def alpha_comp(error):
    return np.log((1 - error) / error)

def weight_updater(w_i, alpha, y, y_pred):  
    return w_i * np.exp(alpha * (np.not_equal(y, y_pred)).astype(int))

In [8]:
class Boost_ada:
    
    def __init__(self):
        self.alphas = []
        self.G_M = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X, y, M = 100):
        self.alphas = [] 
        self.training_errors = []
        self.M = M

        for m in range(0, M):
            if m == 0:
                w_i = np.ones(len(y)) * 1 / len(y)
            else:
                w_i = weight_updater(w_i, alpha_m, y, y_pred)
            
            G_m = DecisionTreeClassifier(max_depth = 1)
            G_m.fit(X, y, sample_weight = w_i)
            y_pred = G_m.predict(X)
            
            self.G_M.append(G_m)

            error_m = errorcal(y, y_pred, w_i)
            self.training_errors.append(error_m)

            alpha_m = alpha_comp(error_m)
            self.alphas.append(alpha_m)

        assert len(self.G_M) == len(self.alphas)
        
    def predict(self, X):
        weak_preds = pd.DataFrame(index = range(len(X)), columns = range(self.M)) 

        for m in range(self.M):
            y_pred_m = self.G_M[m].predict(X) * self.alphas[m]
            weak_preds.iloc[:,m] = y_pred_m

        y_pred = (1 * np.sign(weak_preds.T.sum())).astype(int)

        return y_pred

In [9]:
df = pd.read_csv('spambase.data', header = None)

names = pd.read_csv('spambase.names', sep = ':', skiprows=range(0, 33), header = None)
col_names = list(names[0])
col_names.append('Spam')

df.columns = col_names

df['Spam'] = df['Spam'] * 2 - 1

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = 'Spam').values, df['Spam'].values, train_size = 3065, random_state = 2)

In [10]:
ab = Boost_ada()
ab.fit(X_train, y_train, M = 400)

y_pred = ab.predict(X_test)

temp = np.sum(np.abs(y_test-y_pred))  
print(y_test.shape)     
print((1-(temp/y_pred.shape))*100)

(1536,)
[88.80208333]
