In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

class Bagging:
    def __init__(self, t=50, depth=3):
        self.t = t
        self.depth = depth
        
    def train(self, X, y):
        self.models = [DecisionTreeClassifier(max_depth=self.depth) for _ in range(self.t)]
        print(f'Training {self.t} decision trees!')
        for i in range(self.t):
            sampled_X = X.sample(frac=1, replace=True)
            sampled_y = y[sampled_X.index]
            self.models[i].fit(sampled_X, sampled_y)
        return self.models
    
    def predict(self, X):
        preds = np.array([model.predict(X) for model in self.models])
        preds = np.sum(preds, axis=0)
        return (preds > 0) * 1
        

In [4]:
from sklearn.model_selection import train_test_split
spambase = pd.read_csv("spambase.data")

spambase_train, spambase_test = train_test_split(spambase, test_size=0.3)
spambase_train_y = spambase_train.pop('is_spam')
spambase_test_y = spambase_test.pop('is_spam')

In [5]:
test = Bagging(depth=4)
test.train(spambase_train, spambase_train_y)
spambase_bag_preds = test.predict(spambase_test)

Training 50 decision trees!


In [6]:
def compare_results(pred_y, y):
    df = pd.DataFrame({'preds': pred_y, 'labels': y})
    tp = ((df['labels'] == 1) & (df['preds'] == 1)).sum()
    tn = ((df['labels'] == 0) & (df['preds'] == 0)).sum()
    fp = ((df['labels'] == 0) & (df['preds'] == 1)).sum()
    fn = ((df['labels'] == 1) & (df['preds'] == 0)).sum()
    print(f'Accuracy: {(tp + tn) / len(y)}, Error: {(fp + fn) / len(y)}')
    print(f'TPR: {tp / (tp + fn)}, FPR: {fp / (fp + tn)}')
    return

In [7]:
compare_results(spambase_bag_preds, spambase_test_y)

Accuracy: 0.8146270818247646, Error: 0.18537291817523532
TPR: 0.9377289377289377, FPR: 0.26586826347305387


I was not able to match my AdaBoost performance (92% accuracy) with Bagging (87% accuracy), but it's not that far off.

In [19]:
from sklearn.tree import DecisionTreeRegressor

class GradientBoostedTrees:
    def __init__(self, i=10, depth=3):
        self.i = i
        self.depth = depth
        return
    
    def train(self, X, y):
        labels = np.copy(y)
        self.models = [DecisionTreeRegressor(max_depth=self.depth) for _ in range(self.i)]
        for model in self.models:
            model.fit(X, labels)
            preds = model.predict(X)
            labels = labels - preds
        return
    
    def predict(self, X):
        preds = np.array([model.predict(X) for model in self.models])
        return np.sum(preds, axis=0) 
    
    def predict_each_model(self, X):
        return np.array([model.predict(X) for model in self.models])
    
    def predict_with_models(self, X, i):
        preds = np.array([model.predict(X) for model in self.models[0:i]])
        print(preds.shape)
        return np.sum(preds, axis=0)

In [9]:
house_set_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']

house_training_df = pd.read_csv('housing_train.txt', delim_whitespace=True, names=house_set_columns)
house_testing_df = pd.read_csv('housing_test.txt', delim_whitespace=True, names=house_set_columns)

# combine dataset for normalization
house_combined_df = pd.concat([house_training_df, house_testing_df], axis=0)

# separate labels before normalization
house_combined_labels = house_combined_df.pop(house_combined_df.columns[-1])

def normalize_df(df):
    mean = df.mean()
    std = df.std() 
    return (df - mean) / std

house_combined_df = normalize_df(house_combined_df)

# undo concatenation
training_len = len(house_training_df)
house_training_df = house_combined_df.iloc[:training_len,:]
house_training_labels = house_combined_labels.iloc[:training_len]

house_testing_df = house_combined_df.iloc[training_len:,:]
house_testing_labels = house_combined_labels.iloc[training_len:]

In [21]:
test2 = GradientBoostedTrees()
test2.train(house_training_df, house_training_labels)
house_train_preds = test2.predict(house_training_df)
house_preds = test2.predict(house_testing_df)

def MSE(preds, y):
    diffs = y - preds
    diffs_squared = diffs ** 2
    return diffs_squared.mean()
    
print(f'Train MSE: {MSE(house_train_preds, house_training_labels)}')
print(f'Test MSE: {MSE(house_preds, house_testing_labels)}')

Train MSE: 3.303818791006094
Test MSE: 38.043086457812485


In [22]:
house_train_all_preds = [test2.predict_with_models(house_training_df, i) for i in range(1, 11)]
house_test_all_preds = [test2.predict_with_models(house_testing_df, i) for i in range(1, 11)]
train_mses = []
for row in house_train_all_preds:
    train_mses.append(MSE(row, house_training_labels))
test_mses = []
for row in house_test_all_preds:
    test_mses.append(MSE(row, house_testing_labels))
print(f'Individual training MSEs: {train_mses}')
print(f'Individual testing MSEs: {test_mses}')

(1, 433)
(2, 433)
(3, 433)
(4, 433)
(5, 433)
(6, 433)
(7, 433)
(8, 433)
(9, 433)
(10, 433)
(1, 74)
(2, 74)
(3, 74)
(4, 74)
(5, 74)
(6, 74)
(7, 74)
(8, 74)
(9, 74)
(10, 74)
Individual training MSEs: [15.732147242327171, 10.463519041324895, 7.976053235216596, 6.734890063434734, 5.801848522210734, 5.110293798586951, 4.87164174650844, 4.252660125406961, 3.8419434198597555, 3.303818791006094]
Individual testing MSEs: [52.282508772643745, 48.365605634244254, 48.00704499125136, 37.61864699572298, 33.18555597496225, 34.382043611725635, 37.9061049203715, 37.3464071953299, 38.60287065641471, 38.043086457812485]
