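# AdaBoost classifier

Train the `AdaBoostClassifier` from `AdaBoostClassifier.py` on the cleaned message data and compare spam/ham error counts with and without the type 2 penalty.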

In [17]:
# Import guys
import pandas as pd
import numpy as np
# from AdaBoostClassifier import AdaBoostClassifier
%run AdaBoostClassifier.py
from sklearn.model_selection import train_test_split



In [18]:
# Read the cleaned dataset from disk into a DataFrame.
dat = pd.read_csv(filepath_or_buffer='data/cleanData.csv')

In [19]:
# Pull out the target column before it is removed from the feature table.
labels = dat["Category"]
# Remove the target plus the 'cleaned_msg' and 'nondupe' columns, remember the
# remaining column names, then hand the features over as a plain NumPy array.
dat = dat.drop(columns=['Category', 'cleaned_msg', 'nondupe'])
columns = dat.columns
dat = dat.to_numpy()
print(labels)

0      0
1      0
2      1
3      0
4      0
      ..
315    0
316    0
317    0
318    0
319    1
Name: Category, Length: 320, dtype: int64


### Train/test split

In [20]:
n_samples = len(labels)
# shuffle=False keeps the original row order, so the final 20% of the rows
# form the test set and the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dat,
    labels,
    test_size=0.2,
    shuffle=False,
)

# Shapes of the four pieces (one per line), followed by the label dtype.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")
print(y_train.dtype)
# Share of label-1 (spam) rows in each split.
print("proportion of spam in training data:", y_train.eq(1).sum().item() / len(y_train))
print("proportion of spam in testing data:", y_test.eq(1).sum().item() / len(y_test))

(256, 47994)
(256,)
(64, 47994)
(64,)
int64
proportion of spam in training data: 0.1484375
proportion of spam in testing data: 0.15625


# Model Training + Test Set Evaluation

In [21]:
def errors(y, y_pred):
    """
    Count type 2 errors, type 1 errors and correct predictions.

    Labels are treated as 1 = spam and 0 = ham. Both inputs are converted to
    flat NumPy arrays and compared position by position, so lists, arrays and
    pandas Series (regardless of their index) are all accepted.

    Args:
        y: true labels
        y_pred: predicted labels, with the same number of elements as y

    Returns:
        A tuple (type2errors, type1errors, correct) of Python ints:
            type2errors - true label is 1 (spam) but the prediction is 0 (ham)
            type1errors - true label is 0 (ham) but the prediction is 1 (spam)
            correct     - predictions that equal the true label
        These are raw counts, not proportions; divide by len(y) for a rate.

    Raises:
        ValueError: if y and y_pred contain a different number of elements.
    """
    # Flatten to plain arrays so the comparison is always positional and never
    # depends on pandas index alignment or on accidental broadcasting.
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y and y_pred must have the same length, got {y_true.size} and {y_hat.size}"
        )

    type2errors = int(np.count_nonzero((y_true == 1) & (y_hat == 0)))
    type1errors = int(np.count_nonzero((y_true == 0) & (y_hat == 1)))
    correct = int(np.count_nonzero(y_true == y_hat))
    return type2errors, type1errors, correct

In [22]:
# Model 1: 50 estimators, learning rate 0.01, type2penalty switched off.
aboost1 = AdaBoostClassifier(
    n_estimators=50,
    lr=0.01,
    type2penalty=False,
    max_DT_depth=None,
)
aboost1.fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = aboost1.predict(X_test)
type2, type1, correct = errors(y=y_test, y_pred=predictions)
print("Accuracy: ", correct / len(predictions))
print(f'unique predictions - should be 0 and 1: {np.unique(predictions)}')
print(f'type 2 errors: {type2} \n type 1 errors: {type1}')
# Inspect the weights attribute stored on the fitted model.
print(aboost1.weights)

Accuracy:  0.953125
unique predictions - should be 0 and 1: [0. 1.]
type 2 errors: 3 
 type 1 errors: 0
[0.00384693 0.00384693 0.00392464 0.00384693 0.00384693 0.00540473
 0.00384693 0.00384693 0.00384693 0.00425151 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00400393 0.00384693 0.00384693
 0.00384693 0.00392464 0.00384693 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00400393 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693
 0.00416733 0.00384693 0.0049892  0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.0043374
 0.00384693 0.00384693 0.00609382 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693 0.00384693
 0.00384693 0.00384693 0.0

In [23]:
# Model 2: same settings as aboost1 except that type2penalty is switched on.
aboost2 = AdaBoostClassifier(
    n_estimators=50,
    lr=0.01,
    type2penalty=True,
    max_DT_depth=None,
)
aboost2.fit(X_train, y_train)

# Evaluate on the held-out test split.
predictions = aboost2.predict(X_test)
type2, type1, correct = errors(y=y_test, y_pred=predictions)
print("Accuracy: ", correct / len(predictions))
print(f'unique predictions - should be 0 and 1: {np.unique(predictions)}')
print(f'type 2 errors: {type2} \n type 1 errors: {type1}')
# Inspect the weights attribute stored on the fitted model.
print(aboost2.weights)

Accuracy:  0.953125
unique predictions - should be 0 and 1: [0. 1.]
type 2 errors: 3 
 type 1 errors: 0
[0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00451814
 0.00385011 0.00385011 0.00385011 0.00408819 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00408819 0.00385011 0.00385011
 0.00385011 0.00392789 0.00385011 0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00385011 0.00392789 0.00392789
 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011
 0.00434098 0.00385011 0.0054092  0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00385011 0.00385011 0.00408819
 0.00385011 0.00385011 0.0051971  0.00385011 0.00385011 0.00385011
 0.00385011 0.00385011 0.00385011 0.00392789 0.00385011 0.00385011
 0.00385011 0.00385011 0.

# Training Set Evaluation

In [24]:
# Re-score model 1 on the data it was fitted on, for comparison with the test numbers.
predictions = aboost1.predict(X_train)
type2, type1, correct = errors(y=y_train, y_pred=predictions)
print(
    "Model 1 (no penalty) Training set Accuracy: ",
    correct / len(predictions),
)
print(f'unique predictions - should be 0 and 1: {np.unique(predictions)}')
print(f'type 2 errors: {type2} \n type 1 errors: {type1}')

Model 1 (no penalty) Training set Accuracy:  0.96484375
unique predictions - should be 0 and 1: [0. 1.]
type 2 errors: 0 
 type 1 errors: 9


In [25]:
# Re-score model 2 on the data it was fitted on, for comparison with the test numbers.
predictions = aboost2.predict(X_train)
type2, type1, correct = errors(y=y_train, y_pred=predictions)
print(
    "Model 2 (penalty) Training set Accuracy: ",
    correct / len(predictions),
)
print(f'unique predictions - should be 0 and 1: {np.unique(predictions)}')
print(f'type 2 errors: {type2} \n type 1 errors: {type1}')

Model 2 (penalty) Training set Accuracy:  0.96484375
unique predictions - should be 0 and 1: [0. 1.]
type 2 errors: 0 
 type 1 errors: 9


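Sanity check (just for testing): fit the classifier on a tiny toy dataset whose labels are all 1 and confirm that every prediction is 1.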

In [26]:
# Three two-feature samples, every one labelled 1.
X = np.arange(1, 7).reshape(3, 2)
y = np.full(3, 1.0)

# A small ensemble is enough for this smoke test.
clf = AdaBoostClassifier(n_estimators=10, lr=0.01)
clf.fit(X=X, y=y)

# With a single class in the training labels the model should answer 1 everywhere.
predictions = clf.predict(X)
print(predictions)

[1. 1. 1.]
