In [1]:
# Import guys
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# column names
with open('data/column_names.txt', 'r') as f:
    column_names = [line.strip() for line in f]

sparse_dat = sparse.load_npz("data/sparse_df.npz")

# Extract labels from the first column
labels = sparse_dat[:, 0]

# Create a list of column indices to keep
to_keep = list(set(range(sparse_dat.shape[1])) - set([0]))

# Extract the design matrix
X = sparse_dat[:, to_keep]

In [3]:
# To do - stratify the split 
n_samples = labels.shape[0]
# Use train_test_split.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, shuffle=False)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train.dtype)
print("proportion of spam in training data:", (y_train == 1).sum().item() / y_train.shape[0])
print("proportion of spam in testing data:", (y_test == 1).sum().item() / y_test.shape[0])

(14524, 56213)
(14524, 1)
(3632, 56213)
(3632, 1)
int64
proportion of spam in training data: 0.39568989259157256
proportion of spam in testing data: 0.11921806167400881


In [5]:
def errors(y, y_pred):
        """
        Calculate the proportion of type 2 errors - when the true label is 1 - spam, and the predicted label is 0 - ham

        Args:
        y: true labels
        y_pred: predicted labels
        """
        n = y.shape[0]
        type2errors = ((y == 1) & (y_pred == 0)).sum().item()
        type1errors = ((y == 0) & (y_pred == 1)).sum().item()
        correct = (y_pred == y).sum().item()
        return type2errors, type1errors, correct

In [7]:
param_grid = \
{'maxDTdepth': [5], 'pen_factor': [0.1, 0.3, 0.5, 0.7, 1.2, 1.5, 1.7, 2]}

In [8]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# 3 dim arrays: array[a][b][c], a - fold, b - depth, c - penalty factor (first element will be no penalty)
all_fold_val_accuracies = []
all_fold_errors = []

fold = 0
for train_index, test_index in kf.split(X_train):
    # each key will be a depth
    all_depth_val_accuracies = {}
    all_depth_errors = {}

    print(f"-------------------FOLD {fold}-------------------")
    
    for maxDTdepth in param_grid['maxDTdepth']:
        all_pen_val_accuracies = {}
        all_pen_errors = {}
        %run AdaBoostWeakClassic.py
        aboost = AdaBoostWeakClassic(type2penalty = False, rounds = 200, maxDTdepth = maxDTdepth)
        aboost.fit(X = X_train[train_index], y = y_train.toarray().ravel()[train_index])

        # training reports
        predictions = aboost.predict(X_train[train_index])
        type2, type1, correct = errors(y_train.toarray().ravel()[train_index], predictions)
        print(f"Model without penalty | rounds: 200 | maxDTdepth: {maxDTdepth}")
        print("Training Accuracy: ", correct/len(predictions))
        print(f'type 2 errors: {type2} \ntype 1 errors: {type1}')

        # testing reports
        predictions = aboost.predict(X_train[test_index])
        type2, type1, correct = errors(y_train.toarray().ravel()[test_index], predictions)
        print("Test Accuracy: ", correct/len(predictions))
        print(f'type 2 errors: {type2} \ntype 1 errors: {type1}\n')

        all_pen_val_accuracies['none'] = correct/len(predictions)
        all_pen_errors['none'] = (type2, type1)
        
        for pen_factor in param_grid['pen_factor']:
            %run AdaBoostWeakClassic.py
            aboost = AdaBoostWeakClassic(type2penalty = True, rounds = 200, maxDTdepth = maxDTdepth, pen_factor=pen_factor)
            aboost.fit(X = X_train[train_index], y = y_train.toarray().ravel()[train_index])
    
            # training reports
            predictions = aboost.predict(X_train[train_index])
            type2, type1, correct = errors(y_train.toarray().ravel()[train_index], predictions)
            print(f"Model with penalty, {pen_factor} | rounds: 200 | maxDTdepth: {maxDTdepth}")
            print("Training Accuracy: ", correct/len(predictions))
            print(f'type 2 errors: {type2} \ntype 1 errors: {type1}')
    
            # testing reports
            predictions = aboost.predict(X_train[test_index])
            type2, type1, correct = errors(y_train.toarray().ravel()[test_index], predictions)
            print("Test Accuracy: ", correct/len(predictions))
            print(f'type 2 errors: {type2} \ntype 1 errors: {type1}\n')

            all_pen_val_accuracies[pen_factor] = correct/len(predictions)
            all_pen_errors[pen_factor] = (type2, type1)

        all_depth_val_accuracies[maxDTdepth] = all_pen_val_accuracies
        all_depth_errors[maxDTdepth] = all_pen_errors

    all_fold_val_accuracies.append(all_depth_val_accuracies)
    all_fold_errors.append(all_depth_errors)
    fold += 1

-------------------FOLD 0-------------------
Model without penalty | rounds: 200 | maxDTdepth: 5
Training Accuracy:  0.9985311668043698
type 2 errors: 0 
type 1 errors: 16
Test Accuracy:  0.9394106306802533
type 2 errors: 58 
type 1 errors: 162

Model with penalty, 0.1 | rounds: 200 | maxDTdepth: 5
Training Accuracy:  0.9985311668043698
type 2 errors: 0 
type 1 errors: 16
Test Accuracy:  0.944643348939686
type 2 errors: 93 
type 1 errors: 108

Model with penalty, 0.3 | rounds: 200 | maxDTdepth: 5
Training Accuracy:  0.9985311668043698
type 2 errors: 0 
type 1 errors: 16
Test Accuracy:  0.9383090057835307
type 2 errors: 87 
type 1 errors: 137

Model with penalty, 0.5 | rounds: 200 | maxDTdepth: 5
Training Accuracy:  0.9985311668043698
type 2 errors: 0 
type 1 errors: 16
Test Accuracy:  0.9410630680253373
type 2 errors: 74 
type 1 errors: 140

Model with penalty, 0.7 | rounds: 200 | maxDTdepth: 5
Training Accuracy:  0.9953180941889287
type 2 errors: 0 
type 1 errors: 51
Test Accuracy:  0

KeyboardInterrupt: 

In [None]:
# Initialize a dictionary to store the sum of accuracies and count for each model configuration
model_performance = {}

# Iterate through each fold's performance data
for fold_accuracies, fold_errors in zip(all_fold_val_accuracies, all_fold_errors):
    for depth, penalties in fold_accuracies.items():
        for penalty, accuracy in penalties.items():
            # Create a unique key for each model configuration
            type2_errors = fold_errors[depth][penalty][0]
            key = (depth, penalty)
            if key not in model_performance:
                model_performance[key] = {'total_accuracy': 0, 'type2errors': 0, 'count': 0}
            # Sum the accuracies and count the occurrences for each model configuration
            model_performance[key]['total_accuracy'] += accuracy
            model_performance[key]['type2errors'] += type2_errors
            model_performance[key]['count'] += 1

# Calculate the average accuracy for each model configuration
for key, value in model_performance.items():
    model_performance[key]['average_accuracy'] = value['total_accuracy'] / value['count']
    model_performance[key]['type2errors'] = value['type2errors'] / value['count']

# Find the best model configuration based on the highest average accuracy
best_model_key = max(model_performance, key=lambda k: model_performance[k]['average_accuracy'])
best_model_depth, best_model_penalty = best_model_key
best_model_average_accuracy = model_performance[best_model_key]['average_accuracy']

print("Best Model based on validation accuracy across folds:")
print(f"Best model depth: {best_model_depth}")
print(f"Best model penalty factor: {best_model_penalty}")
print(f"Best model average accuracy across folds: {best_model_average_accuracy}")
print(f"Best model average type 2 errors across folds: {model_performance[best_model_key]['type2errors']}")

best_model_key = min(model_performance, key=lambda k: model_performance[k]['type2errors'])
best_model_depth, best_model_penalty = best_model_key
best_model_average_accuracy = model_performance[best_model_key]['average_accuracy']

print("Best Model based on smallest average number of type 2 errors across folds:")
print(f"Best model depth: {best_model_depth}")
print(f"Best model penalty factor: {best_model_penalty}")
print(f"Best model average accuracy across folds: {best_model_average_accuracy}")
print(f"Best model average type 2 errors across folds: {model_performance[best_model_key]['type2errors']}")

In [None]:
import matplotlib.pyplot as plt

# Extract penalty factors, type 2 errors, and total accuracy
penalties = [pen for (_, pen) in model_performance.keys()]
type2errors = [model_performance[(depth, pen)]['type2errors'] for depth, pen in model_performance]
total_accuracy = [model_performance[(depth, pen)]['average_accuracy'] for depth, pen in model_performance]

# Plotting

fig,ax = plt.subplots()
ax.plot(penalties, type2errors, color='red', label='Type 2 Errors')
ax.set_ylabel('Type 2 Errors', color='red', fontsize=16)

ax.set_xlabel('Penalty Factor', fontsize=14)

ax2 = ax.twinx()

ax2.plot(penalties, total_accuracy, color='blue', label='Total Accuracy')
ax2.set_ylabel('Total Accuracy', color='blue', fontsize=16)

# plt.figure(figsize=(10, 5))
# plt.plot(penalties, type2errors, color='red', label='Type 2 Errors')
# plt.plot(penalties, total_accuracy, color='blue', label='Total Accuracy')

# Labeling the axes

plt.title('Model Performance', fontsize=18)
plt.show()