In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# reads in data
data = pd.read_csv("spambase.data", header=None)

# gets fetaures and last column as target
x = data.iloc[:, :-1]  # All columns except the last
y = data.iloc[:, -1]   # Last column

# missing values
print(data.isnull().sum())

# standardizes the features
scaler = StandardScaler()
xscale = scaler.fit_transform(x)

# splist the dtat set 90% to training and 10% to testing
xtrain, xtest, ytrain, ytest = train_test_split(xscale, y, test_size=0.1, random_state=0)


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
51    0
52    0
53    0
54    0
55    0
56    0
57    0
dtype: int64


In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np

# function trains the models and records the metrics
def train(model, xtrain, ytrain, xtest, ytest):
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    accuracy = accuracy_score(ytest, ypred)
    precision = precision_score(ytest, ypred)
    recall = recall_score(ytest, ypred)
    return [accuracy, precision, recall]

# these are the different type of models we will be evaluating
models = [
    MLPClassifier(hidden_layer_sizes=(5,), activation='logistic', max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(10, 10), activation='logistic', max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(5,), activation='relu', max_iter=1000, random_state=0),
    MLPClassifier(hidden_layer_sizes=(10, 10), activation='relu', max_iter=1000, random_state=0)
]

# results array
results = np.zeros((4, 3))

# for each model it trains and gives results
for i, model in enumerate(models):
    results[i] = train(model, xtrain, ytrain, xtest, ytest)

# results
print("Accuracy, Precision, Recall for each model:\n", results)


Accuracy, Precision, Recall for each model:
 [[0.93926247 0.94736842 0.90909091]
 [0.92624729 0.89805825 0.93434343]
 [0.93709328 0.93333333 0.91919192]
 [0.9197397  0.90452261 0.90909091]]


In [None]:
"""
Discussion:

1. Warnings:
   My code did not produce any significant warnings except for an iteration warning in some cases when I was testing for best models. 
   For example, some models may produce a convergence warning when using `MLPClassifier`. 
   This happens when the model fails to converge within the specified number of iterations (ie max_iter = 1000).
   This issue was more common in models with thats more complex (with mor ehidden layers) because these models require more iterations to converge compared to simpler ones (single layers and less neurons).
   
2. Precision vs. Recall in Spam Email Classification:
   Precision is more important than recall in a spam email learning task because precision measures how many emails classified as spam are actually spam. In this context, it's more critical to minimize false positives since users would be more frustrated if important emails were wrongly classified as spam and sent to the spam folder.
   On the other hand, recall measures how many actual spam emails are correctly classified. While recall is important, missing some spam, false negatives, is generally more acceptable than incorrectly classifying legitimate emails as spam (false positives).
   
   This is a tradeoff: If we aim to improve precision, we may reduce recall. Conversely, improving recall might reduce precision, leading to more false positives. Hence, the best model depends on how sensitive we want to be to false positives (low precision) vs. false negatives (low recall).
"""


In [13]:

bonusmodel = MLPClassifier(hidden_layer_sizes=(5,), activation='relu', max_iter=1000, learning_rate_init=0.0005, alpha=0.0001, random_state=0)

bonusmodel.fit(xtrain, ytrain)

bonusy_pred = bonusmodel.predict(xtest)

bonusmetrics = [
    accuracy_score(ytest, bonusy_pred),
    precision_score(ytest, bonusy_pred),
    recall_score(ytest, bonusy_pred)
]

# results
print("Bonus model performance: Accuracy, Precision, Recall\n", bonusmetrics)


Bonus model performance: Accuracy, Precision, Recall
 [0.9414316702819957, 0.9384615384615385, 0.9242424242424242]
