# Spam Detection using Autoencoder-Based Learning

Using the provided dataset, this notebook trains a deep autoencoder on the email features and then uses a Random Forest classifier to detect whether or not an email should be considered spam.

Dataset: https://www.kaggle.com/datasets/balaka18/email-spam-classification-dataset-csv <br>
Citation: Biswas, Balaka. *Email Spam Classification Dataset CSV*. Kaggle (2020).

Imports

In [None]:
# Handling Data Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Evaluation Metrics Imports
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Preprocessing Imports
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder

# Import Neural Network Libraries
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.models import Model
from keras.utils import plot_model, to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as pyplot
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.models import Model
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

Load Dataset

In [None]:
# --- Load the dataset ---
# Kaggle Email Spam Classification Dataset CSV: 5172 samples, 3002 columns,
# binary target (non-spam = 0, spam = 1).
# The 'Email No.' column is only an index and carries no signal, so it is
# dropped right after reading.
dataset = pd.read_csv('emails.csv').drop(columns='Email No.')

In [None]:
# Preview the loaded frame (pandas abbreviates the display of large frames)
dataset

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,2,2,2,3,0,0,32,0,0,5,...,0,0,0,0,0,0,0,0,0,0
5168,35,27,11,2,6,5,151,4,3,23,...,0,0,0,0,0,0,0,1,0,0
5169,0,0,1,1,0,0,11,0,0,1,...,0,0,0,0,0,0,0,0,0,1
5170,2,7,1,0,2,1,28,2,0,8,...,0,0,0,0,0,0,0,1,0,1


In [None]:
# Class balance of the target column: non-spam (0) vs. spam (1)
class_counts = dataset.loc[:, 'Prediction'].value_counts()
class_counts

Prediction
0    3672
1    1500
Name: count, dtype: int64

Preprocessing Data

In [None]:
# The only categorical column is the target 'Prediction', which is already
# encoded as 0/1, so no categorical preprocessing is needed.

# Every column except the last one ('Prediction') is a feature to scale
columns_to_scale = dataset.columns[:-1]

# Scale all feature columns to [0, 1] in a single vectorised call.
# MinMaxScaler works per column, so this yields the same values as scaling
# each column separately, and `scaler` ends up fitted on every feature
# rather than only on the last one.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set min/max values leak into the training features. For an
# unbiased evaluation, fit on the training split only and apply transform()
# to the test split.
scaler = MinMaxScaler()
dataset[columns_to_scale] = scaler.fit_transform(dataset[columns_to_scale])

# Feature matrix X = all columns except the last; target vector y = 'Prediction'
X = dataset.iloc[:, :-1].values
y = dataset['Prediction'].values

Train-Test Data Split

In [None]:
# Hold out 40% of the samples for testing and keep 60% for training
# (change test_size to adjust the ratio; random_state pins the shuffle).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

Deep Autoencoder Architecture + Training

In [None]:
# Deep autoencoder: 5 hidden layers between the input layer and the output layer.
# It is trained to reconstruct its own input so that the 500-unit bottleneck
# learns a compressed representation of each email (can be repurposed for
# other datasets).

# Number of input features, taken from the data instead of being hard-coded
# (for this dataset: 3002 columns - 'Email No.' - 'Prediction' = 3000 features)
n_inputs = X_train.shape[1]

# Define the input layer
visible = Input(shape=(n_inputs,))

# Encoder: progressively compress the input
e = Dense(1500, activation='relu')(visible)  # Hidden Layer 1
e = Dense(1000, activation='selu')(e)  # Hidden Layer 2

# Latent space: the learned features used by the downstream classifier
bottleneck = Dense(500, activation='relu')(e)  # Hidden Layer 3 (Latent Space)

# Decoder: mirror of the encoder
d = Dense(1000, activation='selu')(bottleneck)  # Hidden Layer 4
d = Dense(1500, activation='relu')(d)  # Hidden Layer 5

# Output layer reconstructs the input, so it must have one unit per input feature
output = Dense(n_inputs, activation='linear')(d)

# Full autoencoder (input -> reconstruction), trained with mean squared error
model = Model(inputs=visible, outputs=output)
model.compile(optimizer='adam', loss='mse')

# Early stopping and learning-rate reduction, both driven by validation loss.
# With epochs=4 below, patience=7 can never trigger; it only takes effect if
# the number of epochs is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=0.0001)

# Fit the autoencoder: the target is the input itself (X_train -> X_train)
history = model.fit(X_train, X_train, epochs=4, batch_size=32, verbose=2,
                    validation_split=0.1, callbacks=[early_stopping, reduce_lr])

# Diagram of the full autoencoder architecture
plot_model(model, 'autoencoder.png', show_shapes=True)

# Encoder half only (input -> bottleneck); it shares the trained weights.
# The classifier cell loads this file and calls predict() to encode the data,
# so the saved model must output the 500-dim latent features rather than the
# reconstruction of the input.
neural_network = Model(inputs=visible, outputs=bottleneck)

# Save in Keras format. The file name is kept unchanged because the classifier
# cell loads 'autoencoder_model.keras'.
neural_network.save('autoencoder_model.keras')

Epoch 1/4
88/88 - 17s - loss: 0.0017 - val_loss: 0.0020 - lr: 0.0010 - 17s/epoch - 193ms/step
Epoch 2/4
88/88 - 14s - loss: 0.0017 - val_loss: 0.0018 - lr: 0.0010 - 14s/epoch - 160ms/step
Epoch 3/4
88/88 - 14s - loss: 0.0017 - val_loss: 0.0017 - lr: 0.0010 - 14s/epoch - 159ms/step
Epoch 4/4
88/88 - 14s - loss: 0.0017 - val_loss: 0.0018 - lr: 0.0010 - 14s/epoch - 159ms/step


In [None]:
# Random Forest ensemble for the binary spam / non-spam decision.
# The forest is trained and evaluated on the output of the saved Keras model
# (loaded below), not on the raw scaled features.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score
from keras.models import load_model

# Load the model saved by the autoencoder cell
encoder = load_model('autoencoder_model.keras')

# Encode the training and testing data
X_train_encoded = encoder.predict(X_train)
X_test_encoded = encoder.predict(X_test)

# Create a RandomForestClassifier (random_state fixed for reproducibility)
clf = RandomForestClassifier(n_estimators=50, random_state=42)

# Perform k-fold cross-validation on the training set (k = 2; raise cv for a
# more stable estimate at the cost of run time)
cv_scores = cross_val_score(clf, X_train_encoded, y_train, cv=2)

# Train the model on the entire training set
clf.fit(X_train_encoded, y_train)

# Evaluate the model on the test set
y_pred = clf.predict(X_test_encoded)

# Print the cross-validation scores
print("Cross-Validation Scores:")
print(cv_scores)
print("\nMean CV Score:", cv_scores.mean(),"\n")

# Print classification report and confusion matrix on the test set
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred), "\n")

# Calculate AUC from the predicted probability of the spam class (label 1).
# The probabilities must come from the same encoded features the forest was
# trained on; scoring the raw X_test gives a meaningless AUC.
y_prob = clf.predict_proba(X_test_encoded)[:, 1]
auc_score = roc_auc_score(y_test, y_prob)
print("AUC Score:", auc_score)

# Calculate MCC
mcc_score = matthews_corrcoef(y_test, y_pred)
print("MCC Score:", mcc_score)

Cross-Validation Scores:
[0.77770619 0.76337847]

Mean CV Score: 0.7705423255365678 

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1459
           1       0.70      0.53      0.60       610

    accuracy                           0.80      2069
   macro avg       0.76      0.72      0.73      2069
weighted avg       0.79      0.80      0.79      2069

Confusion Matrix:
[[1321  138]
 [ 286  324]] 

AUC Score: 0.4647170192923516
MCC Score: 0.47798005530466897
