In [101]:
import tarfile
import os
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

I am going to try building a model with just a subset of the data. Then I will build the entire model.
This is so I can first make sure I have everything working properly before I take all the computational
power to train the model on a large dataset.

# Data Preprocessing

In [21]:
'''
I'm extracting the tar file and
grabbing the contents of everything I've extracted.
We also need to make sure the extraction file exists

'''
tar_path = 'pu_corpora_public.tar'
extract_to = 'pu_corpora_public'

if not os.path.exists(extract_to):
    os.makedirs(extract_to)

with tarfile.open(tar_path) as tar:
    tar.extractall(path=extract_to)

extracted_contents = os.listdir(extract_to)
extracted_contents

['pu_corpora_public', '.DS_Store']

In [23]:
'''

Now I am grabbing all the files
of the dataset + readme and listing them 
'''

subdirectory_path = os.path.join(extract_to, 'pu_corpora_public')
subdirectory_contents = os.listdir(subdirectory_path)
subdirectory_contents

['pu1', 'pua', 'readme.txt', 'pu2', 'pu3']

In [25]:
'''
Now I am examining the readme file

'''
readme_path = os.path.join(subdirectory_path, 'readme.txt')

with open(readme_path, 'r') as file:
    readme_contents = file.read()

print(readme_contents[:500])

This directory contains the PU1, PU2, PU3, and PUA corpora, as 
described in the paper:

I. Androutsopoulos, G. Paliouras, E. Michelakis, "Learning to 
Filter Unsolicited Commercial E-Mail", submitted for journal 
publication, 2003.

There are 4 directories (pu1, pu2, pu3, pua), each containing
one of the four corpora. 

Each one of the 4 directories in turn contains 11 subdirectories 
(part1, ..., part10, unused). These correspond to the 10 partitions 
of each corpus that were used in the 10-fo


In [29]:
'''
Now lets a look at one of the subdirectories
in on of the corpora to see what the data
formatting looks like

Notice that emails that are not spam are considered "legit"
'''


sample_corpus_path = os.path.join(subdirectory_path, 'pu1', 'part1')


sample_corpus_contents = os.listdir(sample_corpus_path)
sample_corpus_contents[:10]

['1988legit13.txt',
 '1221legit54.txt',
 '1198legit14.txt',
 '1394spmsg90.txt',
 '1110legit57.txt',
 '1545legit49.txt',
 '177spmsg68.txt',
 '1716legit2.txt',
 '1926spmsg88.txt',
 '1489legit32.txt']

In [30]:
'''
I'm going to take a look at one of the text files.
I'm writing a function to read these and better examine them

Interesting finds here:

Notice that everything is already tokenized.
This dataset has already been pre-processed to some extent

'''

def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        return file.read()
sample_file_path = os.path.join(sample_corpus_path, sample_corpus_contents[0])
sample_file_content = load_text_file(sample_file_path)
print(sample_file_content[:500])

Subject: 5573 47

3677 22660 15981 9594 5573 2130 16502 22064 15981 9594 84 19054 9594 16893 7913 1613 16502 8615 3617 1991 3677 22660 80 4695 19054 12995 8890 84 16502 9594 7602 6217 1967 16502 17157 1991 16502 15779 20385 15981 9594 80 14911 897 16993 19283 18957 3617 80 14338 180 631 1967 180 2521 14766 15820 4978 22328 84 19054 15981 9594 1847 17912 2130 1594 180 18316 20215 23479 14338 7088 6485 1835 3677 22660 84 19889 1967 19410 18065 1594 23772 2130 51 22064 132 1672 84 7423 84 14000 51 


Here is my pre-processing plan:

Since subject is in every email, we can remove the word "subject" from each line.

The text is already tokenized but I think
we need to convert text to sequences of tokens or embeddings
when working from raw text like this

Lastly:

We can determine the labels on the emails (spam/not spam) based on the file-name.


In [34]:
'''
this function will determine if a file is spam based on its filename
'''
def is_spam(filename):
    return 'spmsg' in filename


In [35]:
'''
This function will load files from a directory
then return a DataFrame with content and labels

'''

def load_data_from_directory(directory_path, sample_size_per_category=100):
    files = os.listdir(directory_path)
    contents = []
    labels = []
    
    spam_count = 0
    legit_count = 0
    
    for file in files:
        if spam_count >= sample_size_per_category and legit_count >= sample_size_per_category:
            break
        
        file_path = os.path.join(directory_path, file)
        content = load_text_file(file_path)
        
        content = re.sub(r'Subject:.*\n?', '', content, count=1)
        
        if is_spam(file) and spam_count < sample_size_per_category:
            contents.append(content)
            labels.append(1)
            spam_count += 1
        elif not is_spam(file) and legit_count < sample_size_per_category:
            contents.append(content)
            labels.append(0)
            legit_count += 1
    
    return pd.DataFrame({'content': contents, 'label': labels})

In [36]:
# Loading a balanced sample dataset
sample_data = load_data_from_directory(sample_corpus_path)
sample_data.head()

Unnamed: 0,content,label
0,\n3677 22660 15981 9594 5573 2130 16502 22064 ...,0
1,\n1791 13383 80 8962 2130 15184 17345 9131 217...,0
2,\n1835 23758 17345 16531 16502 7634 17753 2040...,0
3,\n22180 11245 14338 2649 13406 1124 47 47 47 4...,1
4,\n82 82 82 82 82 82 82 82 82 82 82 82 82 82 82...,0


In [92]:
sample_data.shape

(109, 2)


Now we want to split the data into training and test sets.

There is a problem of unequal shapes of the sequences when trying
to convert the text data directly into numpy arrays.
This is because the emails (now sequences of numbers) are unequal in length,
which is a common issue in text data processing.

We can fix this by padding the sequences so they have a uniform length.
This is a requirment for feeding them into most neural networks.
Let's find a good sequence length by looking at
the distribution of lengths in our dataset
and then we can pad the sequences.

In [56]:
'''
I'm splitting the dataset into training and validation sets,
then we convert the sets into numpy arrays for tensorflow/

'''

X_train, X_val, y_train, y_val = train_test_split(sample_data['content'], sample_data['label'], test_size=0.2, random_state=42)


X_train = np.array([np.fromstring(x, sep=' ') for x in X_train])
X_val = np.array([np.fromstring(x, sep=' ') for x in X_val])

X_train.shape, X_val.shape

  X_train = np.array([np.fromstring(x, sep=' ') for x in X_train])
  X_val = np.array([np.fromstring(x, sep=' ') for x in X_val])


((87,), (22,))

In [57]:
'''
Getting the sequence lengths then figuring out a reasonable padding length
I'm printing out the basic stats to help figure it out.

'''

sequence_lengths = [len(np.fromstring(x, sep=' ')) for x in sample_data['content']]

sequence_length_stats = {
    'mean': np.mean(sequence_lengths),
    'median': np.median(sequence_lengths),
    'max': np.max(sequence_lengths),
    'min': np.min(sequence_lengths),
    'std': np.std(sequence_lengths)
}

sequence_length_stats

{'mean': 920.183486238532,
 'median': 453.0,
 'max': 12862,
 'min': 10,
 'std': 1596.6688153096563}

In [58]:
'''
I'm going to set our sequence lengths to 500.
This is slightly above the median

'''

sequence_length = 500

X_train_padded = pad_sequences(X_train, maxlen=sequence_length, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val, maxlen=sequence_length, padding='post', truncating='post')

X_train_padded.shape, X_val_padded.shape

((87, 500), (22, 500))

In [79]:
'''
Here are the model parameters
I am adjusting this based on the
dataset's vocabulary and
the size of the vectors.

'''

vocab_size = 20000
embedding_dim = 64

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(sequence_length,)),
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

  super().__init__(**kwargs)


In [80]:
'''
Training the model on padded data

'''

history = model.fit(X_train_padded, y_train, epochs=10, validation_data=(X_val_padded, y_val))

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 160ms/step - accuracy: 0.4312 - loss: 0.6942 - val_accuracy: 0.4545 - val_loss: 0.6932
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.6091 - loss: 0.6880 - val_accuracy: 0.6364 - val_loss: 0.6849
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.6867 - loss: 0.6782 - val_accuracy: 0.6364 - val_loss: 0.6792
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.6656 - loss: 0.6637 - val_accuracy: 0.6364 - val_loss: 0.6716
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.6617 - loss: 0.6487 - val_accuracy: 0.6364 - val_loss: 0.6637
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step - accuracy: 0.7081 - loss: 0.6241 - val_accuracy: 0.6364 - val_loss: 0.6553
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━

In [82]:
'''
evaluating the model

'''

loss, accuracy = model.evaluate(X_val_padded, y_val)
print(f"Validation Accuracy: {accuracy*100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - accuracy: 0.4545 - loss: 0.8726
Validation Accuracy: 45.45%


This is model above just a sample of the data. There really isn't enough data to build a reasonable model by just
using a sample. So now I am going to use the entire dataset.

In [84]:
'''
This is a function written in order to
load all of the data within my dataset
instead of just a sample
'''

def load_all_data(directory_path):
    all_contents = []
    all_labels = []

    for corpus_dir in ['pu1', 'pu2', 'pu3', 'pua']:
        corpus_path = os.path.join(directory_path, corpus_dir)

        for part in os.listdir(corpus_path):
            part_path = os.path.join(corpus_path, part)
            
            if not os.path.isdir(part_path):
                continue
            
            for file in os.listdir(part_path):
                file_path = os.path.join(part_path, file)
                content = load_text_file(file_path)
                
                content = re.sub(r'Subject:.*\n?', '', content, count=1)

                all_contents.append(content)
                all_labels.append(1 if 'spmsg' in file else 0)


    return pd.DataFrame({'content': all_contents, 'label': all_labels})


entire_dataset = load_all_data(subdirectory_path)
entire_dataset.head(), entire_dataset.shape

(                                             content  label
 0  \n284 6818 80 284 13383 80 127 93 84 489 18798...      0
 1  \n16502 21946 7634 16893 15149 1613 16538 80 1...      0
 2  \n14338 7488 2221 20439 103 80 12116 18469 187...      0
 3  \n4822 80 16502 2410 1967 180 262 1847 12146 1...      1
 4  \n478 17188 130 20259 12808 80 196 17054 1812 ...      1,
 (7101, 2))

In [85]:
'''
Building the numpy arrays for the entire dataset

'''

X = np.array([np.fromstring(text, sep=' ') for text in entire_dataset['content']])
y = entire_dataset['label'].values

  X = np.array([np.fromstring(text, sep=' ') for text in entire_dataset['content']])


In [86]:
sequence_lengths = [len(np.fromstring(x, sep=' ')) for x in entire_dataset['content']]

sequence_length_stats = {
    'mean': np.mean(sequence_lengths),
    'median': np.median(sequence_lengths),
    'max': np.max(sequence_lengths),
    'min': np.min(sequence_lengths),
    'std': np.std(sequence_lengths)
}

sequence_length_stats

{'mean': 673.6744120546402,
 'median': 290.0,
 'max': 134874,
 'min': 1,
 'std': 2313.5984150545423}

In [88]:
'''
Using padding the ensure uniform length
Using 300, since that is slightly above the median
'''
sequence_length = 300
X_padded = pad_sequences(X, maxlen=sequence_length, padding='post', truncating='post')

In [89]:
'''
Building the
train/test split

'''

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [96]:
'''
Building the model

'''




model = Sequential([
    Embedding(input_dim=20000, output_dim=64, input_shape=(sequence_length,)),  # Adjust 'input_dim' as necessary
    LSTM(64, return_sequences=False),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.6088 - loss: 0.6588 - val_accuracy: 0.8184 - val_loss: 0.4613
Epoch 2/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.8351 - loss: 0.4329 - val_accuracy: 0.8571 - val_loss: 0.4146
Epoch 3/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.8246 - loss: 0.4350 - val_accuracy: 0.6918 - val_loss: 0.5682
Epoch 4/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - accuracy: 0.6640 - loss: 0.5998 - val_accuracy: 0.7178 - val_loss: 0.5396
Epoch 5/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.7220 - loss: 0.5121 - val_accuracy: 0.7305 - val_loss: 0.5025
Epoch 6/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - accuracy: 0.7529 - loss: 0.4291 - val_accuracy: 0.7277 - val_loss: 0.5139
Epoch 7/10
[1m1

In [98]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy*100:.2f}%")

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9313 - loss: 0.2403
Validation Accuracy: 93.10%


# Interpretation and Next Steps

 My model is showing initial strong performance. We can see this by the high accuracy and low loss on the validation set. 
 
Something I need to be careful about is potential overfitting.The model might perform better on the training data than on the validation data. We can add some techniques like dropout layers to detect this.

I'm going to try some optimization techniques now. The lowest hanging fruit here is Early Stopping.

Something else to consider: My input dimensions variable was set to 20,000. This means that the corpus of words
only contains 20,000 total different words (tokens). Considering there are 171,000 words in the english dictionary,
this might be worth looking into as something I could adjust.

In [100]:
'''
Implementing early stopping
and training the model as such
'''
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)


history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[early_stopping_callback])

Epoch 1/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 70ms/step - accuracy: 0.9521 - loss: 0.1715 - val_accuracy: 0.9170 - val_loss: 0.2744
Epoch 2/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 69ms/step - accuracy: 0.9555 - loss: 0.1534 - val_accuracy: 0.9310 - val_loss: 0.2591
Epoch 3/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 69ms/step - accuracy: 0.9669 - loss: 0.1270 - val_accuracy: 0.9092 - val_loss: 0.2909
Epoch 4/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.9321 - loss: 0.2045 - val_accuracy: 0.9071 - val_loss: 0.2962
Epoch 5/10
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 75ms/step - accuracy: 0.9701 - loss: 0.1349 - val_accuracy: 0.9134 - val_loss: 0.2703
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.


Really interesting here. We have a low amount of Epochs (10) but we are not even hitting ten iterations of the model. I have a low patience parameter (3) but I think adjusting Epoch/Patience is not the best move here. I am going to implement something a little different. I am going to try a regularization method to solve the problem of overfitting that we are finding here. I think 97% accuracy is solid but I want this on the test set as well

In [102]:
'''

Here is the model with dropout

'''

vocab_size = 20000
embedding_dim = 64
sequence_length = 300
dropout_rate = 0.5

model = Sequential([
    Input(shape=(sequence_length,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(64, return_sequences=False),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [103]:
'''
Dropout Model with Early Stopping, I am going to drop an epoch of 30,
but I doubt it will reach the 30th iteration.

'''


early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


history = model.fit(
    X_train, y_train, 
    epochs=30,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]
)

Epoch 1/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 70ms/step - accuracy: 0.5865 - loss: 0.6642 - val_accuracy: 0.6819 - val_loss: 0.6417
Epoch 2/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 70ms/step - accuracy: 0.6346 - loss: 0.6397 - val_accuracy: 0.6833 - val_loss: 0.5938
Epoch 3/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.6792 - loss: 0.5794 - val_accuracy: 0.7150 - val_loss: 0.5288
Epoch 4/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 70ms/step - accuracy: 0.7545 - loss: 0.4465 - val_accuracy: 0.7403 - val_loss: 0.5120
Epoch 5/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.7543 - loss: 0.4230 - val_accuracy: 0.7368 - val_loss: 0.4847
Epoch 6/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 71ms/step - accuracy: 0.8094 - loss: 0.3394 - val_accuracy: 0.8888 - val_loss: 0.4714
Epoch 7/30
[1m1

I'm at 91% accuracy on the validation set and my overall loss is on the down-trend. This is okay and honestly, I think the best way to improve the accuracy is going to be by getting more clean data, but we don't really have the time or resources to do all of that. Regardless, I'd like to try adjusting my learning rate as a last ditch effort to improve my model.

In [105]:
'''
I am going to try adjusting the learning rate using step decay
in my model. This approach will reduce the learning rate
by a factor every few epochs.

'''

initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.96,
    staircase=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

In [106]:
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [107]:
history = model.fit(
    X_train, y_train,
    epochs=30,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping]
)

Epoch 1/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 68ms/step - accuracy: 0.9294 - loss: 0.2268 - val_accuracy: 0.9050 - val_loss: 0.2950
Epoch 2/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 68ms/step - accuracy: 0.9311 - loss: 0.2199 - val_accuracy: 0.9191 - val_loss: 0.2505
Epoch 3/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 68ms/step - accuracy: 0.9452 - loss: 0.1715 - val_accuracy: 0.9163 - val_loss: 0.2688
Epoch 4/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 68ms/step - accuracy: 0.9429 - loss: 0.1884 - val_accuracy: 0.9120 - val_loss: 0.3185
Epoch 5/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 69ms/step - accuracy: 0.9444 - loss: 0.1854 - val_accuracy: 0.9331 - val_loss: 0.2383
Epoch 6/30
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 69ms/step - accuracy: 0.9555 - loss: 0.1516 - val_accuracy: 0.9268 - val_loss: 0.2502
Epoch 7/30
[1m1

Really happy with this so. 95% accuracy on the training set and then 93% accuracy on the test set. No significant signs of overfitting and I think given the quality and quanitity of our data, this is a very reasonable result.