In [33]:
import zipfile
import os
import numpy as np
import pandas as pd
from email import parser
from collections import Counter
from sklearn.model_selection import train_test_split
import re

# Path to the uploaded zip file
dataset_zip_path = '/content/trec06p-cs280.zip'  # Update this path based on upload
dataset_extraction_dir = '/content/trec06p-cs280'

# Unzip the dataset
with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_extraction_dir)

# Verify the extraction
print(os.listdir(dataset_extraction_dir))


['__MACOSX', 'trec06p-cs280']


In [34]:
# Mount Google Drive (optional if storing large datasets)
from google.colab import drive
drive.mount('/content/drive')

# Set paths for dataset and stop words
DATASET_PATH = '/content/trec06p-cs280'  # Change if you upload it elsewhere
STOP_WORDS_PATH = '/content/stop_words.txt'

# Load stop words
with open(STOP_WORDS_PATH, 'r') as f:
    stop_words = set(f.read().splitlines())

# Helper function to clean text
def clean_email(email_text):
    # Remove special characters, numbers, and extra spaces
    email_text = re.sub(r'[^\w\s]', '', email_text)
    email_text = re.sub(r'\d+', '', email_text)
    # Tokenize and remove stop words
    tokens = email_text.lower().split()
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Explore dataset folder structure
os.listdir(DATASET_PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['__MACOSX', 'trec06p-cs280']

In [35]:
# Define the path to the dataset
dataset_extraction_dir = '/content/trec06p-cs280'
# Path to the dataset folder
trec06p_dir = '/content/trec06p-cs280/trec06p-cs280'

# List the contents of the extracted dataset directory
print(os.listdir(dataset_extraction_dir))

# # Check for the `full` directory
# if 'full' in os.listdir(dataset_extraction_dir):
#     print("Contents of 'full' directory:")
#     print(os.listdir(os.path.join(dataset_extraction_dir, 'full')))
# else:
#     print("'full' directory not found.")

# List the contents of the main dataset folder
print(os.listdir(trec06p_dir))

# # Recursively explore the dataset folder
# for root, dirs, files in os.walk(trec06p_dir):
#     print(f"Directory: {root}")
#     print(f"Files: {files}")
#     print(f"Subdirectories: {dirs}")
#     print("-" * 40)

labels_file = '/content/trec06p-cs280/trec06p-cs280/labels'

# Check the content of the labels file
with open(labels_file, 'r') as f:
    for _ in range(5):  # Print the first 5 lines
        print(f.readline())


['__MACOSX', 'trec06p-cs280']
['labels', '.DS_Store', 'README.rtf', 'data']
ham ../data/000/000

spam ../data/000/001

spam ../data/000/002

ham ../data/000/003

spam ../data/000/004



In [36]:
email_data = []
data_dir = '/content/trec06p-cs280/trec06p-cs280/data'

with open(labels_file, 'r') as f:
    for line in f:
        label, relative_path = line.strip().split(' ')
        label = 1 if label == 'spam' else 0  # 1 for spam, 0 for ham
        full_path = os.path.join(data_dir, relative_path)
        email_data.append((label, full_path))

# Check the first few entries
print(email_data[:5])


[(0, '/content/trec06p-cs280/trec06p-cs280/data/../data/000/000'), (1, '/content/trec06p-cs280/trec06p-cs280/data/../data/000/001'), (1, '/content/trec06p-cs280/trec06p-cs280/data/../data/000/002'), (0, '/content/trec06p-cs280/trec06p-cs280/data/../data/000/003'), (1, '/content/trec06p-cs280/trec06p-cs280/data/../data/000/004')]


In [64]:
def clean_email(email_text, remove_stopwords=False):
    """Clean email text by removing punctuation, numbers, and optionally stop words."""
    # Remove special characters and numbers
    email_text = re.sub(r'[^\w\s]', '', email_text)  # Remove punctuation
    email_text = re.sub(r'\d+', '', email_text)      # Remove numbers
    # Tokenize the text
    tokens = email_text.lower().split()
    # Remove stop words if flag is True
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Modified function: does not remove stop words
def clean_email_no_stopwords(email_text):
    # Remove special characters, numbers, and extra spaces
    email_text = re.sub(r'[^\\w\\s]', '', email_text)
    email_text = re.sub(r'\\d+', '', email_text)
    # Tokenize without removing stop words
    tokens = email_text.lower().split()
    return tokens

In [65]:
# Parse emails
emails = []
labels = []
for label, path in email_data:
    try:
        with open(path, 'r', errors='ignore') as f:
            msg = parser.Parser().parse(f)

            # Check if the email has a payload
            if msg.is_multipart():
                email_text = ' '.join(
                    part.get_payload(decode=True).decode(errors='ignore') if part.get_payload(decode=True) else ''
                    for part in msg.get_payload()
                )
            else:
                # Handle non-multipart emails
                email_text = msg.get_payload(decode=True)
                email_text = email_text.decode(errors='ignore') if email_text else ''

            # Only clean and store non-empty emails
            if email_text:
                emails.append(clean_email(email_text))
                labels.append(label)
            else:
                print(f"Skipping empty email at {path}")

    except Exception as e:
        print(f"Error reading email at {path}: {e}")

# Check a sample
print(emails[:1], labels[:1])

Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/002/136
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/003/273
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/008/093
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/008/192
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/010/046
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/010/070
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/018/229
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/019/268
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/021/062
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/022/013
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/024/087
Skipping empty email at /content/trec06p-cs280/trec06p-cs280/data/../data/025/218
Skipping empty e

In [95]:
# Flatten the list of tokenized emails and count word frequencies
word_counts = Counter(word for email in emails for word in email)

#Extract the 10,000 most common words
common_words = [word for word, count in word_counts.most_common(1000)]

# # Extract words with count > k (e.g., k = 100)
# k = 100
# common_words = [word for word, count in word_counts.items() if count > k]

print(f"Top 10 words: {common_words[:1000]}")
print(f"Vocabulary size: {len(common_words)}")


Top 10 words: ['the', 'to', 'a', 'and', 'of', 'div', 'in', 'i', 'is', 'for', 'you', 'td', 'that', 'px', 'this', 'br', 'with', 'it', 'from', 'on', 'tr', 'be', 'width', 'size', 'by', 'are', 'strong', 'have', 'as', 'at', 'your', 'p', 'or', 'not', 'font', 'bb', 'we', 'will', 'html', 'our', 'if', 'facearial', 'body', 'but', 'an', 'my', 'can', 'all', 'border', 'float', 'was', 'no', 'c', 'has', 'meta', 'more', 'v', 'its', 'right', 'r', 'height', 'one', 'table', 'x', 'head', 'n', 'any', 'color', 'l', 'do', 'span', 'e', 'they', 'me', 's', 'about', 'new', 'would', 'img', 'there', 'out', 'so', 'get', 't', 'styleborder', 'when', 'left', 'price', 'some', 'm', 'which', 'up', 'divnbspdiv', 'b', 'what', 'u', 'may', 'divfont', 'email', 'cellspacing', 'o', 'been', 'use', 'now', 'like', 'g', 'd', 'only', 'received', 'also', 'id', 'other', 'he', 'please', 'am', 'product_table', 'httpequivcontenttype', 'time', 'us', 'style', 'list', 'cellpadding', 'company', 'contenttexthtml', 'subject', 'f', 'ms', 'info',

In [96]:
def create_feature_matrix(emails, vocab):
    """
    Create a binary feature matrix where each row represents an email
    and each column represents a word in the vocabulary.
    """
    feature_matrix = np.zeros((len(emails), len(vocab)), dtype=np.int8)
    word_to_index = {word: idx for idx, word in enumerate(vocab)}

    for i, email in enumerate(emails):
        for word in email:
            if word in word_to_index:
                feature_matrix[i, word_to_index[word]] = 1

    return feature_matrix

# Create the feature matrix
feature_matrix = create_feature_matrix(emails, common_words)
print(f"Feature matrix shape: {feature_matrix.shape}")


Feature matrix shape: (37553, 1000)


In [97]:
# Split into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.3, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (26287, 1000)
Testing set size: (11266, 1000)


In [110]:
class NaiveBayesSpamFilter:
    def __init__(self, alpha=0.005):
        self.alpha = alpha  # Laplace smoothing factor
        self.prob_spam = None
        self.prob_ham = None
        self.spam_word_probs = None
        self.ham_word_probs = None

    def train(self, X, y):
        # Separate spam and ham emails
        spam_emails = X[y == 1]
        ham_emails = X[y == 0]

        # Calculate prior probabilities
        self.prob_spam = len(spam_emails) / len(y)
        self.prob_ham = len(ham_emails) / len(y)

        # Calculate likelihoods with Laplace smoothing
        self.spam_word_probs = (spam_emails.sum(axis=0) + self.alpha) / (spam_emails.sum() + self.alpha * X.shape[1])
        self.ham_word_probs = (ham_emails.sum(axis=0) + self.alpha) / (ham_emails.sum() + self.alpha * X.shape[1])

    def predict(self, X):
        # Calculate log probabilities for numerical stability
        spam_scores = (X @ np.log(self.spam_word_probs)) + np.log(self.prob_spam)
        ham_scores = (X @ np.log(self.ham_word_probs)) + np.log(self.prob_ham)

        # Return 1 if spam score is higher, else 0
        return (spam_scores > ham_scores).astype(int)


In [111]:
# Convert lists to NumPy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Initialize and train the classifier
nb_classifier = NaiveBayesSpamFilter(alpha=1.0)
nb_classifier.train(X_train, y_train)


In [112]:
y_pred = nb_classifier.predict(X_test)

def calculate_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

def calculate_precision(y_true, y_pred):
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    false_positives = np.sum((y_pred == 1) & (y_true == 0))
    return true_positives / (true_positives + false_positives)

def calculate_recall(y_true, y_pred):
    true_positives = np.sum((y_pred == 1) & (y_true == 1))
    false_negatives = np.sum((y_pred == 0) & (y_true == 1))
    return true_positives / (true_positives + false_negatives)

# Compute metrics
accuracy = calculate_accuracy(y_test, y_pred)
precision = calculate_precision(y_test, y_pred)
recall = calculate_recall(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Accuracy: 0.8960
Precision: 0.9813
Recall: 0.8594


What is the effect of removing stop words in terms of precision, recall, and
accuracy?  Show a plot or a table of these results.

Removing stop_words increased the accuracy, precision, and recall of the model. The table below will show the difference of each evaluation metrics value if the model is trained with and without the stop words.

Removed stop words:
Accuracy: 0.9490
Precision: 0.9927
Recall: 0.9298

Stop words not removed:
Accuracy: 0.9429
Precision: 0.9938
Recall: 0.9197

Experiment on the number of words used for training.  Filter the dictionary to
include only words occurring more than k times (1000 words, then k > 100, and k
= 50 times).  For example, the word “offer” appears 150 times, that means that it
will be included in the dictionary.

For 1000 words:
Accuracy: 0.9490
Precision: 0.9927
Recall: 0.9298

For K>100:
Accuracy: 0.9423
Precision: 0.9933
Recall: 0.9191

For k = 50:
Accuracy: 0.8061
Precision: 0.9469
Recall: 0.7496

This means that when we use the most common words for our model, it significantly increases its accuracy, precision, and recall for prediction.

Discuss the results of the different parameters used for Lambda smoothing.  Test
it on 5 varying values of the λ (e.g. λ = 2.0, 1.0, 0.5, 0.1, 0.005),  Evaluate
performance metrics for each.

λ = 2.0: Accuracy: 0.8960 Precision: 0.9813 Recall: 0.8594

λ = 1.0: Accuracy: 0.9490 Precision: 0.9927 Recall: 0.9298

λ = 0.5: Accuracy: 0.8960 Precision: 0.9813 Recall: 0.8594

λ = 0.1: Accuracy: 0.8960 Precision: 0.9813 Recall: 0.8594

λ = 0.005: Accuracy: 0.8960 recision: 0.9813 Recall: 0.8594

Lambda is significant to apply laplace smoothing in my naive bayes model. Based on the table above, the best value for lambda is 1.0 as it acquires the highes prrecision, recall, and accuracy.



What are your recommendations to further improve the model?

Currrently, we are using binary word presence to get the weight of each word in predicting spam or ham. Maybe next time we can employ TF-IDF to know the weight of each word based on their importance.