# Project 2

#### Read data for easy_ham, hard_ham and spam of subject line

In [1]:
import os
import pandas as pd

# Read emails from each folder
def read_subjects_from_folder(folder_path):
    subjects = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='latin1') as file:
            for line in file:
                if line.startswith("Subject:"):
                    subjects.append(line.strip())
                    break
    return subjects

easy_ham_emails = read_subjects_from_folder('./easy_ham')
hard_ham_emails = read_subjects_from_folder('./hard_ham')
spam_emails = read_subjects_from_folder('./spam')

#### Split data into training and testing sets

In [2]:
def split_data(emails):
    train_emails = []
    test_emails = []
    for i, email in enumerate(emails):
        if (i + 1) % 4 == 0:
            test_emails.append(email)
        else:
            train_emails.append(email)
    return train_emails, test_emails

easy_ham_train, easy_ham_test = split_data(easy_ham_emails)
hard_ham_train, hard_ham_test = split_data(hard_ham_emails)
spam_train, spam_test = split_data(spam_emails)

# Create dataframes for training and testing sets
train_df = pd.DataFrame({
    'Email': easy_ham_train + hard_ham_train + spam_train,
    'Label': ['ham'] * len(easy_ham_train + hard_ham_train) + ['spam'] * len(spam_train)
})

test_df = pd.DataFrame({
    'Email': easy_ham_test + hard_ham_test + spam_test,
    'Label': ['ham'] * len(easy_ham_test + hard_ham_test) + ['spam'] * len(spam_test)
})


#### Build a spam filter

In [3]:
import re

excluded_words = {'the', 'subject'}

# Extract words from the training data and count their occurrences
training_spam = {}
training_ham = {}
training_total = {}

total_spam = 0
total_ham = 0
total = 0

# Add the words found into the ham and spam data
for index, email in train_df.iterrows():
    words = re.findall(r'\b[a-zA-Z]+\b', email['Email'].lower())
    for word in words:
        if word in excluded_words:
            continue
        if email['Label'] == 'ham':
            if word in training_ham:
                training_ham[word] += 1
            else:
                training_ham[word] = 1    
            total_ham += 1
        else:
            if word in training_spam:
                training_spam[word] += 1
            else:
                training_spam[word] = 1
            total_spam += 1
        total += 1


# Sort the dictionaries
training_spam = dict(sorted(training_spam.items(), key=lambda item: item[1], reverse=True))
training_ham = dict(sorted(training_ham.items(), key=lambda item: item[1], reverse=True))

#### 5 words with highest spam and ham proabilities

In [4]:
spam_probabilities = {}
ham_probabilities = {}
for word in training_spam:
    spam_probabilities[word] = (training_spam[word] + 1) / (total_spam + 2)
for word in training_ham:
    ham_probabilities[word] = (training_ham[word] + 1) / (total_ham + 2)
    
top_5_ham = sorted(ham_probabilities.items(), key=lambda item: item[1], reverse=True)[:5]
top_5_spam = sorted(spam_probabilities.items(), key=lambda item: item[1], reverse=True)[:5]

print("5 Most Hammiest Words:", top_5_ham)
print("5 Most Spammiest Words:", top_5_spam)

5 Most Hammiest Words: [('re', 0.09011970999831394), ('for', 0.01930534479851627), ('to', 0.01424717585567358), ('of', 0.012898330804248861), ('a', 0.012223908278536503)]
5 Most Spammiest Words: [('your', 0.025185185185185185), ('you', 0.020246913580246915), ('for', 0.019753086419753086), ('ilug', 0.018765432098765432), ('a', 0.017777777777777778)]


#### Setup spam calculation function

In [5]:
def compute_spam(words, spam, ham):
    spam_probabilities = 0
    ham_probabilities = 0
    
    for word in words:
        # Skip excluded words
        if word in excluded_words:
            continue
        # Add all probabilities of each word in the email subject
        if word in training_spam:
            # Compute probabilities with smoothing
            spam_probabilities += (training_spam[word] + 1) / (total_spam + 2)
        if word in training_ham:
            ham_probabilities += (training_ham[word] + 1) / (total_ham + 2)
    
    if (spam_probabilities * spam + ham_probabilities * ham) > 0:
        return (spam_probabilities * spam) / (spam_probabilities * spam + ham_probabilities * ham)
    
    return 1

In [6]:
correct_predicted = 0
predicted = 0
correct_predicted_spam = 0
predicted_spam = 0
spam_count = 0

for index, email in test_df.iterrows():
    words = re.findall(r'\b[a-zA-Z]+\b', email['Email'].lower())
    if total_ham > 0 and total_spam > 0 and total > 0:
        probability_of_spam = compute_spam(words, total_spam / total, total_ham / total)

    if probability_of_spam > 0.5:
        predicted_spam += 1
        if email['Label'] == 'spam':
            correct_predicted += 1
            correct_predicted_spam += 1
    else:
        if email['Label'] == 'ham':
            correct_predicted += 1    
    if email['Label'] == 'spam':
        spam_count += 1

    # Add to filter
    for word in words:
        if word in excluded_words:
            continue
        if email['Label'] == 'ham':
            if word in training_ham:
                training_ham[word] += 1
            else:
                training_ham[word] = 1    
            total_ham += 1
        else:
            if word in training_spam:
                training_spam[word] += 1
            else:
                training_spam[word] = 1
            total_spam += 1
        total += 1
    predicted += 1
    
print("Accuracy:", correct_predicted / predicted)
print("Precision:", correct_predicted_spam / predicted_spam)
print("Recall:", correct_predicted_spam / spam_count)

Accuracy: 0.8822815533980582
Precision: 0.7413793103448276
Recall: 0.344
