In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import email

import os
import re

labels_dataframe = pd.read_csv("trec06p-cs280/labels", sep=" ", header=None)

labels_dataframe.columns = ["Classification", "FileLocation"]

labels_dataframe["Classification"] = labels_dataframe["Classification"].map({"spam": 1, "ham": 0})

labels_dataframe["FileLocation"] = labels_dataframe["FileLocation"].map(lambda x: x.replace("../data/", ""))

FolderLocation = "trec06p-cs280/data"

# Initialize DataFrame to store results
df = pd.DataFrame(columns=["Folder", "File", "EmailMessages", "Classification"])

folders = sorted(os.listdir(FolderLocation))
print("Folders found in the directory:")
    
for folder in folders:
    folder_path = os.path.join(FolderLocation, folder)
        
    # Check if the path is a directory
    if os.path.isdir(folder_path):
        print(f"\nProcessing folder: {folder}")
        files = sorted(os.listdir(folder_path))  # Get the files inside the folder
            
        for file in files:
            file_path = os.path.join(folder_path, file)
                
            # Read and process each email file
            with open(file_path, "r", encoding="ISO-8859-1") as email_file:
                read_email_file = email_file.read()
                parsed = email.message_from_string(read_email_file)  # Parse email

                # Function to get the email message from the parsed email
                def getMessage(parsed_email):
                    if parsed_email.is_multipart():
                        return ''.join(part.get_payload(decode=True).decode('ISO-8859-1') 
                                        for part in parsed_email.walk() 
                                        if part.get_content_type() == 'text/plain')
                    else:
                        return parsed_email.get_payload(decode=True).decode('ISO-8859-1')

                # Get the email message
                msg = getMessage(parsed)  # No cleaning applied to msg

                # Get the classification of the email based on the labels DataFrame
                # Ensure the file path matches the format in dfLabels
                labels_classification = labels_dataframe[labels_dataframe['FileLocation'] == f"{folder}/{file}"]['Classification']

                # Check if we found a classification
                if not labels_classification.empty:
                    labels_classification = labels_classification.values[0]
                else:
                    labels_classification = None  # Default value if not found

                # Concatenate the data to the main DataFrame
                df = pd.concat([df, pd.DataFrame([[folder, file, msg, labels_classification]], 
                                                    columns=["Folder", "File", "EmailMessages", "Classification"])], 
                                ignore_index=True)


df['Folder'] = pd.to_numeric(df['Folder'])

# Assigning of folder whether they are a training or a test set
train_set = df[df["Folder"]<71]
test_set = df[df["Folder"]>70]

# Train the train set to classify into ham or spam
train_ham = train_set[train_set['Classification'] == 0] # Ham
train_spam = train_set[train_set['Classification'] == 1] # Spam

from collections import Counter

# Combine email messages within the train
all_messages = pd.concat([train_ham['EmailMessages'], train_spam['EmailMessages']])

# Initialize Counters
count_ham = Counter()
count_spam = Counter()

# Count words in ham emails without cleaning
count_ham.update(" ".join(train_ham['EmailMessages']).split())

# Count words in spam emails without cleaning
count_spam.update(" ".join(train_spam['EmailMessages']).split())

# Combine unique words from both counters
unique_words = set(count_ham.keys()).union(set(count_spam.keys()))

# Create DataFrame with word counts
combined_counts = pd.DataFrame({
    'word': list(unique_words),
    'ham_count': [count_ham[word] for word in unique_words],
    'spam_count': [count_spam[word] for word in unique_words],
})

# Total Count for each word
combined_counts['total'] = combined_counts['ham_count'] + combined_counts['spam_count']

# Sort the DataFrame by total count in descending order
combined_counts = combined_counts.sort_values(by='total', ascending=False)

# Split messages into words and flatten the list
all_words = all_messages.str.cat(sep=' ').split()

# Count word occurrences
word_counts = Counter(all_words)

# Get the 10,000 most common words
vocabulary = [word for word, _ in word_counts.most_common(10000)]

# Create feature matrices for ham and spam training sets
def create_feature_matrix(dataframe, vocabulary):
    matrix = np.zeros((len(dataframe), len(vocabulary)), dtype=int)
    
    for i, message in enumerate(dataframe['EmailMessages']):
        # Split the message directly without cleaning
        message_words = message.split()
        for word in message_words:
            if word in vocabulary:
                matrix[i, vocabulary.index(word)] = 1  # Set 1 for word presence
                
    return matrix

# Create feature matrices
ham_feature_matrix = create_feature_matrix(train_ham, vocabulary)
spam_feature_matrix = create_feature_matrix(train_spam, vocabulary)

# Convert matrices to DataFrames for easier viewing
ham_matrix_df = pd.DataFrame(ham_feature_matrix, columns=vocabulary)
spam_matrix_df = pd.DataFrame(spam_feature_matrix, columns=vocabulary)

ndoc = len(ham_matrix_df) + len(spam_matrix_df)
nham = len(ham_matrix_df)
nspam = len(spam_matrix_df)

P_C_ham = nham / ndoc
P_C_spam = nspam / ndoc

def laplace_smoothing(feature_matrix_spam, feature_matrix_ham, vocabulary):
    # Initialize the probability of each word given spam and ham
    word_probabilities_spam = np.zeros(len(vocabulary))
    word_probabilities_ham = np.zeros(len(vocabulary))
    
    # Count occurrences of words in spam and ham emails
    word_count_spam = np.sum(feature_matrix_spam, axis=0)
    word_count_ham = np.sum(feature_matrix_ham, axis=0)
    
    # Total number of words in spam and ham
    total_words_spam = np.sum(word_count_spam)
    total_words_ham = np.sum(word_count_ham)
    
    # Initialize the Laplace smoothing parameter and the number of classes
    smoothing_param = 1
    num_classes = 2

    # Calculate the likelihood of each word with Laplace smoothing
    for i in range(len(vocabulary)):
        word_probabilities_spam[i] = (word_count_spam[i] + smoothing_param) / (total_words_spam + smoothing_param * num_classes)
        word_probabilities_ham[i] = (word_count_ham[i] + smoothing_param) / (total_words_ham + smoothing_param * num_classes)

    return word_probabilities_spam, word_probabilities_ham

# Apply Laplace smoothing to calculate the likelihoods of words given spam and ham
likelihood_spam, likelihood_ham = laplace_smoothing(spam_feature_matrix, ham_feature_matrix, vocabulary)


def classify_email(email, likelihood_ham, likelihood_spam, P_C_ham, P_C_spam, vocabulary):
    # Initialize the log probabilities for ham and spam
    log_probability_ham = 0
    log_probability_spam = 0
    
    # Split the email into words
    words = str(email).split()
    
    # Compute the log probabilities for each word in the email
    for word in words:
        if word in vocabulary:  # Check if the word is in the vocabulary
            index = vocabulary.index(word)
            log_probability_ham += np.log(likelihood_ham[index])
            log_probability_spam += np.log(likelihood_spam[index])
    
    # Add the log probabilities of the prior probabilities for ham and spam
    log_probability_ham += np.log(P_C_ham)
    log_probability_spam += np.log(P_C_spam)
    
    # Determine whether it is a spam or ham
    return 0 if log_probability_ham > log_probability_spam else 1

# Create a new column named predicted and its predicted spam or ham
train_set.loc[:, 'Predicted'] = train_set['EmailMessages'].apply(
    lambda email: classify_email(email, likelihood_ham, likelihood_spam, P_C_ham, P_C_spam, vocabulary)
)

# Create a new column named predicted and its predicted spam or ham
test_set.loc[:, 'Predicted'] = test_set['EmailMessages'].apply(
    lambda email: classify_email(email, likelihood_ham, likelihood_spam, P_C_ham, P_C_spam, vocabulary)
)

# Defined FP, FN, TP, and TN
TP = ((test_set['Classification'] == 1) & (test_set['Predicted'] == 1)).sum()
TN = ((test_set['Classification'] == 0) & (test_set['Predicted'] == 0)).sum()
FP = ((test_set['Classification'] == 0) & (test_set['Predicted'] == 1)).sum()
FN = ((test_set['Classification'] == 1) & (test_set['Predicted'] == 0)).sum()

# Calculate Accuracy
accuracy = (TP + TN) / (TP + TN + FP + FN)

# Calculate Recall
recall = TP / (TP + FN)

# Calculate Precision
precision = TP / (TP + FP)

# Display results
print(f'Accuracy: {accuracy:.2%}') 
print(f'Recall: {recall:.2%}')      
print(f'Precision: {precision:.2%}') 

Folders found in the directory:

Processing folder: 000

Processing folder: 001

Processing folder: 002

Processing folder: 003

Processing folder: 004

Processing folder: 005

Processing folder: 006

Processing folder: 007

Processing folder: 008

Processing folder: 009

Processing folder: 010

Processing folder: 011

Processing folder: 012

Processing folder: 013

Processing folder: 014

Processing folder: 015

Processing folder: 016

Processing folder: 017

Processing folder: 018

Processing folder: 019

Processing folder: 020

Processing folder: 021

Processing folder: 022

Processing folder: 023

Processing folder: 024

Processing folder: 025

Processing folder: 026

Processing folder: 027

Processing folder: 028

Processing folder: 029

Processing folder: 030

Processing folder: 031

Processing folder: 032

Processing folder: 033

Processing folder: 034

Processing folder: 035

Processing folder: 036

Processing folder: 037

Processing folder: 038

Processing folder: 039

Process

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_set.loc[:, 'Predicted'] = train_set['EmailMessages'].apply(


Accuracy: 89.35%
Recall: 86.43%
Precision: 97.48%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set.loc[:, 'Predicted'] = test_set['EmailMessages'].apply(
