In [None]:
# IMPORTS
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn # To be used only for standardization, splitting of datasets, model evaluation and comparison
import seaborn as sns

In [None]:
# Read the pre-processed e-mails (dataset type "a").
# Classifiers built on the other dataset types live in the ./additional_notebooks directory.
main_df = pd.read_csv("../preprocessed_files/preprocessed_emails_a.csv")

# Train/test split by folder number:
# folders 0-70 are used for training, folders 71-126 are held out for testing.
train_df = main_df.loc[main_df['folder'].le(70)]
test_df = main_df.loc[main_df['folder'].gt(70)]

# Partition the training rows by label (category 1 -> spam, category 0 -> ham)
train_spam_df = train_df.loc[train_df['category'].eq(1)]
train_ham_df = train_df.loc[train_df['category'].eq(0)]

In [None]:
# Count how often every whitespace-separated token occurs in the training e-mails.
# Counter is a dict subclass, so dict_words can still be used like a plain dict.
dict_words = Counter()
for message in train_df['email_message']:
    # str() keeps non-string cells countable: they are tokenised via their
    # string form, exactly as the other cells of this notebook do.
    dict_words.update(str(message).split())

# (word, count) pairs for the whole vocabulary, most frequent first.
# Words with equal counts stay in first-seen order.
sorted_dict = dict_words.most_common()

# Keep the 10000 most frequent words, mapped to their counts
top_10000_words = dict(sorted_dict[:10000])
# Word-only view of the same vocabulary, for the places that need just the words
top_10000_words_list = list(top_10000_words.keys())

# Preview the head of the ranking instead of dumping all 10000 entries
sorted_dict[:10]

In [None]:
# Redefine the classify function to be used with filtered dictionary
def classify(email, spam_word_probs, ham_word_probs, p_spam, p_ham, fil_dict, fil_list):
    """Label one e-mail as spam (1) or ham (0) by comparing log scores.

    Every whitespace-separated token of ``str(email)`` that is found in
    ``fil_dict`` adds the log of its per-class probability to the class score
    (once per occurrence); the log prior of each class is added at the end.

    Parameters
    ----------
    email : object
        The message; it is converted with ``str()`` before tokenising.
    spam_word_probs, ham_word_probs : sequence of float
        Per-word probabilities for each class, indexed by the position of
        the word in ``fil_list``.
    p_spam, p_ham : float
        Prior probability of each class.
    fil_dict : container
        Vocabulary used for the membership test (``word in fil_dict``).
    fil_list : list of str
        Vocabulary words; the position of a word is its index into the
        probability sequences.

    Returns
    -------
    int
        1 when the spam score is strictly greater than the ham score,
        otherwise 0 (ties therefore resolve to ham).

    Raises
    ------
    ValueError
        If a token is in ``fil_dict`` but missing from ``fil_list``.
    """
    # Resolve word -> position once per call instead of scanning fil_list with
    # list.index() twice for every token. setdefault keeps the FIRST position
    # of a repeated word, which is what list.index() returns.
    word_to_pos = {}
    for pos, vocab_word in enumerate(fil_list):
        word_to_pos.setdefault(vocab_word, pos)

    # Initialize the log probability of spam and ham
    log_p_spam = 0
    log_p_ham = 0

    # Accumulate the per-word log probabilities in token order
    for word in str(email).split():
        if word not in fil_dict:
            continue
        pos = word_to_pos.get(word)
        if pos is None:
            # Same exception type list.index() raised for this mismatch
            raise ValueError(f"{word!r} is in fil_dict but not in fil_list")
        log_p_spam += np.log(spam_word_probs[pos])
        log_p_ham += np.log(ham_word_probs[pos])

    # Add the log priors last, so the summation order (and therefore the
    # floating-point result) is the same as accumulating word by word
    log_p_spam += np.log(p_spam)
    log_p_ham += np.log(p_ham)

    # Return the class with the highest score
    return 1 if log_p_spam > log_p_ham else 0

In [None]:
def laplace_smoothing(feature_matrix_spam, feature_matrix_ham, laplace_smoothing_val, num_classes, fil_dict):
    """Estimate smoothed per-word probabilities for the spam and ham classes.

    For each class the column sums of its feature matrix are the word counts,
    and the estimate for word ``i`` is::

        (count[i] + laplace_smoothing_val)
        / (sum(count) + laplace_smoothing_val * num_classes)

    Parameters
    ----------
    feature_matrix_spam, feature_matrix_ham : array-like
        One row per e-mail, one column per vocabulary word.
    laplace_smoothing_val : number
        Value added to every word count.
    num_classes : number
        Multiplier of ``laplace_smoothing_val`` in the denominator.
        NOTE(review): textbook add-alpha smoothing scales this term by the
        number of possible outcomes of the feature - confirm that callers
        pass the value they intend here.
    fil_dict : sized
        Vocabulary; only ``len(fil_dict)`` is used, as the output length.

    Returns
    -------
    tuple of numpy.ndarray
        ``(p_word_given_spam, p_word_given_ham)``, two float64 arrays of
        length ``len(fil_dict)``.

    Raises
    ------
    IndexError
        If a feature matrix has fewer columns than ``len(fil_dict)``.
    """
    vocab_size = len(fil_dict)

    def _smoothed_probs(feature_matrix):
        # Word count per column of this class's matrix
        word_count = np.asarray(np.sum(feature_matrix, axis=0))
        if word_count.ndim == 0:
            # A 1-D input collapses to a scalar: it has no word columns
            word_count = np.zeros(0)
        if word_count.shape[0] < vocab_size:
            raise IndexError(
                f"feature matrix has {word_count.shape[0]} columns, "
                f"but the vocabulary has {vocab_size} words"
            )

        # The total runs over every column, including any beyond vocab_size
        total_words = np.sum(word_count)

        # One broadcast expression replaces the per-index Python loop
        probs = (word_count[:vocab_size] + laplace_smoothing_val) / (
            total_words + laplace_smoothing_val * num_classes
        )
        return np.asarray(probs, dtype=np.float64)

    p_word_given_spam = _smoothed_probs(feature_matrix_spam)
    p_word_given_ham = _smoothed_probs(feature_matrix_ham)

    return p_word_given_spam, p_word_given_ham

In [None]:
# Prior probabilities, estimated from the class frequencies of the training set:
#   P(Spam) = number of spam e-mails / total number of training e-mails
#   P(Ham)  = number of ham e-mails  / total number of training e-mails
n_train = len(train_df)
n_spam, n_ham = len(train_spam_df), len(train_ham_df)

# Both priors share the same denominator
p_spam, p_ham = n_spam / n_train, n_ham / n_train

print(f"P(spam) = {p_spam}")
print(f"P(ham) = {p_ham}")

In [None]:
# Turn the ranked (word, count) pairs into a dict.
# dict() also accepts a dict, so re-running this cell is harmless.
sorted_dict = dict(sorted_dict)

# Frequency threshold: only words that occur MORE than k times are kept
k = 1

# Filter the dictionary (for k = 1)
filtered_dict = {word: count for word, count in sorted_dict.items() if count > k}
filtered_list = list(filtered_dict.keys())

# Column of every vocabulary word. Looking words up here is a constant-time
# dict access, instead of a linear filtered_list.index() scan per token.
word_to_col = {word: col for col, word in enumerate(filtered_list)}


def build_feature_matrix(messages, column_of):
    """Build a binary presence matrix for a collection of messages.

    Parameters
    ----------
    messages : sized iterable
        The messages, one per output row; each is converted with ``str()``
        and split on whitespace.
    column_of : dict
        Maps every vocabulary word to its column index.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(messages), len(column_of))`` holding 1 where
        the word occurs in the message and 0 elsewhere.
    """
    matrix = np.zeros((len(messages), len(column_of)))
    for row, message in enumerate(messages):
        for word in str(message).split():
            col = column_of.get(word)
            if col is not None:
                matrix[row, col] = 1
    return matrix


# Creation of Feature Matrix (one row per training e-mail, one column per kept word)
feature_matrix_spam = build_feature_matrix(train_spam_df['email_message'], word_to_col)
feature_matrix_ham = build_feature_matrix(train_ham_df['email_message'], word_to_col)

In [None]:
# Persist everything a later session needs to reuse the classifier
import pickle

with open('./exported_model/olarte-spam_classifier', 'wb') as model_file:
    # Order matters: whoever loads the file unpacks the list positionally.
    # pickle stores a function as a reference to its qualified name, not its
    # code, so `classify` must be defined under the same name where the file is loaded.
    exported_objects = [
        p_spam,
        p_ham,
        filtered_dict,
        filtered_list,
        feature_matrix_ham,
        feature_matrix_spam,
        classify,
    ]
    pickle.dump(exported_objects, model_file)