In [None]:
# Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import words
import ast
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the raw email dataset. A forward slash is used so the path works on
# every OS and does not contain the invalid "\e" escape sequence.
origin_data = pd.read_csv('data/emails.csv')
print(origin_data.shape)
origin_data.head()

In [None]:
def split_data(data, train_size):
    """Split a sliceable sequence into a leading and a trailing part.

    Args:
        data: Any object supporting ``len()`` and slicing (list, DataFrame, ...).
        train_size: Fraction of ``data`` that goes into the first part.

    Returns:
        Tuple ``(train_data, test_data)``: the first
        ``int(len(data) * train_size)`` items and the remaining items.
    """
    cutoff = int(len(data) * train_size)
    return data[:cutoff], data[cutoff:]

In [None]:
def save_data(data, filename):
    """Write ``data`` to ``filename`` as CSV.

    The index is not written and missing values are emitted as empty fields.
    """
    data.to_csv(path_or_buf=filename, na_rep="", index=False)

In [None]:
# Shuffle the rows with a fixed seed so the train/test split is identical on
# every "Restart & Run All". The shuffled frame gets its own name so that
# origin_data is left untouched and re-running this cell is idempotent.
shuffled_data = origin_data.sample(frac=1, replace=False, random_state=42)
train_data, test_data = split_data(shuffled_data, 0.9)

print(len(train_data))
print(len(test_data))

# Persist both partitions; later cells reload them from disk.
save_data(train_data, "data/train.csv")
save_data(test_data, "data/test.csv")


In [None]:
# Reload the saved test split and preview its first rows
test = pd.read_csv("data/test.csv")
test.head(5)

In [None]:
# Read the training split back from disk and report its (rows, columns)
data = pd.read_csv("data/train.csv")
print(data.shape)

In [None]:
# Download the NLTK "words" corpus and hold it as a set so English-word
# membership checks are O(1)
nltk.download("words")

set_words = {*words.words()}

# Show a small sample of the corpus and its total size
print([*set_words][:10])
print(len(set_words))

In [None]:
# Create a vocabulary from the loaded train dataset.
# Maps each lower-cased English word to a unique, contiguous column index.
vocabulary = {}
for i in range(data.shape[0]):
    text = data.iloc[i, 0]
    # Empty emails are read back from CSV as NaN (a float); treat them as empty
    current_email = text.split() if isinstance(text, str) else []

    # Report progress every 500 emails (and on the last one) instead of
    # flooding the cell output with one line per email
    if i % 500 == 0 or i == data.shape[0] - 1:
        print(
            f"Email number: {i} out of {data.shape[0]} \
            Length of vocab: {len(vocabulary)}"
        )

    for word in current_email:
        # Store the lower-cased form: feature extraction looks words up by
        # word.lower(), so keys must be lower-case to ever be matched
        key = word.lower()
        if key not in vocabulary and key in set_words:
            vocabulary[key] = len(vocabulary)

# Persist as a Python dict literal; readers parse it with ast.literal_eval
with open("data/vocabulary.txt", "w") as file:
    file.write(str(vocabulary))

In [None]:
# Extract word-frequency features and labels from the imported dataset
def extract_freq (data):
    """Build a bag-of-words count matrix and a label vector from ``data``.

    The vocabulary (word -> column index) is read from
    ``data/vocabulary.txt`` and parsed with ``ast.literal_eval``.

    Args:
        data: DataFrame whose first column holds the email text and whose
            second column holds the numeric label.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(len(data), len(vocabulary))`` and
        counts how often each vocabulary word (matched case-insensitively)
        occurs in each email; ``y`` has shape ``(len(data),)`` and holds the
        second column's values.

    Raises:
        FileNotFoundError: If ``data/vocabulary.txt`` does not exist.
    """
    # The context manager guarantees the vocabulary file is closed
    with open("data/vocabulary.txt", "r") as file:
        vocabulary = ast.literal_eval(file.read())

    n_rows = data.shape[0]
    X = np.zeros((n_rows, len(vocabulary)))
    y = np.zeros(n_rows)

    for i in range(n_rows):
        # Purely positional access; avoids the deprecated integer fallback of
        # Series.__getitem__ on a label-indexed row
        text = data.iloc[i, 0]
        # Missing text is read from CSV as NaN (a float): no words to count
        tokens = text.split() if isinstance(text, str) else []

        for token in tokens:
            column = vocabulary.get(token.lower())
            if column is not None:
                X[i, column] += 1
        y[i] = data.iloc[i, 1]

    return X, y

In [None]:
# Turn the training split into features/labels and cache both arrays on disk
data = pd.read_csv("data/train.csv")
X_train, y_train = extract_freq(data)

np.save("data/X_train.npy", X_train)
np.save("data/y_train.npy", y_train)

# Sanity check: rows must match between features and labels
print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)


In [None]:
# Turn the test split into features/labels and cache both arrays on disk
test_data = pd.read_csv("data/test.csv")
X_test, y_test = extract_freq(test_data)

np.save("data/X_test.npy", X_test)
np.save("data/y_test.npy", y_test)

# Sanity check: rows must match between features and labels
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)

In [None]:
# Multinomial Naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

# Fit the model on the cached training features
X_sk = np.load("data/X_train.npy")
y_sk = np.load("data/y_train.npy")
model.fit(X_sk, y_sk)

# Evaluate on the training set
predictions = model.predict(X_sk)
accuracy = accuracy_score(y_sk, predictions)
cm = confusion_matrix(y_sk, predictions)
print("Accuracy on train set:", accuracy)
print(cm)

# Evaluate on the held-out test set; `cm` ends up holding the test-set matrix
X_test = np.load("data/X_test.npy")
y_test = np.load("data/y_test.npy")

predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
cm = confusion_matrix(y_test, predictions)
print("Accuracy on test set:", accuracy)
print(cm)

In [None]:
# Unpack the 2x2 confusion matrix: first index = actual class, second = predicted
true_negatives = cm[0, 0]
false_positives = cm[0, 1]
false_negatives = cm[1, 0]
true_positives = cm[1, 1]

# Precision: share of predicted positives that are correct.
# Recall: share of actual positives that were found.
predicted_positives = true_positives + false_positives
actual_positives = true_positives + false_negatives
precision = true_positives / predicted_positives
recall = true_positives / actual_positives

# F1 is the harmonic mean of precision and recall
f1_score = 2 * precision * recall / (precision + recall)

print ("Precision: ", precision)
print ("Recall: ", recall)
print ("F1 Score: ", f1_score)

In [None]:
# Draw the confusion matrix held in `cm` as an annotated heatmap.
# NOTE(review): `cm` is whatever the most recently run cell assigned --
# presumably the test-set matrix; confirm the cells were run in order.
plt.figure(figsize=(8,6), dpi=100)
# Enlarge seaborn's fonts; this is applied after the figure is created but
# before the heatmap is drawn, so keep the statement order as is.
sns.set(font_scale = 1.1) 
# annot=True writes each cell's value; fmt='d' formats it as an integer
ax = sns.heatmap(cm, annot=True, fmt='d', )

# Label the x-axis (predicted class) and name its two ticks
ax.set_xlabel("Predicted Spam", fontsize=14, labelpad=20)
ax.xaxis.set_ticklabels(['Negative', 'Positive'])

# Label the y-axis (actual class) and name its two ticks
ax.set_ylabel("Actual Spam", fontsize=14, labelpad=20)
ax.yaxis.set_ticklabels(['Negative', 'Positive'])

# Set the plot title
ax.set_title("Confusion Matrix for the Spam Detection Model", fontsize=14, pad=20)

plt.show()

In [None]:
# Extract word-frequency features from an unlabelled dataset
def get_freq (data):
    """Build a bag-of-words count matrix from ``data`` without reading labels.

    The vocabulary (word -> column index) is read from
    ``data/vocabulary.txt`` and parsed with ``ast.literal_eval``.

    Args:
        data: DataFrame whose first column holds the email text.

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(len(data), len(vocabulary))`` and
        counts how often each vocabulary word (matched case-insensitively)
        occurs in each email; ``y`` is an all-zero placeholder of shape
        ``(len(data),)``, since this function never fills in labels.

    Raises:
        FileNotFoundError: If ``data/vocabulary.txt`` does not exist.
    """
    # The context manager guarantees the vocabulary file is closed
    with open("data/vocabulary.txt", "r") as file:
        vocabulary = ast.literal_eval(file.read())

    n_rows = data.shape[0]
    X = np.zeros((n_rows, len(vocabulary)))
    y = np.zeros(n_rows)

    for i in range(n_rows):
        # Purely positional access; avoids the deprecated integer fallback of
        # Series.__getitem__ on a label-indexed row
        text = data.iloc[i, 0]
        # Missing text is read from CSV as NaN (a float): no words to count
        tokens = text.split() if isinstance(text, str) else []
        for token in tokens:
            column = vocabulary.get(token.lower())
            if column is not None:
                X[i, column] += 1
    return X, y

In [None]:
# Spam Detection engine using Naive Bayes Classifier
input_email = pd.read_csv("data/real_test.csv")

# Ground truth: number of rows whose "class" column equals 1
actual_spam = int((input_email["class"] == 1).sum())
print (f"Number of spam email in the input: {actual_spam}")

# Only the feature matrix is needed here; the label vector is discarded
input_data_features, _ = extract_freq(input_email)
print(f"Input shape: {input_data_features.shape}")

# Classify every email and count how many were flagged as spam
prediction = model.predict(input_data_features)
spam = int((prediction == 1).sum())
print(f"Number of spam email detected: {spam}")


In [None]:
# Report the verdict for one email; email_no is 1-based, prediction is 0-based
email_no = 10
print(
    f'Email number {email_no} is a spam email'
    if prediction[email_no - 1] == 1
    else f'Email number {email_no} is not a spam email'
)