In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [None]:
# Folder that holds Email_data.csv. The original location is kept as the
# default; set the HON322M_DIR environment variable to run the notebook on
# another machine without editing this cell.
PROJECT_DIR = os.environ.get('HON322M_DIR', '/Users/sheng/Jupyter/HON322M')

if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)
else:
    # Stay in the current directory instead of raising, so the notebook still
    # runs when the CSV sits next to it; a missing file is reported by the
    # loading cell instead.
    print(f"Directory {PROJECT_DIR!r} not found; staying in {os.getcwd()!r}")

# Understand data 

In [None]:
# Load the e-mail dataset from the working directory into a DataFrame.
# Later cells read its 'message' and 'label' columns.
data = pd.read_csv('Email_data.csv')

In [None]:
# Bare expression: the DataFrame is rendered as this cell's output.
data

In [None]:
# Draw one random row per class to get a feel for the raw text.
# label == 1 -> spam, label == 0 -> not spam.
spam_email_sample = data.loc[data['label'] == 1].sample(n=1)
non_spam_email_sample = data.loc[data['label'] == 0].sample(n=1)

# Show both messages, separated by a ruler line.
print("Spam email sample:", spam_email_sample['message'].iloc[0], sep="\n")
print("=" * 100)
print("Non-spam email sample:", non_spam_email_sample['message'].iloc[0], sep="\n")


In [None]:
# Number of rows per label value (1 = spam, 0 = not spam), to check class balance.
data['label'].value_counts()

# Data pre-processing

## Split data

In [None]:
# Feature input: the message texts as a plain Python list.
X = data['message'].tolist()

In [None]:
# Target: the class labels as a plain Python list, aligned with X.
y = data['label'].tolist()

In [None]:
# Hold out 30% of the messages for testing.
#   random_state - fixed seed, so re-running the notebook gives the same split
#   stratify     - keeps the class proportions of y the same in both the
#                  training and the test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=369,
    stratify=y,
)

In [None]:
# Class counts in the training split.
pd.Series(y_train).value_counts()

In [None]:
# Class counts in the test split.
pd.Series(y_test).value_counts()

In [None]:
# Extend CountVectorizer so that every token it emits is also lemmatized.
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance, looked up by the analyzer when it runs.
lemm = WordNetLemmatizer()


class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer lemmatizes each token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to a lazy stream of lemmas.

        The parent class's analyzer is built first; each token it yields is
        then passed through ``lemm.lemmatize``.
        """
        base_analyzer = super().build_analyzer()

        def lemmatizing_analyzer(document):
            return (lemm.lemmatize(token) for token in base_analyzer(document))

        return lemmatizing_analyzer

In [None]:
# Build bag-of-words features. LemmaCountVectorizer performs tokenization,
# lowercasing, English stop-word removal and lemmatization.
Lema_cv = LemmaCountVectorizer(stop_words = 'english')

# Learn the vocabulary on the training messages only, then apply the same
# vocabulary to the test messages.
# The results are kept as the sparse matrices the vectorizer returns:
# a dense copy (.toarray()) stores every zero count and can exhaust memory on
# a large vocabulary, while MultinomialNB accepts sparse input directly.
Lema_cv_feature_word_train = Lema_cv.fit_transform(X_train)
Lema_cv_feature_word_test = Lema_cv.transform(X_test)

In [None]:
# Shape of the training count matrix: (training messages, vocabulary terms).
Lema_cv_feature_word_train.shape

## Model Training - Naive Bayes

In [None]:
# Create a Multinomial Naive Bayes classifier and train it on the word counts
# of the training messages (fit returns the fitted estimator itself).
mnb = MultinomialNB().fit(Lema_cv_feature_word_train, y_train)

# Predict a label for every message in the held-out test set.
predicted_y = mnb.predict(Lema_cv_feature_word_test)

In [None]:
# Confusion matrix; argument order is (true labels, predicted labels), so
# rows are the ground truth and columns are the predictions.
conf_matrix = confusion_matrix(y_test, predicted_y)

# Draw the matrix as an annotated heat map on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Prediction')
ax.set_ylabel('Ground Truth')
plt.show()

# Per-class precision, recall and F1, plus overall accuracy.
print(classification_report(y_test, predicted_y))

#### The Naive Bayes model predicted the class labels with 95% accuracy. The result demonstrates how well a simple algorithm, the Naive Bayes classifier, can perform for spam email detection.