In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [4]:
os.chdir("/Users/kausshik/HON322M/Lab4/SpamEmailDetector")

# Understand data 

In [5]:
data = pd.read_csv('Email_data.csv')

In [6]:
data

Unnamed: 0.1,Unnamed: 0,message,label
0,0,AccuWeather 7-Day Forecast for \nBEVERLY HILLS...,0
1,1,"Mark,\n\nYou're right. Thank you. Look, it w...",0
2,2,"Thanks for this information, Bjorn.\nThis same...",0
3,3,"On Mon, Apr 23, 2007 at 05:33:51PM +0200, Alex...",0
4,4,"Content-Type: text/plain;\n\tcharset=""Windows-...",1
...,...,...,...
7995,7995,"Content-Type: text/plain;\n charset=""iso-88...",1
7996,7996,Content-Type: multipart/alternative;\n\tbounda...,1
7997,7997,"On 4/10/07, John W. Krahn wrote:\n> Igor Sutto...",0
7998,7998,Content-Type: multipart/alternative;\n\tbounda...,1


In [7]:
# Randomly select one spam email
spam_email_sample = data[data['label'] == 1].sample(n=1)

# Randomly select one not spam email
non_spam_email_sample = data[data['label'] == 0].sample(n=1)

# Print the samples
print("Spam email sample:")
print(spam_email_sample['message'].values[0])
print("=" * 100)
print("Non-spam email sample:")
print(non_spam_email_sample['message'].values[0])


Spam email sample:
We Present you a US Licensed Online Pharm4cy St0re. 
Huge Disc0unts for next five days!!! 

Phenterm1ne - as low as $6.30
Cial1s S0ft Tabs - as low as $7.30 
Amb1en - as low as $3.60 
V1agra S0ft Tabs - as low as $4.10 
Val1um - as low as $3.30 
Cial1s - as low as $6.00 
Xan4x - as low as $3.80 
Merid1a - as low as $4.40 

We have lightspeed delivery and respect your privacy! 
Check US He4lthc4re Inc. at: http://www.pollikees.com/
Non-spam email sample:
--
[ Picked text/plain from multipart/alternative ]
Intellectual Property Management in Health and
Agricultural Innovation: A Handbook of Best
Practices has been released earlier last week at
BIO in Boston. Prepared by and for policy-makers,
leaders of public sector research establishments,
technology transfer professionals, licensing
executives, and scientists, the Handbook offers
up-to-date information and strategies for
utilizing the power of both intellectual property
and the public domain. Eschewing ideological
d

In [8]:
data['label'].value_counts()

label
0    4000
1    4000
Name: count, dtype: int64

# Data pre-processing

## Split data

In [9]:
X = list(data['message'])

In [10]:
y = list(data['label'])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.30, 
                                                    random_state = 369,    # produce the same split when run the code multiple times
                                                    stratify = y)    # have same percentage of samples of each target class; e.g., 28% for test for each class

In [12]:
pd.Series(y_train).value_counts()

0    2800
1    2800
Name: count, dtype: int64

In [13]:
pd.Series(y_test).value_counts()

0    1200
1    1200
Name: count, dtype: int64

In [14]:
# Add Lemmatization by extending CountVectorizer
from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [15]:
# use LemmaCountVectorizer to perform tokenization, lowercasing, stopwords removing, and lemmatization
Lema_cv = LemmaCountVectorizer(stop_words = 'english')      
Lema_cv_feature_word_train = Lema_cv.fit_transform(X_train).toarray()
Lema_cv_feature_word_test = Lema_cv.transform(X_test).toarray()

In [16]:
Lema_cv_feature_word_train.shape

(5600, 316067)

## Model Training - Naive Bayes

In [19]:
# Instantiate a Multinomial Naive Bayes classifier
mnb = MultinomialNB()

# Fit the Multinomial Naive Bayes classifier to the training data.
mnb.fit(Lema_cv_feature_word_train, y_train)

# Use the trained classifier to predict the labels for the test set.
predicted_y = mnb.predict(Lema_cv_feature_word_test)

## EXECUTION TIMED OUT

: 

In [None]:
conf_matrix = confusion_matrix(y_test, predicted_y)       #(y, y_pred)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues')
plt.xlabel('Prediction')
plt.ylabel('Ground Truth')
plt.show()


print(classification_report(y_test, predicted_y))

NameError: name 'confusion_matrix' is not defined

 #### The Naive Bayes model predicted the class label with a 95% accuracy. The result demonstrates how well the simple algorithem, a Naive Bayes classifier, can perform for spam email detection.