# Import Modules

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Relative path of the JSON file holding the e-mail dataset; it is handed to
# pd.read_json when the data is loaded.
DATA_JSON_FILE = "email-data.json"

In [3]:
# Load the dataset into a DataFrame. The rendered frame shows three columns:
# EMAIL (message text), LABEL (0/1 class) and FILE_NAME.
df = pd.read_json(DATA_JSON_FILE)

In [4]:
# Preview the loaded frame (pandas elides the middle rows in the rendering).
df

Unnamed: 0,EMAIL,LABEL,FILE_NAME
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0,0
1,"Martin A posted:\n\nTassos Papadopoulos, the G...",0,1
2,Man Threatens Explosion In Moscow \n\n\n\nThur...,0,2
3,Klez: The Virus That Won't Die\n\n \n\nAlready...,0,3
4,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",0,4
...,...,...,...
3046,There is NO stumbling on to it! \n\n\n\nThe gr...,1,3047
3047,This is a multi-part message in MIME format.\n...,1,3048
3048,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont...",1,3049
3049,"<HTML>\n\n<BODY>\n\n <tr valign=3D""top""> \n\n...",1,3050


In [6]:
# Order the rows by index label. The sorted frame is rebound to `df` instead
# of sorting with inplace=True: the result is the same for every later cell,
# but the cell no longer mutates an object that an earlier cell displayed.
df = df.sort_index()

In [7]:
# Show the frame again after the index sort to confirm the row order.
df

Unnamed: 0,EMAIL,LABEL,FILE_NAME
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0,0
1,"Martin A posted:\n\nTassos Papadopoulos, the G...",0,1
2,Man Threatens Explosion In Moscow \n\n\n\nThur...,0,2
3,Klez: The Virus That Won't Die\n\n \n\nAlready...,0,3
4,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",0,4
...,...,...,...
3046,There is NO stumbling on to it! \n\n\n\nThe gr...,1,3047
3047,This is a multi-part message in MIME format.\n...,1,3048
3048,"<html>\n\n\n\n<head>\n\n<meta http-equiv=""Cont...",1,3049
3049,"<HTML>\n\n<BODY>\n\n <tr valign=3D""top""> \n\n...",1,3050


## Tokenization

In [8]:
# Bag-of-words vectorizer: counts token occurrences per document and drops
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [9]:
# Learn the vocabulary from the EMAIL column and encode every message as a
# row of token counts.
# NOTE(review): the vocabulary is fitted on ALL e-mails, including the ones
# that later end up in the test split, so the test set is not strictly
# held out. For a leak-free evaluation, split first and call fit_transform on
# the training texts and transform on the test texts.
features = vectorizer.fit_transform(df.EMAIL)

In [10]:
# (number of e-mails, number of distinct tokens kept in the vocabulary)
features.shape

(3051, 61735)

## Train test split

In [19]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore every metric below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    df.LABEL,
    test_size=0.3,
    random_state=101,
)

## Implementing Naive Bayes Model

In [20]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings
# (no constructor arguments are overridden).
model = MultinomialNB()

In [21]:
# Fit the classifier on the training split only; the fitted estimator is
# returned, which is why the cell echoes its repr.
model.fit(X_train, y_train)

MultinomialNB()

In [24]:
# Confusion matrix on the held-out split. In scikit-learn's layout, rows are
# the true labels and columns are the predicted labels.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=model.predict(X_test),
)

In [25]:
# Rows = true label (0, 1); columns = predicted label (0, 1).
cm

array([[776,   2],
       [ 26, 112]], dtype=int64)

In [26]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so it is printed rather than left as the cell's value.
report_text = classification_report(
    y_true=y_test,
    y_pred=model.predict(X_test),
)
print(report_text)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       778
           1       0.98      0.81      0.89       138

    accuracy                           0.97       916
   macro avg       0.98      0.90      0.94       916
weighted avg       0.97      0.97      0.97       916

