## Implementing email spam classification with built in library in naiye bayes classifier

In [17]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import recall_score,f1_score,precision_score

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split


In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)

In [4]:
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [5]:
data.shape

(5796, 3)

In [6]:
data['MESSAGE'][995]

'<HTML><BODY BGCOLOR=3D"#FFFFFF">\n\n<table width=3D400 align=3Dcenter cellpadding=3D5 border=3D0><tr><td>\n\n<font face=3D"arial" size=3D1.5><p>\n\nThe following advertisement is being sponsored by \n\n<a href=3D"http://www.avirtualshopper.com">AVIRTUALSHOPPER.COM</a>\n\nThe Internets Leading source for permission based opt-in marketing<br>\n\nTo Opt-Out from our mailing list<a href=3D"http://www.avirtualshopper.com/=\n\nremoveME.asp"> CLICK HERE</a></p></font>\n\n</td></tr></table><font face=3Darial size=3D3 color=3D"#FFFFFF"><p align=3D=\n\n"center">...</p></font>\n\n<CENTER><TABLE BORDER=3D"0" CELLPADDING=3D"0" CELLSPACING=3D"0" WIDTH=3D"5=\n\n02">\n\n<TR><TD VALIGN=3DTOP><a href=3D"http://www.hebalist.com?id=3D604"><IMG SRC=\n\n=3D"http://www.hebalist.com/mail/1_1.jpg" width=3D"226" height=3D"183" BOR=\n\nDER=3D"0"></a></TD>\n\n<TD VALIGN=3DTOP>\n\n<IMG SRC=3D"http://www.hebalist.com/mail/2_1.gif" width=3D"276" height=3D"=\n\n183" BORDER=3D"0"></TD>\n\n</TR><TR><TD COLSPAN=3D"2" V

In [7]:
data.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [11]:
vectorizer = CountVectorizer()

In [13]:
all_features = vectorizer.fit_transform(data['MESSAGE'])

In [14]:
all_features.shape

(5796, 103002)

In [16]:
# vectorizer.vocabulary_

In [18]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data['CATEGORY'], random_state = 88, test_size=0.3)

In [19]:
X_train.shape

(4057, 103002)

In [20]:
X_test.shape

(1739, 103002)

In [21]:
y_train.shape

(4057,)

In [22]:
y_test.shape

(1739,)

In [23]:
classifier = MultinomialNB()

In [24]:
classifier.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
predictions = classifier.predict(X_test)

In [50]:
y_test.iloc[8]

0

In [51]:
predictions[8]

0

In [32]:
predictions

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [54]:
nr_correct = (y_test == predictions).sum()
print(f' Number of documents classified correctly : {nr_correct}')

 Number of documents classified correctly : 1639


In [55]:
nr_incorrect = y_test.size - nr_correct
print(f'Number of documents classified incorrectly : {nr_incorrect}')

Number of documents classified incorrectly : 100


In [58]:
accuracy = nr_correct / (nr_correct + nr_incorrect)
print(f' accuracy of model : {round(accuracy * 100,2)}')

 accuracy of model : 94.25


In [60]:
classifier.score(X_test,y_test) # Built in method for calculating score

0.9424956871765382

## Calculating f-score,recall-score,precision

In [61]:
recall_score(y_test,predictions)

0.8267148014440433

In [62]:
precision_score(y_test,predictions)

0.9913419913419913

In [63]:
f1_score(y_test,predictions)

0.9015748031496064

In [64]:
example = ['get viagra for free',
          'could you help me with project tomarrow',
          'click here to get free money',
          'load the dataset']


In [65]:
example_matrix = vectorizer.transform(example)

In [66]:
classifier.predict(example_matrix)

array([1, 0, 1, 0], dtype=int64)