In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
df=pd.read_json(DATA_JSON_FILE)

In [4]:
df.head()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1,00001.7848dde101aa985090474a91ec93fcf0
1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,##############################################...,1,00004.eac8de8d759b7e74154f142194282724
4,I thought you might like these:\n\n1) Slim Dow...,1,00005.57696a39d7d84318ce497886896bf90d


In [5]:
vectorizer=CountVectorizer(stop_words='english')

In [6]:
all_features=vectorizer.fit_transform(df.MESSAGE)

In [7]:
type(all_features)

scipy.sparse.csr.csr_matrix

In [8]:
all_features

<3000x62660 sparse matrix of type '<class 'numpy.int64'>'
	with 324609 stored elements in Compressed Sparse Row format>

In [9]:
vectorizer.vocabulary_

{'doctype': 20340,
 'html': 28718,
 'public': 44355,
 'w3c': 57215,
 'dtd': 21042,
 'transitional': 53871,
 'en': 22240,
 'head': 27743,
 'meta': 36797,
 'content': 17406,
 '3d': 3669,
 'text': 52883,
 'charset': 15854,
 '3dwindows': 3865,
 '1252': 1193,
 'http': 28729,
 'equiv': 22634,
 '3dcontent': 3777,
 'ype': 60526,
 'mshtml': 37853,
 '00': 0,
 '2314': 2507,
 '1000': 878,
 '3dgenerator': 3793,
 'body': 13803,
 'inserted': 30666,
 'calypso': 15099,
 'table': 52332,
 'border': 13931,
 '3d0': 3670,
 'cellpadding': 15613,
 'cellspacing': 15618,
 '3d2': 3687,
 'id': 29421,
 '3d_calyprintheader_': 3751,
 'ules': 54947,
 '3dnone': 3822,
 'style': 51374,
 'color': 16796,
 'black': 13489,
 'display': 20019,
 'width': 58286,
 '100': 877,
 'tbody': 52551,
 'tr': 53774,
 'td': 52600,
 'colspan': 16809,
 '3d3': 3701,
 'hr': 28646,
 '3dblack': 3768,
 'noshade': 39376,
 'size': 49831,
 '3d1': 3672,
 'end': 22294,
 'font': 24672,
 '000000': 3,
 'face': 23579,
 '3dverdana': 3861,
 'arial': 11302,


In [10]:
X_train,X_test,y_train,y_test=train_test_split(all_features,df['CATEGORY'],test_size=0.3,random_state=88)

In [11]:
classifier=MultinomialNB()
classifier.fit(X_train,y_train)

MultinomialNB()

In [12]:
# number of emails classified correctly
num_correct_docs=(y_test ==classifier.predict(X_test)).sum() /y_test.size
num_correct_docs

0.9711111111111111

In [13]:
y_test.size
pred=classifier.predict(X_test)

In [14]:
print(f'The accuracy of the model is {num_correct_docs:.2%}')

The accuracy of the model is 97.11%


In [15]:
classifier.score(X_test,y_test)

0.9711111111111111

In [16]:
recall_score(y_test,pred)

0.8461538461538461

In [17]:
f1_score(y_test,pred)

0.9029850746268656

In [18]:
precision_score(y_test,pred)

0.968

In [19]:
example = ['get viagra for free now!', 
          'need a mortgage? Reply to arrange a call with a specialist and get a quote', 
          'Could you please help me with the project for tomorrow?', 
          'Hello Jonathan, how about a game of golf tomorrow?', 
          'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.'
          ]

In [20]:
mess_mat=vectorizer.transform(example)

In [21]:
prediction=classifier.predict(mess_mat)


In [22]:
prediction

array([1, 1, 0, 0, 0], dtype=int64)