In [33]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
# Relative path of the JSON file holding the email text data used in this notebook.
DATA_JSON_FILE = "SpamData/01_Processing/email-text-data.json"

In [3]:
# Read the email dataset from disk into a DataFrame.
data = pd.read_json(path_or_buf=DATA_JSON_FILE)

In [5]:
# Peek at the last five rows to check the load worked.
data.tail(n=5)

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
5794,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
5795,"Hi there,\n\nNow this is probably of no use to...",0,01400.f897f0931e461e7b2e964d28e927c35e


In [6]:
# (rows, columns) of the loaded frame.
data.shape

(5796, 3)

In [8]:
# Order the rows by index label. Rebinding the name instead of passing
# inplace=True keeps the operation chainable and makes it explicit that
# `data` now refers to the sorted frame.
data = data.sort_index()

In [9]:
# Show the index-sorted frame (pandas abbreviates the middle rows of long frames).
data

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1,00001.7848dde101aa985090474a91ec93fcf0
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,##############################################...,1,00004.eac8de8d759b7e74154f142194282724
4,I thought you might like these:\n1) Slim Down ...,1,00005.57696a39d7d84318ce497886896bf90d
...,...,...,...
5791,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
5794,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896


In [11]:
# Bag-of-words tokenizer/counter that drops scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words="english")

In [12]:
# Learn the vocabulary from every message body and build the document-term counts.
all_features = vectorizer.fit_transform(raw_documents=data["MESSAGE"])

In [14]:
# The result is a sparse matrix: one row per message, one column per vocabulary token.
all_features.shape

(5796, 102694)

In [19]:
# Uncomment to inspect the fitted vocabulary (token -> feature-column index):
# vectorizer.vocabulary_

In [17]:
# Hold out 30% of the messages for evaluation; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data["CATEGORY"],
    test_size=0.3,
    random_state=88,
)

In [18]:
# Size of the training portion: (training messages, vocabulary size).
X_train.shape

(4057, 102694)

In [21]:
# Multinomial Naive Bayes model, constructed with scikit-learn's default settings.
classifier = MultinomialNB()

In [22]:
# Train on the token counts and labels of the training split only.
classifier.fit(X=X_train, y=y_train)

MultinomialNB()

In [24]:
# Number of held-out messages whose predicted label equals the true label.
nr_correct = y_test.eq(classifier.predict(X_test)).sum()

In [25]:
# Correctly classified test messages.
nr_correct

1641

In [26]:
# Number of held-out messages whose predicted label differs from the true label.
nr_incorrect = y_test.ne(classifier.predict(X_test)).sum()

In [27]:
# Misclassified test messages.
nr_incorrect

98

In [29]:
# Manual accuracy check on the test split.
# correct / total is the share classified *correctly* (the accuracy), so it is
# stored as `fraction_correct`; `fraction_wrong` holds the actual error rate,
# incorrect / total.
nr_total = nr_correct + nr_incorrect
fraction_correct = nr_correct / nr_total
fraction_wrong = nr_incorrect / nr_total  # equals 1 - fraction_correct
# Display the accuracy so it can be compared with classifier.score(...).
fraction_correct

0.9436457734330075

In [31]:
# Built-in accuracy on the held-out split, for comparison with the manual figure.
classifier.score(X=X_test, y=y_test)

0.9436457734330075

In [34]:
# Recall: of the messages truly labelled 1, the share the model also labelled 1.
recall_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.8303249097472925

In [35]:
# Precision: of the messages the model labelled 1, the share truly labelled 1.
precision_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.9913793103448276

In [36]:
# F1: harmonic mean of precision and recall on the test split.
f1_score(y_true=y_test, y_pred=classifier.predict(X_test))

0.9037328094302555

# Using the model now

In [37]:
# Hand-written messages for a quick sanity check of the trained classifier:
# two promotional-sounding, two everyday, and one of keyboard-mash gibberish.
example = [
    "get viagra free now",
    "need a mortgage? Call us now at 012930120",
    "Could you please help me with the project tomorrow?",
    "Hello Jonathan, how about a game of golf tomorrow?",
    "sdjbvjd[oeecevoejojojcs]sdcklsnvdnedvnerivnernvernvien iernvienrvinervne rvm,e lnripjvpeormvpe vke eormpvoemrpvekrlvne",
]

In [38]:
# Encode the new messages with the already-fitted vectorizer (transform, not
# fit_transform) so the columns line up with the features the model was trained on.
doc_term_matrix = vectorizer.transform(raw_documents=example)


In [39]:
# Predicted CATEGORY label for each example message, in input order.
classifier.predict(X=doc_term_matrix)

array([1, 1, 0, 0, 0], dtype=int64)