In [31]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'

In [3]:
data = pd.read_json(DATA_JSON_FILE)

In [4]:
data.tail()

Unnamed: 0,CATEGORY,FILE_NAME,MESSAGE
995,1,00250.ae302eda2386979f6ac6bfff9e9f7137,"<HTML><BODY BGCOLOR=3D""#FFFFFF"">\n\n<table wid..."
996,1,01023.1f96236c94e92482c058e950ccd7a590,Long time no chat!\n\n\n\nHow have you been? I...
997,1,00013.372ec9dc663418ca71f7d880a76f117a,"\n\n\n\nChina's rapid economic growth, as rank..."
998,1,00591.962cc31322a42abd7ca205b62c56438e,"<html>\n\n\n\n<body>\n\n\n\n<font size=""2"" PTS..."
999,1,00404.deea51c7b46665faf98fe6c5b5f88810,FUTURE TECH INTERNATIONAL\n\n\n\nSPECIAL OFFER...


In [5]:
data.shape

(5796, 3)

In [6]:
data.sort_index(inplace=True)

In [7]:
data.tail()

Unnamed: 0,CATEGORY,FILE_NAME,MESSAGE
5791,0,00609.dd49926ce94a1ea328cce9b62825bc97,"I'm one of the 30,000 but it's not working ver..."
5792,0,00957.e0b56b117f3ec5f85e432a9d2a47801f,Damien Morton quoted:\n\n>W3C approves HTML 4 ...
5793,0,01127.841233b48eceb74a825417d8d918abf8,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\..."
5794,0,01178.5c977dff972cd6eef64d4173b90307f0,"Once upon a time, Manfred wrote :\n\n\n\n> I w..."
5795,0,00747.352d424267d36975a7b40b85ffd0885e,"If you run Pick, and then use the ""New FTOC"" b..."


In [9]:
vectorizer = CountVectorizer(stop_words='english')

In [10]:
all_features = vectorizer.fit_transform(data.MESSAGE)

In [11]:
all_features.shape

(5796, 102694)

In [12]:
vectorizer.vocabulary_

{'dear': 32719,
 'homeowner': 48034,
 'rates': 76350,
 'lowest': 59365,
 'point': 72297,
 '40': 7824,
 'years': 98506,
 'help': 47200,
 'best': 23129,
 'rate': 76347,
 'situation': 82318,
 'matching': 60930,
 'needs': 64750,
 'hundreds': 48607,
 'lenders': 58021,
 'home': 48006,
 'improvement': 51399,
 'refinance': 77074,
 'second': 80968,
 'mortgage': 63026,
 'equity': 38990,
 'loans': 59058,
 'perfect': 70478,
 'credit': 30975,
 'service': 81359,
 '100': 1496,
 'free': 42773,
 'owners': 68715,
 'new': 64988,
 'buyers': 25617,
 'obligation': 66813,
 'just': 55049,
 'quick': 75547,
 'simple': 82172,
 'form': 42425,
 'jump': 55000,
 'start': 84135,
 'future': 43330,
 'plans': 71939,
 'today': 88039,
 'visit': 92921,
 'http': 48497,
 '61': 10092,
 '145': 2275,
 '116': 1873,
 '186': 2748,
 'user0201': 91339,
 'index': 51639,
 'asp': 20429,
 'afft': 17606,
 'qm10': 75108,
 'unsubscribe': 90955,
 'light': 58472,
 'watch': 94281,
 'attention': 20740,
 'computer': 29755,
 'users': 91367,
 'sp

In [14]:
X_train, X_test, y_train, y_test = train_test_split(all_features, data.CATEGORY, 
                                                   test_size=0.3, random_state=88)

In [16]:
X_train.shape

(4057, 102694)

In [18]:
X_test.shape

(1739, 102694)

In [20]:
classifier = MultinomialNB()

In [21]:
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Challenge:** Calculate the following for the test dataset: <br>
The number of documents classified correctly. <br>
The number of documents classified incorrectly. <br>
The accuracy of the model. <br>

In [24]:
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [25]:
print(f'{nr_correct} documents classfied correctly')

1660 documents classfied correctly


In [26]:
nr_incorrect = y_test.size - nr_correct

In [27]:
print(f'Number of documents incorrectly classified is {nr_incorrect}')

Number of documents incorrectly classified is 79


In [29]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 95.46%


In [30]:
classifier.score(X_test, y_test)

0.9545715928694652

**Challenge:** For the testing dataset calculate the recall, precision and f1 score. Google for the scikit learn documentation on this topic to work it out. 

In [32]:
recall_score(y_test, classifier.predict(X_test))

0.8646209386281588

In [33]:
precision_score(y_test, classifier.predict(X_test))

0.9917184265010351

In [34]:
f1_score(y_test, classifier.predict(X_test))

0.9238187078109932

In [35]:
example = ['get viagra for free now!', 
          'need a mortgage? Reply to arrange a call with a specialist and get a quote', 
          'Could you please help me with the project for tomorrow?', 
          'Hello Jonathan, how about a game of golf tomorrow?', 
          'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.'
          ]

In [36]:
doc_term_matrix = vectorizer.transform(example)

In [37]:
classifier.predict(doc_term_matrix)

array([1, 1, 0, 0, 0])