In [91]:
# DATA ANALYSIS
import numpy as np
import pandas as pd

# GENERATE VOCABULARY
from sklearn.feature_extraction.text import CountVectorizer

# SPLIT DATA
from sklearn.model_selection import train_test_split

# NAIVE BAYES MODEL
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import recall_score, f1_score, precision_score

In [92]:
# Relative path to the JSON file containing the e-mail text data.
# NOTE(review): the extension is spelled `.jason`; confirm it matches the file on disk.
DATA_JASON_FILE = './SpamData/01_Processing/email-text-data.jason'

In [93]:
# Load the e-mail data from the JSON file into a DataFrame.
data = pd.read_json(DATA_JASON_FILE)

In [94]:
# Show the last five rows (columns: MESSAGE, CATEGORY, FILE_NAME).
data.tail()

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
5791,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5792,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5793,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
5794,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896
5795,"Hi there,\n\n\n\nNow this is probably of no us...",0,01400.f897f0931e461e7b2e964d28e927c35e


In [95]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

In [96]:
# Learn the vocabulary from every message and build the document-term matrix.
# NOTE(review): the vocabulary is fit on the full dataset before the train/test
# split, so tokens that only occur in test documents still become features.
all_features = vectorizer.fit_transform(data.MESSAGE)

In [97]:
# (number of documents, vocabulary size)
all_features.shape

(5796, 102694)

In [98]:
# Check the container type returned by fit_transform (a SciPy sparse matrix).
type(all_features)

scipy.sparse.csr.csr_matrix

In [99]:
# Mapping from each token to its column index in the document-term matrix.
# NOTE(review): this displays the entire vocabulary (all_features has 102694
# columns); consider previewing only a slice to keep the output readable.
vectorizer.vocabulary_

{'doctype': 34865,
 'html': 48472,
 'public': 74013,
 'w3c': 93790,
 'dtd': 36354,
 'transitional': 88580,
 'en': 38432,
 'head': 47011,
 'meta': 61701,
 'content': 30249,
 '3d': 6385,
 'text': 86991,
 'charset': 27796,
 '3dwindows': 7297,
 '1252': 2025,
 'http': 48497,
 'equiv': 38991,
 '3dcontent': 6908,
 'ype': 99054,
 'mshtml': 63412,
 '00': 0,
 '2314': 4235,
 '1000': 1497,
 '3dgenerator': 6987,
 'body': 24390,
 'inserted': 52119,
 'calypso': 26557,
 'table': 86120,
 'border': 24581,
 '3d0': 6386,
 'cellpadding': 27375,
 'cellspacing': 27383,
 '3d2': 6525,
 'id': 49828,
 '3d_calyprintheader_': 6758,
 'ules': 90247,
 '3dnone': 7130,
 'style': 84723,
 'color': 29367,
 'black': 23893,
 'display': 34406,
 'width': 95488,
 '100': 1496,
 'tbody': 86462,
 'tr': 88442,
 'td': 86548,
 'colspan': 29390,
 '3d3': 6570,
 'hr': 48373,
 '3dblack': 6863,
 'noshade': 65849,
 'size': 82347,
 '3d1': 6417,
 'end': 38496,
 'font': 42257,
 '000000': 4,
 'face': 40498,
 '3dverdana': 7283,
 'arial': 20116

In [100]:
# Hold out 30% of the documents for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data.CATEGORY,
    test_size=0.3,
    random_state=88,
)

In [101]:
# Size of the held-out test set: (documents, features).
X_test.shape

(1739, 102694)

In [102]:
# Multinomial Naive Bayes classifier with scikit-learn's default hyperparameters.
classifier = MultinomialNB()

In [103]:
# Train the classifier on the training split.
classifier.fit(X_train, y_train)

MultinomialNB()

**Challenge:** `Calculate the following on the test dataset:` <br>
The number of documents classified correctly <br>
The number of documents classified incorrectly <br>
The accuracy of the model <br>

In [104]:
# Compare true and predicted test labels element-wise, then count the matches.
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [105]:
# Report how many test documents were labelled correctly.
print(f'{nr_correct} Documents classified correctly.')

1641 Documents classified correctly.


In [106]:
# Every test document that was not classified correctly counts as a miss.
nr_incorrect = y_test.size - nr_correct

In [107]:
# Report how many test documents were labelled incorrectly.
print(f'{nr_incorrect} Documents classified incorrectly.')

98 Documents classified incorrectly.


In [108]:
# Error rate: misclassified documents divided by all test documents
# (nr_correct + nr_incorrect adds back up to y_test.size).
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)

In [109]:
# Accuracy is the complement of the error rate; `.2%` renders it as a
# percentage with two decimal places.
print(f'The (Testing) Accuracy of the model is {1-fraction_wrong:.2%}')

The (Testing) Accuracy of the model is 94.36%


In [110]:
# Display the raw (unformatted) error rate.
fraction_wrong

0.05635422656699252

In [111]:
# Cross-check the manual calculation against the estimator's own accuracy score.
classifier.score(X_test, y_test)

0.9436457734330075

**Challenge:** For the test dataset, calculate the recall, precision, and F1 score. Consult the scikit-learn documentation for the relevant functions.

In [112]:
# Recall on the test set: of the documents whose true label is the positive
# class (scikit-learn's default pos_label=1), the share predicted as positive.
recall_score_ = recall_score(y_test, classifier.predict(X_test))
# Use `.2%` (two decimal places). The previous `2%` only set a minimum field
# width of 2, so the value was printed with the default six decimals.
print(f'Recall Score is {recall_score_:.2%}')

Recall Score is 83.032491%


In [113]:
# Precision on the test set: of the documents predicted as the positive class
# (scikit-learn's default pos_label=1), the share whose true label is positive.
precision_score_ = precision_score(y_test, classifier.predict(X_test))
# Use `.2%` (two decimal places). The previous `2%` only set a minimum field
# width of 2, so the value was printed with the default six decimals.
print(f'Precision Score is: {precision_score_:.2%}')

Precision Score is: 99.137931%


In [114]:
# F1 score on the test set: the harmonic mean of precision and recall for the
# positive class (scikit-learn's default pos_label=1).
f1_score_ = f1_score(y_test, classifier.predict(X_test))
# Use `.2%` (two decimal places). The previous `2%` only set a minimum field
# width of 2, so the value was printed with the default six decimals.
print(f'F1-Score is: {f1_score_:.2%}')

F1-Score is: 90.373281%


In [200]:
# Hand-written sample messages used to spot-check the trained classifier.
examples = [
    'Hello Friend, How are you',
    'pay us and get free bitcoin',
    'Hello friend, pay 200 dollar to get fri bitcon. and fri dollars',
    'Alamin, You will you come to class tommorow',
]

In [201]:
# Vectorize the samples with the already-fitted vocabulary (transform, not
# fit_transform) so the columns line up with the features the model was trained on.
doc_term_matrix = vectorizer.transform(examples)

In [202]:
# Predict a CATEGORY label for each sample message
# (presumably 1 = spam, 0 = not spam; confirm against the dataset's encoding).
classifier.predict(doc_term_matrix)

array([0, 1, 0, 0], dtype=int64)