In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
text =['The  Himalayan Mountains', ' And River Ganges', 'A red racing car', 'An Apple a day']
vectorizer = CountVectorizer(stop_words = 'english')
my_vector=vectorizer.fit_transform(text)
print(vectorizer.vocabulary_)

{'himalayan': 4, 'mountains': 5, 'river': 8, 'ganges': 3, 'red': 7, 'racing': 6, 'car': 1, 'apple': 0, 'day': 2}


In [3]:
numpy_data =(my_vector.toarray())

In [4]:
numpy_data

array([[0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [5]:
df = pd.DataFrame(data=numpy_data, index=  list( range(0,my_vector.shape[0])), columns=list(range(0,my_vector.shape[1])))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,1,1,0,0,0
1,0,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,1,0
3,1,0,1,0,0,0,0,0,0


In [6]:
f = 'SpamData/01_Processing/email-text-data.json'
data = pd.read_json(f)

In [7]:
data

Unnamed: 0,MESSAGE,CATEGORY,FILE_NAME
0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1,00001.7848dde101aa985090474a91ec93fcf0
1,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00002.d94f1b97e48ed3b553b3508d116e6a09
2,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,1,00003.2ee33bc6eacdb11f38d052c44819ba6c
3,##############################################...,1,00004.eac8de8d759b7e74154f142194282724
4,I thought you might like these:\n\n1) Slim Dow...,1,00005.57696a39d7d84318ce497886896bf90d
...,...,...,...
5795,http://news.bbc.co.uk/1/hi/england/2515127.stm...,0,01396.61983fbe6ec43f55fd44e30fce24ffa6
5796,"> >-- be careful when using this one.) Also, t...",0,01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7
5797,">>>>> ""SM"" == Skip Montanaro <skip@pobox.com> ...",0,01398.169b51731fe569f42169ae8f948ec676
5798,"So then, ""Mark Hammond"" <mhammond@skippinet.co...",0,01399.ca6b00b7b341bbde9a9ea3dd6a7bf896


In [9]:
data.sort_index(inplace= True)

In [10]:
vectorizer = CountVectorizer(stop_words='english')
my_vector = vectorizer.fit_transform(data.MESSAGE)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(my_vector, data.CATEGORY, 
                                                   test_size=0.3, random_state=88)

In [12]:
X_train 

<4060x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 489900 stored elements in Compressed Sparse Row format>

In [13]:
X_test

<1740x102694 sparse matrix of type '<class 'numpy.int64'>'
	with 214784 stored elements in Compressed Sparse Row format>

In [14]:
y_train

292     1
302     1
2113    0
3416    0
2004    0
       ..
4709    0
362     1
2481    0
4047    0
2008    0
Name: CATEGORY, Length: 4060, dtype: int64

In [15]:
y_test

3510    0
4516    0
3689    0
3186    0
4963    0
       ..
985     1
4021    0
3927    0
4083    0
1675    1
Name: CATEGORY, Length: 1740, dtype: int64

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [18]:
nr_correct = (y_test == classifier.predict(X_test)).sum()
nr_correct

1634

In [19]:
nr_incorrect = (y_test != classifier.predict(X_test)).sum()
nr_incorrect

106

In [20]:
nr_incorrect = y_test.size - nr_correct
nr_incorrect

106

In [21]:
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 93.91%


In [22]:
classifier.score(X_test, y_test)

0.9390804597701149

In [23]:
from sklearn.metrics import recall_score, precision_score, f1_score

In [24]:
recall_score(y_test, classifier.predict(X_test))

0.8288288288288288

In [25]:
precision_score(y_test,classifier.predict(X_test))

0.9766454352441614

In [26]:
f1_score(y_test,classifier.predict(X_test))

0.8966861598440546

In [27]:
example = ['Hello friend want free housing loan ' , 'get free bitcoins', 'Do you want free car loans without repayment']

In [28]:
classifier.predict(vectorizer.transform(example))

array([1, 1, 1], dtype=int64)