# Spam Classification with Bernoulli Naive Bayes

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB

In [50]:
# Load the ham/spam message table from the working directory.
# The file is decoded as latin-1 (presumably it is not valid UTF-8 — TODO confirm).
SPAM_CSV_PATH = 'spam.csv'
df = pd.read_csv(SPAM_CSV_PATH, encoding='latin-1')

In [51]:
# Preview the first ten rows of the frame exactly as loaded.
df.head(10)

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [52]:
# (rows, columns) of the frame as loaded, before any columns are removed.
df.shape

(5572, 5)

In [53]:
# Keep only the label and text columns by removing the three "Unnamed" columns.
# errors='ignore' makes this cell safe to re-run: df is rebound here, so on a
# second execution the columns are already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [54]:
# (rows, columns) after removing the "Unnamed" columns.
df.shape

(5572, 2)

In [55]:
# Preview the first ten rows now that only 'class' and 'message' remain.
df.head(10)

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [56]:
# Sorted distinct label values found in the 'class' column.
np.unique(df['class'].values)

array(['ham', 'spam'], dtype=object)

In [57]:
# Sorted distinct message texts (the array repr is abbreviated in the output).
np.unique(df['message'].values)

array([' &lt;#&gt;  in mca. But not conform.',
       ' &lt;#&gt;  mins but i had to stop somewhere first.',
       ' &lt;DECIMAL&gt; m but its not a common car here so its better to buy from china or asia. Or if i find it less expensive. I.ll holla',
       ..., 'ÌÏ thk of wat to eat tonight.', 'ÌÏ v ma fan...',
       'ÌÏ wait 4 me in sch i finish ard 5..'], dtype=object)

In [58]:
# Raw message texts and their ham/spam labels as NumPy arrays.
messages = df['message'].values
y = df['class'].values

# Learn the vocabulary and build the document-term count matrix in one step.
# NOTE(review): the vectorizer is fitted on every message, before the
# train/test split, so the vocabulary also contains words from rows that are
# later held out for testing — confirm this is intended.
cv = CountVectorizer()
x = cv.fit_transform(messages)

# Dense copy of the whole matrix; in the cells shown it is only used for the
# printout below, while x itself is what gets sliced and fed to the model.
# NOTE(review): densifying all rows just to print a preview is memory-heavy.
v = x.toarray()

print("Sparse Matrix: ", v)

Sparse Matrix:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [59]:
# Move 'message' to the front of the frame: pop() removes the column from df
# in place and returns it, insert() puts it back at position 0.
first_col = df.pop('message')
df.insert(loc=0, column='message', value=first_col)

# Display the reordered frame.
df

Unnamed: 0,message,class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [60]:
# Ordered hold-out split: the first TRAIN_FRACTION of the rows train the
# model and the remaining rows are used for testing. The boundary is derived
# from the number of rows instead of a hard-coded row index; for the 5572-row
# file this yields 4179, the same boundary as before.
# NOTE(review): rows are taken in file order without shuffling — consider a
# shuffled/stratified split if the file order carries any signal.
TRAIN_FRACTION = 0.75

# Features and labels must stay row-aligned for positional slicing to be valid.
assert x.shape[0] == len(y), "features and labels must have the same number of rows"
n_train = int(x.shape[0] * TRAIN_FRACTION)

train_x, test_x = x[:n_train], x[n_train:]
train_y, test_y = y[:n_train], y[n_train:]

In [61]:
# Bernoulli Naive Bayes models each feature as present/absent; with
# binarize=0.0 every count greater than zero is treated as "present".
bnb = BernoulliNB(binarize=0.0)
bnb.fit(train_x, train_y)
model = bnb  # fit() returns the estimator itself, so both names refer to the same fitted object

# Predicted labels for both splits, reused by the classification reports.
y_pred_train, y_pred_test = bnb.predict(train_x), bnb.predict(test_x)

In [62]:
# Mean accuracy on each split, expressed as a percentage (training first, then test).
train_accuracy_pct = bnb.score(train_x, train_y) * 100
test_accuracy_pct = bnb.score(test_x, test_y) * 100

print(train_accuracy_pct)
print(test_accuracy_pct)

98.73175400813592
98.20531227566404


In [63]:
# Per-class precision, recall and F1 on the training split.
# classification_report comes from the notebook's top import cell rather than
# being imported mid-notebook.
print(classification_report(train_y, y_pred_train))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      3614
        spam       0.99      0.91      0.95       565

    accuracy                           0.99      4179
   macro avg       0.99      0.96      0.97      4179
weighted avg       0.99      0.99      0.99      4179



In [64]:
# Per-class precision, recall and F1 on the held-out test split.
# classification_report comes from the notebook's top import cell; the
# duplicate in-cell import was removed.
print(classification_report(test_y, y_pred_test))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1211
        spam       0.99      0.87      0.93       182

    accuracy                           0.98      1393
   macro avg       0.99      0.93      0.96      1393
weighted avg       0.98      0.98      0.98      1393

