# Naive Bayes Classifier

In [49]:
import os
import urllib
import urllib.request as ur
from urllib.request import urlopen

import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [50]:
# Location of the Spambase data file (comma-separated numeric rows, no header).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the rows have been parsed, even if parsing fails.
with ur.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=",")

# Show the first row as a quick sanity check of the parsed values.
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [51]:
# Features: the first 48 columns of every row; target: the final column.
X, y = dataset[:, :48], dataset[:, -1]

In [40]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=420,
)

In [41]:
# Fit a Bernoulli Naive Bayes model on the training split.
# NOTE(review): `binarize` is documented as a numeric threshold; passing True
# presumably behaves like a threshold of 1.0 — confirm this is intended.
BernNB = BernoulliNB(binarize=True).fit(X_train, y_train)
print(BernNB)

# y_expect is reused by the following model cells as the ground truth.
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_true=y_expect, y_pred=y_pred))

BernoulliNB(binarize=True)
0.8525345622119815


### Bernoulli Naive Bayes accuracy: 0.85. The model correctly classifies about 85% of the test e-mails as spam or not spam.

In [44]:
# Fit a Multinomial Naive Bayes model on the same training split.
MultiNB = MultinomialNB().fit(X_train, y_train)
print(MultiNB)

# Score against y_expect, which was bound in the Bernoulli cell above.
y_pred = MultiNB.predict(X_test)
print(accuracy_score(y_true=y_expect, y_pred=y_pred))

MultinomialNB()
0.8736010533245556


In [46]:
# Fit a Gaussian Naive Bayes model on the same training split.
GausNB = GaussianNB().fit(X_train, y_train)
print(GausNB)

# Score against y_expect, which was bound in the Bernoulli cell above.
y_pred = GausNB.predict(X_test)
print(accuracy_score(y_true=y_expect, y_pred=y_pred))

GaussianNB()
0.804476629361422


In [48]:
# Refit Bernoulli Naive Bayes with a lower binarization threshold (0.1).
# This rebinds BernNB, replacing the model fitted earlier.
BernNB = BernoulliNB(binarize=0.1).fit(X_train, y_train)
print(BernNB)

y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_true=y_expect, y_pred=y_pred))

BernoulliNB(binarize=0.1)
0.880184331797235


### Bernoulli Naive Bayes accuracy: 0.88 after changing the `binarize` hyperparameter to 0.1. The model now correctly classifies about 88% of the test e-mails as spam or not spam.

## Text Analysis with the Spam vs. Not-Spam Dataset

In [None]:
# Switch the working directory to the folder holding spam.csv.
# The string is a placeholder and must be replaced with a real path.
# (Fixed typo: the function is os.chdir, not os.chidr.)
os.chdir("path to the spam dataset file")

In [None]:
# Load the raw CSV from the current working directory; the file is decoded as cp1252.
df = pd.read_csv(filepath_or_buffer="spam.csv", encoding="cp1252")

In [18]:
# Keep only the first two columns (by position) and rebind df to the result.
df = df.iloc[: , [0,1]]

In [19]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,0,0.64
0,0.21,0.28
1,0.06,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [21]:
# Rename the columns "v1" -> "status" (the ham/spam label) and
# "v2" -> "message" (the text). The frame is modified in place.
df.rename(
    columns={
        "v1": "status",
        "v2": "message",
    },
    inplace=True,
)

In [22]:
# Preview the first five rows after the rename.
df.head(n=5)

Unnamed: 0,status,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
# Total number of rows in the frame.
df.shape[0]

6776

In [24]:
# Number of rows labelled "spam".
int((df["status"] == "spam").sum())

922

In [25]:
# Number of rows labelled "ham".
int((df["status"] == "ham").sum())

5854

In [26]:
# Encode the "ham" label as 1 (writes into the status column in place).
df.loc[df["status"].eq("ham"), "status"] = 1

In [27]:
# Encode the "spam" label as 0 (writes into the status column in place).
df.loc[df["status"].eq("spam"), "status"] = 0

In [28]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0,status,message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# Separate the message text (input) from the encoded status (target).
df_x, df_y = df["message"], df["status"]

In [31]:
# Create a CountVectorizer with default settings.
# NOTE(review): `cv` is re-created in a later cell before it is ever used,
# so this cell looks redundant.
cv = CountVectorizer()

In [32]:
# Hold out 20% of the messages for testing, with a fixed seed.
# Note: this rebinds y_train / y_test from the earlier Spambase split.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=420,
)

In [33]:
# Preview the first five training messages.
x_train.head(n=5)

1277                   I plane to give on this month end.
4381                    K k :-):-) then watch some films.
4181    Yar lor... Keep raining non stop... Or u wan 2...
3126    I'll be in sch fr 4-6... I dun haf da book in ...
119     PRIVATE! Your 2004 Account Statement for 07742...
Name: message, dtype: object

In [34]:
# Fresh vectorizer, used for the small hand-written example in the next cells.
cv = CountVectorizer()

In [35]:
# Learn a vocabulary from three toy sentences and count each term per sentence.
x_traincv = cv.fit_transform(
    [
        "Hi How are you How are you doing",
        "Hi What's up",
        "Wow that's awesome",
    ]
)

In [36]:
# Display the toy term counts as a dense array: one row per sentence, one column per term.
x_traincv.toarray()

array([[2, 0, 1, 1, 2, 0, 0, 0, 0, 2],
       [0, 0, 0, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [37]:
# List the learned vocabulary in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() replaces it; .tolist() keeps the plain-list display.
cv.get_feature_names_out().tolist()

['are', 'awesome', 'doing', 'hi', 'how', 'that', 'up', 'what', 'wow', 'you']

In [38]:
# Separate vectorizer for the real training messages, so the toy vocabulary in `cv` is untouched.
cv1 = CountVectorizer()

In [39]:
# Learn the vocabulary from the training messages and build their term-count matrix.
# This rebinds x_traincv, replacing the toy matrix from the earlier example.
x_traincv = cv1.fit_transform(raw_documents=x_train)

In [40]:
# Convert the training term-count matrix to a dense NumPy array for inspection.
# NOTE(review): this is presumably far larger in memory than the sparse form — confirm it is needed.
a = x_traincv.toarray()

In [41]:
# Display the dense count matrix (NumPy abbreviates the output).
a

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [43]:
# Number of rows in the dense matrix, i.e. number of training messages.
a.shape[0]

5420

In [42]:
# Term counts for the first training message.
a[0, :]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [45]:
# Map the first row's non-zero counts back to their vocabulary terms.
# inverse_transform expects a 2-D (n_samples, n_features) input, and recent
# scikit-learn versions reject a bare 1-D row. a[:1] keeps the row two-dimensional.
cv1.inverse_transform(a[:1])

[array(['end', 'give', 'month', 'on', 'plane', 'this', 'to'], dtype='<U34')]

In [46]:
# Original text of the first training message, for comparison with the recovered terms.
x_train.iat[0]

'I plane to give on this month end.'