## 2.

In [21]:
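# Multinomial Naive Bayes models how many times each feature occurs (e.g. word counts per document);
# a feature that does not occur in a sample simply does not contribute to the likelihood.
# MultinomialNB works with occurrence counts.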


# Bernoulli Naive Bayes models only whether each feature is present or absent in a sample, and it
# explicitly accounts for (penalizes) the absence of a feature that is indicative of a class.
# BernoulliNB is designed for binary/boolean features.

## 3.

In [22]:
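# Bernoulli Naive Bayes can, in principle, handle missing data. Attributes are handled separately by the algorithm, at both model
# construction time and prediction time. As such, if a data instance has a missing value for an attribute, that attribute can be
# ignored while preparing the model, and ignored when a probability is calculated for a class value.
# Note: scikit-learn's BernoulliNB does not do this automatically -- it rejects NaN inputs, so missing values must be
# imputed or dropped before fitting.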

## 4.

In [23]:
# Yes.
# Gaussian Naive Bayes is a popular algorithm for classification tasks, especially for problems involving continuous numerical data.
# Multiclass classification with Gaussian Naive Bayes can be implemented using a vectorized approach,
# which is much faster than an explicit for-loop approach.

## 5.

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB

In [25]:
# Load the spam dataset from spam.csv. latin-1 is passed explicitly rather
# than relying on pandas' default UTF-8 decoding.
df = pd.read_csv("spam.csv", encoding="latin-1")

In [26]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [27]:
# Keep only the label (v1) and message (v2) columns by discarding the three
# "Unnamed" columns. errors="ignore" makes this cell safe to re-run: once the
# columns are gone, a plain drop would raise KeyError on the second execution.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [28]:
# (rows, columns) of the frame after dropping the unused columns.
df.shape

(5572, 2)

In [29]:
# Distinct values found in the label column v1.
np.unique(df['v1'])

array(['ham', 'spam'], dtype=object)

In [30]:
# Distinct message texts found in column v2 (np.unique returns them sorted).
np.unique(df['v2'])

array([' &lt;#&gt;  in mca. But not conform.',
       ' &lt;#&gt;  mins but i had to stop somewhere first.',
       ' &lt;DECIMAL&gt; m but its not a common car here so its better to buy from china or asia. Or if i find it less expensive. I.ll holla',
       ..., 'ÌÏ thk of wat to eat tonight.', 'ÌÏ v ma fan...',
       'ÌÏ wait 4 me in sch i finish ard 5..'], dtype=object)

In [31]:
# Labels (v1) and raw message texts (v2) as NumPy arrays.
y = df["v1"].values
x = df["v2"].values

# Learn the vocabulary and encode each message as a row of token counts.
# From here on x is the (sparse) document-term matrix, not the raw text.
cv = CountVectorizer()
x = cv.fit_transform(x)

# Dense copy of the count matrix; NumPy abbreviates the printout with "...".
v = x.toarray()
print(v)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [32]:
# Move the message column v2 to the front of the frame (in place), so the
# column order becomes v2, v1; then display the result.
first_col = df.pop('v2')
df.insert(loc=0, column='v2', value=first_col)
df

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam
5568,Will Ì_ b going to esplanade fr home?,ham
5569,"Pity, * was in mood for that. So...any other s...",ham
5570,The guy did some bitching but I acted like i'd...,ham


In [33]:
# Ordered (unshuffled) hold-out split: the first 75% of rows train the model
# and the remaining 25% evaluate it. The boundary is derived from the data
# instead of being hard-coded, so the split stays valid if the dataset changes
# size; for the 5572-row file it is still row 4179.
TRAIN_FRACTION = 0.75
n_train = int(x.shape[0] * TRAIN_FRACTION)

# Features and labels must stay row-aligned for the slices below to match.
assert x.shape[0] == len(y), "feature matrix and label vector differ in length"

train_x = x[:n_train]
train_y = y[:n_train]

test_x = x[n_train:]
test_y = y[n_train:]

In [34]:
# Bernoulli Naive Bayes on the count features. binarize=0.0 is the threshold
# used to turn counts into booleans (per the scikit-learn docs, values above
# the threshold are treated as "present").
bnb = BernoulliNB(binarize=0.0)
# fit() returns the fitted estimator, so `model` refers to the same object as `bnb`.
model = bnb.fit(train_x, train_y)
# Predicted labels for both splits; used by the classification reports below.
y_pred_train= bnb.predict(train_x)
y_pred_test = bnb.predict(test_x)

In [35]:
# Mean accuracy as a percentage: training split first, then test split.
for split_x, split_y in ((train_x, train_y), (test_x, test_y)):
    print(bnb.score(split_x, split_y) * 100)

98.73175400813592
98.20531227566404


In [36]:
"""
We notice that we get good results on both training and testing sets. The training set gives us a score of 98.73, whereas the testing set gives us a score of 98.20. 

"""

'\nWe notice that we get good results on both training and testing sets. The training set gives us a score of 98.73, whereas the testing set gives us a score of 98.20. \n\n'

In [37]:
# Per-class precision / recall / F1 on the training split.
# classification_report is imported once, in the imports cell at the top.
print(classification_report(train_y, y_pred_train))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      3614
        spam       0.99      0.91      0.95       565

    accuracy                           0.99      4179
   macro avg       0.99      0.96      0.97      4179
weighted avg       0.99      0.99      0.99      4179



In [38]:
# Per-class precision / recall / F1 on the held-out test split.
# classification_report is imported once, in the imports cell at the top.
print(classification_report(test_y, y_pred_test))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1211
        spam       0.99      0.87      0.93       182

    accuracy                           0.98      1393
   macro avg       0.99      0.93      0.96      1393
weighted avg       0.98      0.98      0.98      1393

