In [78]:
# Three types of Naive Bayes classifier:
# 1) Bernoulli Naive Bayes
# 2) Multinomial Naive Bayes
# 3) Gaussian Naive Bayes

In [79]:
# Bernoulli Naive Bayes: It assumes that all features are binary, i.e. they take only two values. For text, 0 means "the word does not occur in the document" and 1 means "the word occurs in the document".

In [80]:
# Multinomial Naive Bayes: It is used when we have discrete data (e.g. movie ratings ranging from 1 to 5, where each rating occurs with a certain frequency). In text learning we use the count of each word to predict the class or label.

In [81]:
# Gaussian Naive Bayes: It assumes that the features follow a Gaussian (normal) distribution. It is commonly used for continuous or real-valued features.
# For example, say we have a dataset of people's heights and weights, and we want to predict whether a person is tall or short.

In [82]:
# If a column contains a sentence (free text) in each record, Multinomial Naive Bayes or Bernoulli Naive Bayes is typically used.

In [83]:
# Which of the two above is used is decided by the type of problem:
# if the features are discrete counts (e.g. how often each word occurs), Multinomial Naive Bayes is the better fit;
# if the features are binary (e.g. whether positive or negative words appear in a review), Bernoulli Naive Bayes is the better fit.

In [84]:
# Both Multinomial and Bernoulli Naive Bayes use the same CountVectorizer, so how do they differ?
# Ans) CountVectorizer converts text data into a feature matrix.
#      Multinomial Naive Bayes: it considers the frequency of words in each record. The feature matrix holds, in each entry, the count of a word in a record.
#      Bernoulli Naive Bayes: it considers only the presence or absence of words in each record. The feature matrix holds, in each entry, whether a word is present (1) or absent (0) in a record.
#      The binary matrix can be produced with CountVectorizer(binary=True); BernoulliNB also binarizes count features itself by default.

In [85]:
import pandas as pd

In [86]:
# Load the SMS spam dataset into a DataFrame.
# NOTE(review): this is an absolute path on one user's machine, so the notebook
# will not run elsewhere as-is; consider a path relative to a configurable data directory.
df=pd.read_csv("C:/Users/Manikanta/Machine learning/CSV Files/spam.csv")

In [87]:
# Preview the first rows to check that the Category and Message columns loaded as expected.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [88]:
# Summary statistics (count, unique, top, freq) for the two text columns.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [89]:
# Per-class summary of Message: shows how many ham vs. spam messages there are and how many are duplicates.
df.groupby("Category").describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [90]:
# Encode the label as an integer target: 1 for "spam", 0 for everything else.
# A vectorised comparison replaces the row-by-row apply/lambda; the explicit
# int64 cast keeps the column dtype the same on every platform.
df["spam"] = (df["Category"] == "spam").astype("int64")

In [91]:
# Confirm that the new numeric spam column lines up with Category.
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [92]:
from sklearn.model_selection import train_test_split

In [93]:
# Hold out 30% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score derived from it)
# is reproducible across kernel restarts. stratify keeps the ham/spam ratio
# equal in both splits, which matters because spam is only ~13% of the rows.
x_train, x_test, y_train, y_test = train_test_split(
    df["Message"],
    df["spam"],
    test_size=0.3,
    random_state=42,
    stratify=df["spam"],
)

In [94]:
# Number of messages in the training split.
len(x_train)

3900

In [95]:
# Number of messages in the held-out test split.
len(x_test)

1672

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
# Learn the vocabulary from the training messages only and turn each message
# into a sparse row of word counts.
v = CountVectorizer()
x_train_count = v.fit_transform(x_train)
# Slice first, then densify: only the three previewed rows are converted,
# instead of materialising the whole document-term matrix as a dense array.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [98]:
from sklearn.naive_bayes import MultinomialNB

In [99]:
# Multinomial Naive Bayes with default settings; it works on the word-count features built above.
model=MultinomialNB()

In [100]:
# Fit the classifier on the training count matrix and the 0/1 spam labels.
model.fit(x_train_count,y_train)

MultinomialNB()

In [101]:
# Inspect the raw held-out messages before vectorising them.
x_test

1754       Jus came back fr lunch wif my sis only. U leh?
58                                 Tell where you reached
1187     Goodmorning, Today i am late for  &lt;#&gt; min.
3052    Awesome question with a cute answer: Someone a...
3620    8007 25p 4 Alfie Moon's Children in Need song ...
                              ...                        
283                         Ok. I asked for money how far
5235    Am on the uworld site. Am i buying the qbank o...
1681                                   Okay... We wait ah
3989    Hello. Sort of out in town already. That . So ...
784     You have an important customer service announc...
Name: Message, Length: 1672, dtype: object

In [106]:
# Use transform (not fit_transform) on the test data.
# A dimension mismatch error occurs when the number of features in the test data
# does not match the number of features in the training data. This happens if
# fit_transform is called on both sets, because it builds a new vocabulary each time.
# Fitting only on the training data and calling transform here makes the test
# matrix use the training vocabulary, so the dimensions stay consistent.
x_test_count = v.transform(x_test)
# Slice first, then densify: only the three previewed rows are converted,
# instead of materialising the whole test matrix as a dense array.
x_test_count[:3].toarray()



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [107]:
# Predicted labels for the test messages (1 = spam, 0 = ham).
model.predict(x_test_count)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [108]:
# Accuracy of the fitted classifier on the held-out test set.
model.score(x_test_count,y_test)

0.9904306220095693

##### pipeline

In [109]:
from sklearn.pipeline import Pipeline

In [110]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

In [111]:
# Bundle vectorisation and classification into one estimator so raw text can be
# passed straight to fit/predict/score. Each step is a (name, estimator) pair.
clf = Pipeline(
    steps=[
        ("v", CountVectorizer()),    # raw text -> word-count matrix
        ("model", MultinomialNB()),  # classifier trained on those counts
    ]
)

In [112]:
# Fit both steps on the raw training text; the pipeline does the vectorising internally.
clf.fit(x_train,y_train)

Pipeline(steps=[('v', CountVectorizer()), ('model', MultinomialNB())])

In [113]:
# Predict directly from raw text; no separate transform call is needed.
clf.predict(x_test)

array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [114]:
# Accuracy of the pipeline on the raw test text; with the same split it matches the manual vectorise-then-fit result.
clf.score(x_test,y_test)

0.9904306220095693