# Naive Bayes Classifier

# Objective:
To filter mobile phone spam using the Naive Bayes algorithm

# Dataset: sms_spam.csv
https://github.com/stedy/Machine-Learning-with-R-datasets

# 1. Load necessary libraries

In [2]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report

# 2. Load data

In [3]:
df = pd.read_csv("spamsms.csv",encoding = 'latin-1')
df.drop(["Unnamed: 2", 'Unnamed: 3', 'Unnamed: 4'], axis=1,inplace=True)
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 3. Exploring and preparing the data 

In [4]:
df.groupby('type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [41]:
#creatig a new column using type data convert into numerical data called spam
df['spam'] = df['type'].apply(lambda x: 1 if x=='spam' else 0)
df

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


# 4. Create training and test datasets

In [42]:
x_train, x_test, y_train, y_test = train_test_split(df.text,df.spam,test_size=0.30)
x_train.describe()

count                       3900
unique                      3695
top       Sorry, I'll call later
freq                          20
Name: text, dtype: object

# 5. Create indicator features for frequent words

In [46]:
cv = CountVectorizer()
x_train_count =cv.fit_transform(x_train.values)
x_train_count

<3900x7065 sparse matrix of type '<class 'numpy.int64'>'
	with 51275 stored elements in Compressed Sparse Row format>

# 6. Train a model on the data

In [47]:
model = MultinomialNB()
model.fit(x_train_count,y_train)

# 7. Evaluate model performance

In [48]:
mail_ham = ['hey how are you?']
mail_ham_count = cv.transform(mail_ham)

In [49]:
model.predict(mail_ham_count)

array([0], dtype=int64)

In [38]:
mail_spam = ['you won reward upto 10000 click below']
mail_spam_count = cv.transform(mail_spam)
model.predict(mail_spam_count)

array([1], dtype=int64)

In [50]:
x_test_count = cv.transform(x_test)
model.score(x_test_count,y_test)


0.9898325358851675

# 8. Measure performance for classifcation

In [52]:
y_pred = model.predict(x_test_count)
print(confusion_matrix(y_test,y_pred))

[[1439    0]
 [  17  216]]


In [53]:
df_table = confusion_matrix(y_test,y_pred)
a = (df_table[0,0] + df_table[1,1]) / (df_table[0,0] + df_table[0,1] + df_table[1,0] + df_table[1,1])
p = df_table[1,1] / (df_table[1,1] + df_table[0,1])
r = df_table[1,1] / (df_table[1,1] + df_table[1,0])
f = (2 * p * r) / (p + r)

print("accuracy : ",round(a,2))
print("precision: ",round(p,2))
print("recall   : ",round(r,2))
print("F1 score : ",round(f,2))

accuracy :  0.99
precision:  1.0
recall   :  0.93
F1 score :  0.96
