# Spam Filtering Using Naive Bayes

#### Import required libraries

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#### Load dataset

In [2]:
# Location of the spam CSV. Set the SPAM_CSV_PATH environment variable to
# point at a copy of the dataset on another machine; when it is unset the
# original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", "D:\\spam.csv")
df = pd.read_csv(DATA_PATH)

#### Explore dataset

In [3]:
# Preview the first five rows to check the loaded columns.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarise the data per category (count, unique, top, freq).
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


#### Preprocessing data

In [5]:
# Encode the target as an integer flag: 1 when Category is 'spam', 0 for
# anything else. The vectorized comparison replaces a per-row Python lambda;
# the explicit int64 cast keeps the dtype the lambda-based version produced.
df['Spam'] = (df['Category'] == 'spam').astype('int64')

In [6]:
# Display the frame to confirm the new 'Spam' column sits next to 'Category' and 'Message'.
df

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


#### Splitting dataset

In [7]:
# Features are the raw message text; the target is the 0/1 'Spam' flag.
x, y = df['Message'], df['Spam']

In [8]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [9]:
# Inspect the training messages produced by the split.
x_train

5281               And how you will do that, princess? :)
1300    Great to hear you are settling well. So what's...
5128    Wot about on wed nite I am 3 then but only til 9!
1808      Do have a nice day today. I love you so dearly.
856     Talk sexy!! Make new friends or fall in love i...
                              ...                        
1180                  To day class is there are no class.
3441                       What time you thinkin of goin?
1344    Crazy ar he's married. Ü like gd looking guys ...
4623                They finally came to fix the ceiling.
1289                             Happy new year to u too!
Name: Message, Length: 4457, dtype: object

In [10]:
# Report how many rows landed in each part of the split.
print("x_train :", x_train.shape[0])
print("x_test :", x_test.shape[0])
print("y_train :", y_train.shape[0])
print("y_test :", y_test.shape[0])

x_train : 4457
x_test : 1115
y_train : 4457
y_test : 1115


#### Feature extraction

In [11]:
# Fit a CountVectorizer on the training messages only and encode them as a
# sparse matrix of word counts (one row per message). The fitted `cv` is
# reused later to transform the test messages and the sample texts.

cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [12]:
# Show the repr of the training count matrix (shape, dtype, stored elements).
x_train_count

<4457x7723 sparse matrix of type '<class 'numpy.int64'>'
	with 59346 stored elements in Compressed Sparse Row format>

#### Training the classifier

In [13]:
# Train a multinomial Naive Bayes classifier on the training count matrix.
# The bare fit(...) call stays last so the cell still echoes the estimator.
model = MultinomialNB()
model.fit(X=x_train_count, y=y_train)

MultinomialNB()

In [14]:
# Encode the test messages with the already-fitted vectorizer
# (transform only, so nothing is re-learned from the test set).
x_test_count = cv.transform(raw_documents=x_test.values)

#### Making prediction

In [15]:
# Predict a 0/1 label for every test message.
y_pred = model.predict(X=x_test_count)

In [16]:
# Peek at the predicted labels for the test messages.
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
# Put the true labels and the predictions side by side, keeping the original
# row index of each test message.
data = y_test.to_frame(name='Actual :').assign(**{'Predicted :': y_pred})
data

Unnamed: 0,Actual :,Predicted :
4635,0,0
2279,0,0
4545,0,0
5084,0,0
5298,0,0
...,...,...
2357,0,0
1174,0,0
192,0,0
3606,0,0


#### Evaluating model

In [18]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [19]:
# Confusion matrix for the test set: each row is a true class, each column a predicted class.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[967,   5],
       [ 14, 129]], dtype=int64)

In [20]:
# Per-class precision, recall, F1 and support on the test set.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.96      0.90      0.93       143

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [21]:
# Overall fraction of test messages classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9829596412556054

#### Testing with sample messages (1 = spam, 0 = not spam)

In [22]:
# Casual, ham-style message: encode it with the fitted vectorizer, then classify.
email = [
    "hey!! wanna meet up for studies ?",
]
email_count = cv.transform(raw_documents=email)
model.predict(X=email_count)

array([0], dtype=int64)

In [23]:
# Short promotional message: encode it with the fitted vectorizer, then classify.
email = [
    "click for money reward",
]
email_count = cv.transform(raw_documents=email)
model.predict(X=email_count)

array([1], dtype=int64)

In [27]:
# Long competition-style message: encode it with the fitted vectorizer, then classify.
email = [
    "Free entry in 2 a weekly competition to win FA Cup final tkts 21st May 2024. Text FA to 87121 to receive entry question(std txt rate). T&Cs apply 08452810075over18's",
]
email_count = cv.transform(raw_documents=email)
model.predict(X=email_count)

array([1], dtype=int64)

In [28]:
# Ordinary reminder message: encode it with the fitted vectorizer, then classify.
email = [
    "Don't forget to submit the project report by the end of the day.",
]
email_count = cv.transform(raw_documents=email)
model.predict(X=email_count)

array([0], dtype=int64)

# Our model is about 98% accurate on the held-out test set (spam precision 0.96, recall 0.90)