## Problem Statement

Spam filtering using Naive Bayes classifier in order to predict whether a new mail based on content can be categorized as spam or ham.

Spam-- Fake messages Ham-- Good messages

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import string
import matplotlib.pyplot as plt

In [2]:
# Load the data
data=pd.read_csv("spam.tsv",sep='\t',names=['Class','Message'])

In [3]:
data

Unnamed: 0,Class,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!
...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...
5563,ham,Will ü b going to esplanade fr home?
5564,ham,"Pity, * was in mood for that. So...any other s..."
5565,ham,The guy did some bitching but I acted like i'd...


In [4]:
# create a column to keep the count of the characters present in each record
data["Length"]=data["Message"].apply(len)

In [5]:
data.head()

Unnamed: 0,Class,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [6]:
data.describe()

Unnamed: 0,Length
count,5567.0
mean,80.450153
std,59.891023
min,2.0
25%,36.0
50%,62.0
75%,122.0
max,910.0


In [7]:
# lets see the count of each class
data["Class"].value_counts()

ham     4821
spam     746
Name: Class, dtype: int64

### Text Preprocessing

In [8]:
# Lets assign ham as 1
data.loc[data["Class"]=="ham","Class"]=1

In [9]:
# Lets assign spam as 0
data.loc[data["Class"]=="spam","Class"]=0

In [10]:
data.head()

Unnamed: 0,Class,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36


In [11]:
# get the default list of punctuations in python
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
# CReating a function to remove the punctuation
def remove_punct(text):
    text="".join([char for char in text if char not in string.punctuation])
    return text

In [13]:
text=[]
for i in data["Message"]:
    t=remove_punct(i)
    text.append(t)

In [14]:
data["Clean_Text"]=text
data.head()

Unnamed: 0,Class,Message,Length,Clean_Text
0,1,I've been searching for the right words to tha...,196,Ive been searching for the right words to than...
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
2,1,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
3,1,Even my brother is not like to speak with me. ...,77,Even my brother is not like to speak with me T...
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!!,36,I HAVE A DATE ON SUNDAY WITH WILL


In [15]:
x=data["Clean_Text"]
y=data["Class"]

In [16]:
x

0       Ive been searching for the right words to than...
1       Free entry in 2 a wkly comp to win FA Cup fina...
2       Nah I dont think he goes to usf he lives aroun...
3       Even my brother is not like to speak with me T...
4                       I HAVE A DATE ON SUNDAY WITH WILL
                              ...                        
5562    This is the 2nd time we have tried 2 contact u...
5563                  Will ü b going to esplanade fr home
5564    Pity  was in mood for that Soany other suggest...
5565    The guy did some bitching but I acted like id ...
5566                            Rofl Its true to its name
Name: Clean_Text, Length: 5567, dtype: object

In [17]:
y

0       1
1       0
2       1
3       1
4       1
       ..
5562    0
5563    1
5564    1
5565    1
5566    1
Name: Class, Length: 5567, dtype: object

In [18]:
# Datatypes for y is object. Lets convert into int
y=y.astype("int")
y

0       1
1       0
2       1
3       1
4       1
       ..
5562    0
5563    1
5564    1
5565    1
5566    1
Name: Class, Length: 5567, dtype: int32

### Splitting data into train and test

In [19]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
print(x_train.shape)
print(x_test.shape)

(4453,)
(1114,)


## Bag of Words

In [20]:
cv=CountVectorizer(stop_words="english")

In [21]:
# Apply countvectorizer functionality on the training data to convert
x_train_cv=cv.fit_transform(x_train)

### Build a model

#### BOW on Multinomial :

In [22]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(x_train_cv,y_train)

In [23]:
x_test_cv=cv.transform(x_test)

In [24]:
y_pred=model.predict(x_test_cv)

In [25]:
acc1=accuracy_score(y_test,y_pred)
print(acc1)

0.9820466786355476


In [26]:
# Classification report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       167
           1       0.99      0.99      0.99       947

    accuracy                           0.98      1114
   macro avg       0.97      0.95      0.96      1114
weighted avg       0.98      0.98      0.98      1114



In [27]:
## Confusion matrix
pd.crosstab(y_test,y_pred)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,153,14
1,6,941


#### BOW on Bernoulli :

In [28]:
# Bernoulli model
from sklearn.naive_bayes import BernoulliNB
model1=BernoulliNB(alpha=0.01)
model1.fit(x_train_cv,y_train)

In [29]:
x_test_cv=cv.transform(x_test)

In [30]:
y_pred=model1.predict(x_test_cv)

In [31]:
acc2=accuracy_score(y_test,y_pred)
print(acc2)

0.9856373429084381


In [32]:
# Classification report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95       167
           1       0.98      1.00      0.99       947

    accuracy                           0.99      1114
   macro avg       0.99      0.95      0.97      1114
weighted avg       0.99      0.99      0.99      1114



In [33]:
## Confusion matrix
pd.crosstab(y_test,y_pred)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,152,15
1,1,946


### Applying TFIDF vectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv=CountVectorizer(stop_words="english")
tf=TfidfVectorizer()
x_train_cv=tf.fit_transform(x_train)

#### TFIDF on multinomial :

In [35]:
nb=MultinomialNB()
nb.fit(x_train_cv,y_train)

In [36]:
x_test_cv=tf.transform(x_test)

In [37]:
y_predict=nb.predict(x_test_cv)

In [38]:
acc3=accuracy_score(y_test,y_predict)
print(acc3)

0.9497307001795332


In [39]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       1.00      0.66      0.80       167
           1       0.94      1.00      0.97       947

    accuracy                           0.95      1114
   macro avg       0.97      0.83      0.88      1114
weighted avg       0.95      0.95      0.95      1114



In [40]:
pd.crosstab(y_test,y_pred)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,152,15
1,1,946


#### TFIDF on Bernoulli:

In [41]:
# Bernoulli model
from sklearn.naive_bayes import BernoulliNB
nb1=BernoulliNB(alpha=0.01)
nb1.fit(x_train_cv,y_train)

In [42]:
y_predict=nb1.predict(x_test_cv)

In [43]:
acc4=accuracy_score(y_test,y_predict)
print(acc4)

0.9847396768402155


In [44]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.99      0.90      0.95       167
           1       0.98      1.00      0.99       947

    accuracy                           0.98      1114
   macro avg       0.99      0.95      0.97      1114
weighted avg       0.98      0.98      0.98      1114



In [45]:
pd.crosstab(y_test,y_predict)

col_0,0,1
Class,Unnamed: 1_level_1,Unnamed: 2_level_1
0,151,16
1,1,946


In [46]:
pip install prettytable




In [52]:
from prettytable import PrettyTable

# Specify the Column Names while initializing the Table
myTable = PrettyTable(["Algorithms","Accuracy Score"])

myTable.add_row(["Multinomial(Bag of words)", acc1])
myTable.add_row(["bernoulli(Bag of words)", acc2])
myTable.add_row(["Multinomial(TfidfVectorizer)", acc3])
myTable.add_row(["Bernoulli(TfidfVectorizer)", acc4])
print(myTable)

+------------------------------+--------------------+
|          Algorithms          |   Accuracy Score   |
+------------------------------+--------------------+
|  Multinomial(Bag of words)   | 0.9820466786355476 |
|   bernoulli(Bag of words)    | 0.9856373429084381 |
| Multinomial(TfidfVectorizer) | 0.9497307001795332 |
|  Bernoulli(TfidfVectorizer)  | 0.9847396768402155 |
+------------------------------+--------------------+
