**Importing the required packages**

In [207]:
import numpy as np
import pandas as pd

**Reading Dataset File**

In [208]:
sms_dataFrame = pd.read_csv("spam.csv", encoding="ISO-8859-1")

**STUDYING THE DATASET(Data Exploration)**

Looking at the first ten rows

In [209]:
sms_dataFrame.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [210]:
sms_dataFrame.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""",GE,"GNT:-)"""
freq,4825,30,3,2,2


**Dropping the Unnamed columns 2,3 and 4 as they are of no use to us**

In [211]:
sms_dataFrame = sms_dataFrame.drop(sms_dataFrame.columns[[2,3,4]], axis=1)

In [212]:
sms_dataFrame.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


**Finding out number of ham and spam messages present in the dataset**

In [213]:
sms_dataFrame.groupby('v1').count()

Unnamed: 0_level_0,v2
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [214]:
sms_dataFrame.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


**Removing all special characters from the text messages except for a blank space**

In [215]:
sms_dataFrame['v2'] = sms_dataFrame['v2'].str.replace(r"[^a-zA-Z0-9\s]+", '')

In [216]:
sms_dataFrame.head(10)

Unnamed: 0,v1,v2
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
5,spam,FreeMsg Hey there darling its been 3 weeks now...
6,ham,Even my brother is not like to speak with me T...
7,ham,As per your request Melle Melle Oru Minnaminun...
8,spam,WINNER As a valued network customer you have b...
9,spam,Had your mobile 11 months or more U R entitled...


**Converting the dataset to lower case**

In [217]:
sms_dataFrame['v2'].str.lower()

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                   will  b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: v2, Length: 5572, dtype: object

**Encoding v1 with label encoder. Labelling ham as 0 and spam as 1**

In [218]:
from sklearn.preprocessing import LabelEncoder
labelEncoder = LabelEncoder()
sms_dataFrame['v1'] = labelEncoder.fit_transform(sms_dataFrame['v1'])  #set ham as 0 and spam as 1

In [219]:
sms_dataFrame.head(10)

Unnamed: 0,v1,v2
0,0,Go until jurong point crazy Available only in ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf he lives aroun...
5,1,FreeMsg Hey there darling its been 3 weeks now...
6,0,Even my brother is not like to speak with me T...
7,0,As per your request Melle Melle Oru Minnaminun...
8,1,WINNER As a valued network customer you have b...
9,1,Had your mobile 11 months or more U R entitled...


**Split into Training and Testing Datasets**

In [220]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(sms_dataFrame.v2 , sms_dataFrame.v1, test_size= 0.3)

**Using a count vectorizer**

In [221]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vector = vectorizer.fit_transform(X_train.values)
X_train_vector.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

**Training a Multinomial Naive Bias Classifier**

In [222]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB(alpha=1.0)
nb_model.fit(X_train_vector,y_train)

MultinomialNB()

**Testing Random data**

In [223]:
text_messages = ['Hey! I\'ll be late','Wassup!?','Reply to this message and get discount']
text_vector = vectorizer.transform(text_messages)

In [224]:
nb_model.predict(text_vector)

array([0, 0, 1])

**Finding The Accuracy**

In [225]:
X_test_vector = vectorizer.transform(X_test)

In [226]:
nb_model.score(X_test_vector, y_test)

0.9880382775119617

**Saving the Machine Learning Model to a file**

In [227]:
import pickle
pickle.dump(nb_model, open('naive_bayes_model', 'wb'))

**Saving the Vectorizer to a file**

In [228]:
pickle.dump(vectorizer, open('vectorizer', 'wb'))