# SMS Spam Detection

In [248]:
#Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [249]:
#Import the dataset
spamham = pd.read_csv(r'G:\ML\Datasets\spam.csv',encoding='latin-1')

In [250]:
#Dropping the Unnamed columns
spamham = spamham.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])

In [251]:
#Renaming columns v1 and v2 for identification
spamham = spamham.rename(columns={"v1": "labels", "v2": "messages"})

In [252]:
spamham.head(5)

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [253]:
#Check the dimensions of the dataset
spamham.shape

(5572, 2)

In [254]:
spamham.size

11144

In [255]:
#Checking datatypes
spamham.dtypes

labels      object
messages    object
dtype: object

In [256]:
spamham.columns

Index(['labels', 'messages'], dtype='object')

In [257]:
#Checking the dataset
spamham.groupby('labels').describe()

Unnamed: 0_level_0,messages,messages,messages,messages
Unnamed: 0_level_1,count,unique,top,freq
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [258]:
#Checking for null values
spamham.isnull().sum()

labels      0
messages    0
dtype: int64

1. From the dataset it can be seen that there are no null values. 
2. There are 5572 rows and 2 columns.
3. No Spam messages are 4825 in count.
4. Spam messages are 747 in count.
5. Features are of type object ie, string.
6. There are two features "labels" and "messages"

In [259]:
#Adding extra column length to the dataset
spamham['length'] = spamham['messages'].apply(len)

In [260]:
spamham.groupby('labels').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4825.0,71.023627,58.016023,2.0,33.0,52.0,92.0,910.0
spam,747.0,138.866131,29.183082,13.0,132.5,149.0,157.0,224.0


# Text Preprocessing

Most of the classiication problems we are dealing with use algorithms which need numerical data for analysis. In our case here we cannot proceed with the existing categorical data for model building and prediction.We need to convert the string into vector form before using it for building model.
ML terms this is know as "feature extraction". I will use one of the feature extraction technique called "Bag of Words".


Bag of Words technique helps us "Design the Vocabulary" and then create a "Vector Matrix" which can be furthur used for model building and prediction.

Before we go ahead with feature extraction we need to preprocess the text. That is we will remove the unnecessary puntuactions and stop words which helps in better and fast execution of the model.

In [261]:
#function to remove unnecessary punctuations,stop words,converting text into same text format
def text_processing(m):
    l = m.lower()
    nopunc = [c for c in l if c not in string.punctuation]
    nopunc=''.join(nopunc)
    #Tokenise using built in NLTK library word tokenization, that converts the string of words into list of words
    w = word_tokenize(nopunc)
    #removal of stop words
    cm = [m for m in w if m not in stopwords.words('english')]
    return cm

In [262]:
spamham['messages'].head(5).apply(text_processing)

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: messages, dtype: object

# Vectorization

Vectorization is a very important part of "feature extraction", once that we design the vocabulary than the text(tokens) is converted into a vector matrix. In vectorization the count of number of times the word(token) appear in the sentence is calculated and then the importance of that word(token) is decided based on the IDF(Inverse Document Frequency) value. Here the frequently appearing word is weighed less.

I will use CountVectorizer for Vocab design and conversion into matrix of token count and TFIDF Vectorizer for deciding the significance of each token by weighing them.


Before moving forward I need to split the dataset into Test and Train sets. 

# Split the Data into Train and Test

In [263]:
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(spamham['messages'],spamham['labels'],test_size=0.2)

Now that we have two datasets one is test and another is train we need to implement the text preprocessing and vectorization steps atleat twice. Its bit time consuming, instead we will use python inbuilt package "Pipeline" for doing multiple activities in go including under same function. Using costume functions in another options but it is quite time consuming and code length increases. So out of the available resources I am using the best one. I am implementing "Pipeline" package.


We will be using the Naive Bayes classifier model for prediccting the messages. 
Then we will evaluate how well the model has performed using python's in built confusion matrix package. 

In [264]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
   ( 'bow',CountVectorizer(analyzer=text_processing)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB()),
])

In [265]:
#Training the model. 
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_processing at 0x0000016664582488>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproce...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [266]:
#Making the prediction after training the model.
predictions = pipeline.predict(msg_test)

In [267]:
#Evaluating the model used by using confusion matrix and generating report using classification report packages
print(classification_report(predictions,label_test))

             precision    recall  f1-score   support

        ham       1.00      0.95      0.98       993
       spam       0.73      1.00      0.84       122

avg / total       0.97      0.96      0.96      1115



So the classifier used gives considerably good result with recall and precision values 
being more then 95%. Probablity of Spam being detected as no spam is quite less. 