In [1]:
#Loading the Libraries
import pandas as pd # Pandas is used to analyze data.
import numpy as np  # NumPy is a library used for working with arrays.

In [2]:
# Load the SMS spam collection CSV, decoding it as ISO-8859-1
messages = pd.read_csv(
    '../input/sms-spam-collection-dataset/spam.csv',
    encoding='ISO-8859-1',
)

In [3]:
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Working frame with only the label column (v1) and the message text column (v2);
# the "Unnamed: 2..4" columns shown in the preview above are left out.
df = pd.DataFrame( messages,columns=['v1','v2'])

In [5]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Exploratory Data Analysis


In [6]:
#Info for the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
#Finding Missing Values
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [8]:
#Rows and Columns
df.shape

(5572, 2)

In [9]:
# Target variable counts (ham vs. spam), taken from the working frame `df`
# like the other exploratory cells in this section.
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

## Data Processing

In [10]:
#Calculating Length of Message
# Character count of every message in one vectorised pass. Reading straight
# from df['v2'] avoids sizing the loop from a different frame (`messages`)
# and avoids label-based df['v2'][i] lookups, which only work on a 0..n-1 index.
length = df['v2'].str.len().tolist()

In [11]:
#Adding Length column to the dataframe
df['length']=length

In [12]:
df.head()

Unnamed: 0,v1,v2,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [13]:
#Calculating Punctuations in each message

import string

# Set membership is a constant-time lookup (string.punctuation itself is a str).
punctuation_chars = set(string.punctuation)

# One entry per message: how many of its characters are ASCII punctuation.
# Iterating the column directly does not depend on the index labels being 0..n-1.
punct = [
    sum(ch in punctuation_chars for ch in message)
    for message in df['v2']
]

In [14]:
df['Punctuation']=punct

In [15]:
df.head()

Unnamed: 0,v1,v2,length,Punctuation
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


## 3.1 Text Cleaning

In [16]:
# Regular expressions (standard library) - used below to strip non-letter characters
import re

# English stop-word list from NLTK
# NOTE(review): presumably the NLTK 'stopwords' and 'wordnet' corpora must already be
# available in the runtime (e.g. via nltk.download) - confirm for your environment
from nltk.corpus import stopwords

# WordNet-based lemmatizer
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance reused for every message in the cleaning loop
lemmatizer=WordNetLemmatizer()

In [17]:
# Text cleaning: keep letters only, lowercase, drop English stop words and
# lemmatize the remaining words. Every row of df is processed (none skipped).

# Build the stop-word set once. Rebuilding it inside the comprehension re-reads
# the NLTK word list for every single word of every message.
stop_words = set(stopwords.words('english'))

corpus = []
for message in df['v2']:
    # Replace anything that is not an ASCII letter (digits, punctuation, symbols) with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', message)

    # Lowercase, then split on whitespace into a list of words
    tokens = letters_only.lower().split()

    # Drop stop words; lemmatize everything that is kept
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Re-join the surviving words into a single cleaned sentence
    corpus.append(' '.join(kept))
   

In [18]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [19]:
#now we will replace the messages with new 'removed stopwords,lower messages'
df['v2']=corpus

In [20]:
df.head()

Unnamed: 0,v1,v2,length,Punctuation
0,ham,go jurong point crazy available bugis n great ...,111,9
1,ham,ok lar joking wif u oni,29,6
2,spam,free entry wkly comp win fa cup final tkts st ...,155,6
3,ham,u dun say early hor u c already say,49,6
4,ham,nah think go usf life around though,61,2


## 3.2 Analysing the difference between Spam and Ham messages

In [21]:
spam_messages=df[df['v1']=='spam']
ham_messages=df[df['v1']=='ham']

In [22]:
spam_messages.head()

Unnamed: 0,v1,v2,length,Punctuation
2,spam,free entry wkly comp win fa cup final tkts st ...,155,6
5,spam,freemsg hey darling week word back like fun st...,148,8
8,spam,winner valued network customer selected receiv...,158,6
9,spam,mobile month u r entitled update latest colour...,154,2
11,spam,six chance win cash pound txt csh send cost p ...,136,8


In [23]:
ham_messages.head()

Unnamed: 0,v1,v2,length,Punctuation
0,ham,go jurong point crazy available bugis n great ...,111,9
1,ham,ok lar joking wif u oni,29,6
3,ham,u dun say early hor u c already say,49,6
4,ham,nah think go usf life around though,61,2
6,ham,even brother like speak treat like aid patent,77,2


In [24]:
spam_messages['Punctuation'].mean()

5.714859437751004

In [25]:
spam_messages['length'].mean()

138.8661311914324

In [26]:
ham_messages['Punctuation'].mean()

3.9745077720207256

In [27]:
ham_messages['length'].mean()

71.02362694300518

## Model Building & Training

In [28]:
X=df['v2']

In [29]:
X.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts st ...
3                  u dun say early hor u c already say
4                  nah think go usf life around though
Name: v2, dtype: object

In [30]:
y=df['v1']

In [31]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

## 4.1 Train Test Split

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 20% of the messages as a test set; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified even though spam is the minority class
# (747 of 5572 rows) - consider stratify=y if the class ratio should be preserved.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=42)

## 4.2 Dealing with Text (Natural Language) Data

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): this standalone `tfidf` instance is not referenced by any later cell
# visible here; the pipeline further down constructs its own TfidfVectorizer.
tfidf=TfidfVectorizer()

In [35]:
from sklearn.feature_extraction.text import CountVectorizer 

In [36]:
cv=CountVectorizer(max_features=5772)         # bag-of-words (raw term count) vectorizer; max_features caps the vocabulary at 5772 terms

In [37]:
# NOTE(review): despite the "tfidf" in the name, this holds raw term counts produced by
# the CountVectorizer `cv`, converted to a dense array with .toarray(). No later cell
# visible here feeds it to a model; the pipeline below vectorizes X_train on its own.
X_train_tfidf_vect=cv.fit_transform(X_train).toarray()

In [38]:
X_train_tfidf_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [39]:
X_train_tfidf_vect.shape

(4457, 5772)

## Multinomial Naive Bayes Classifier

In [40]:
from sklearn.naive_bayes import MultinomialNB

In [41]:
from sklearn.pipeline import Pipeline


In [42]:
# Two-step text classifier: TF-IDF features feeding a Multinomial Naive Bayes model
text_mnb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('mnb', MultinomialNB()),
    ]
)

In [43]:
text_mnb.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [44]:
y_preds_mnb=text_mnb.predict(X_test)

In [45]:
y_preds_mnb

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'spam'], dtype='<U4')

In [46]:
text_mnb.score(X_train,y_train)

0.979582678932017

In [47]:
text_mnb.score(X_test,y_test)

0.9632286995515695

## Classification Report

In [48]:
from sklearn.metrics import confusion_matrix

In [49]:
# Rows are the true labels and columns the predicted labels, both in the order ham, spam
print(confusion_matrix(y_test,y_preds_mnb))

[[965   0]
 [ 41 109]]


In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_test,y_preds_mnb))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [52]:
from sklearn.metrics import accuracy_score

In [53]:
print(accuracy_score(y_test,y_preds_mnb))

0.9632286995515695
