## Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
import re

In [3]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [7]:
from sklearn.pipeline import Pipeline

In [8]:
# Evaluation helpers used by the model sections further down.
from sklearn.metrics import accuracy_score
# NOTE(review): `plot_confusion_matrix` was removed in scikit-learn 1.2, so this
# line raises ImportError on current releases. It is not called in the cells
# shown here — confirm and drop it (ConfusionMatrixDisplay is the replacement).
from sklearn.metrics import confusion_matrix , plot_confusion_matrix
from sklearn.metrics import classification_report

In [9]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. convergence or
# chained-assignment notices) — confirm that is intended.
warnings.filterwarnings('ignore')

## Load Dataset

In [10]:
# The file is tab-separated and read without a header row, so the two column
# names are supplied explicitly.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'messages'],
)
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(5572, 2)

## Data Cleaning

In [12]:
# Number of messages per label, to see how imbalanced the classes are.
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [13]:
# Count missing values in each column.
data.isnull().sum()

label       0
messages    0
dtype: int64

In [14]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
data[data.duplicated()]

Unnamed: 0,label,messages
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls
...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...
5535,ham,"I know you are thinkin malaria. But relax, chi..."
5539,ham,Just sleeping..and surfing
5553,ham,Hahaha..use your brain dear


In [15]:
# Keep the first occurrence of each repeated row; the original index labels are retained.
data = data.drop_duplicates()

In [16]:
# Dimensions after the duplicate rows were dropped.
data.shape

(5169, 2)

In [17]:
# Binary target column: 1 where the label is exactly 'spam', 0 for anything else.
data['spam'] = (data['label'] == 'spam').astype('int64')
data.head()

Unnamed: 0,label,messages,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Data Processing

In [18]:
# Message text is the model input; the 0/1 spam flag is the target.
X, y = data['messages'], data['spam']

In [19]:
# Single Porter stemmer instance used for every token during text normalisation.
ps = PorterStemmer()

In [20]:
# English stop words held in a set so each per-token membership test is a hash lookup.
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally; no
# download step is visible in this notebook.
stop_word = set(stopwords.words('english'))

In [21]:
def _normalize_message(text):
    """Lower-case ``text``, drop English stop words and Porter-stem the rest.

    Tokens come from plain whitespace splitting, so punctuation stays attached
    to its word (e.g. "point," is passed to the stemmer as-is).

    Parameters
    ----------
    text : str
        One raw message.

    Returns
    -------
    str
        The surviving stems joined by single spaces.
    """
    tokens = text.lower().split()
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_word)


# Build the cleaned corpus from the raw `data['messages']` column instead of
# overwriting X one element at a time. The result is the same on a first run,
# but the cell can now be re-run without stemming already-stemmed text, and it
# no longer assigns into a Series that was sliced out of `data`.
X = data['messages'].apply(_normalize_message)

In [22]:
# Inspect the normalised messages.
X

0       go jurong point, crazy.. avail bugi n great wo...
1                             ok lar... joke wif u oni...
2       free entri 2 wkli comp win fa cup final tkt 21...
3               u dun say earli hor... u c alreadi say...
4                   nah think goe usf, live around though
                              ...                        
5567    2nd time tri 2 contact u. u £750 pound prize. ...
5568                             ü b go esplanad fr home?
5569             pity, * mood that. so...ani suggestions?
5570    guy bitch act like i'd interest buy someth els...
5571                                      rofl. true name
Name: messages, Length: 5169, dtype: object

In [23]:
# One stratified shuffle: 30% of the rows go to the test side, with a fixed
# seed so the partition is reproducible.
ss_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

In [24]:
# Take the first (train, test) pair of integer index arrays from the split generator.
train_index ,test_index = next(ss_split.split(X,y))

In [25]:
# Peek at the row positions selected for training.
train_index

array([ 440, 2996, 5014, ..., 3717,  702, 2422], dtype=int64)

In [26]:
# The split yields row positions, hence positional (.iloc) selection.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]

In [27]:
# Slice the target with the same positions so labels stay aligned with the messages.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [28]:
# Class proportions in the training labels.
y_train.value_counts(normalize=True)

0    0.873687
1    0.126313
Name: spam, dtype: float64

In [29]:
# Class proportions in the test labels, for comparison with the training split.
y_test.value_counts(normalize=True)

0    0.87363
1    0.12637
Name: spam, dtype: float64

#### Bag of words

In [30]:
# Bag-of-words vectorizer limited to the 5000 most frequent tokens, i.e. at
# most 5000 feature columns.
cv = CountVectorizer(max_features=5000)

In [31]:
# Learn the vocabulary from the training messages only and expand the sparse
# count matrix into a dense array.
# The text is re-sliced from X rather than read from X_train: this cell
# overwrites X_train with numbers, so reading X_train here would make a second
# run feed an array into the vectorizer and fail.
X_train = cv.fit_transform(X.iloc[train_index]).toarray()

In [32]:
# Encode the test messages with the vocabulary fitted on the training data
# (transform only, no refit).
# The text is re-sliced from X rather than read from X_test: this cell
# overwrites X_test with numbers, so reading X_test here would break a re-run.
X_test = cv.transform(X.iloc[test_index]).toarray()

## Models

**1. Logistic Regression**

In [33]:
# Logistic regression classifier with scikit-learn's default hyper-parameters.
lr = LogisticRegression()

In [34]:
# Fit on the bag-of-words training matrix and its labels.
lr.fit(X_train,y_train)

LogisticRegression()

In [35]:
# Predicted labels for the held-out messages.
lr_pred = lr.predict(X_test)

In [36]:
# Fraction of test messages classified correctly; kept for the summary table.
lr_acc = accuracy_score(y_test,lr_pred)
lr_acc

0.9819471308833011

In [37]:
# Rows are true classes, columns are predicted classes (0 = ham, 1 = spam).
confusion_matrix(y_test,lr_pred)

array([[1353,    2],
       [  26,  170]], dtype=int64)

In [38]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test,lr_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1355
           1       0.99      0.87      0.92       196

    accuracy                           0.98      1551
   macro avg       0.98      0.93      0.96      1551
weighted avg       0.98      0.98      0.98      1551



**2. Naive Bayes**

In [39]:
# Multinomial Naive Bayes with default settings, fed the raw token counts.
nb = MultinomialNB()

In [40]:
# Fit on the same bag-of-words training matrix used for logistic regression.
nb.fit(X_train ,y_train)

MultinomialNB()

In [41]:
# Predicted labels for the held-out messages.
nb_pred = nb.predict(X_test)

In [42]:
# Fraction of test messages classified correctly; kept for the summary table.
nb_acc = accuracy_score(y_test,nb_pred)
nb_acc

0.9851708575112831

In [43]:
# Rows are true classes, columns are predicted classes (0 = ham, 1 = spam).
confusion_matrix(y_test ,nb_pred)

array([[1348,    7],
       [  16,  180]], dtype=int64)

In [44]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test,nb_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1355
           1       0.96      0.92      0.94       196

    accuracy                           0.99      1551
   macro avg       0.98      0.96      0.97      1551
weighted avg       0.99      0.99      0.99      1551



## Summary: Model Comparison

In [45]:
# One row per classifier, ranked from highest to lowest test accuracy, with
# the index renumbered after sorting.
models = (
    pd.DataFrame(
        [
            ('LogisticRegression', lr_acc),
            ('MultinomialNB', nb_acc),
        ],
        columns=["Model", "Accuracy"],
    )
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)
models

Unnamed: 0,Model,Accuracy
0,MultinomialNB,0.985171
1,LogisticRegression,0.981947


## Prediction on Data

In [46]:
# Display names indexed by the predicted class (0 -> 'Not Spam', 1 -> 'Spam').
email = ['Not Spam' ,'Spam']

def prediction_on_model(mail):
    """Classify messages with the fitted Naive Bayes model and print the verdicts.

    The training corpus was lower-cased, stripped of English stop words and
    Porter-stemmed before it was vectorised, so the same steps are applied to
    the input here. Without them, inflected words would never match the
    stemmed vocabulary held by ``cv``.

    Parameters
    ----------
    mail : str or list of str
        One raw message, or a list of raw messages.

    Returns
    -------
    None
        Prints one ``Email is ...`` line per message, in input order.
    """
    # Accept a bare string for convenience; the vectorizer itself only takes
    # an iterable of documents.
    if isinstance(mail, str):
        mail = [mail]

    # Same normalisation as the training text: whitespace tokens, stop words
    # removed, remaining tokens stemmed.
    cleaned = [
        ' '.join(ps.stem(word) for word in text.lower().split() if word not in stop_word)
        for text in mail
    ]
    if not cleaned:
        # Nothing to classify.
        return

    features = cv.transform(cleaned).toarray()
    for pred in nb.predict(features):
        print(f"Email is {email[pred]}")

In [47]:
# Sanity check with an everyday, non-promotional message.
mail = ['Play Football tomorrow']
prediction_on_model(mail)

Email is Not Spam


In [48]:
# Sanity check with a short promotional phrase.
mail = ['20% discount']
prediction_on_model(mail)

Email is Spam


In [49]:
# Sanity check with a longer promotional phrase.
mail = ['20% discount exclusive offer']
prediction_on_model(mail)

Email is Spam
