# SMS SPAM CLASSIFIER

In [1]:
# import necessary ilbraries
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os

# pre-processing and model libraries
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



# word libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import wordcloud

In [2]:
# import and preview datasets 
data1 = pd.read_csv('spam.csv', encoding='latin-1')
data2 = pd.read_csv('spam_additional.csv')
data1.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# rename the columns to look like the data1 label
data2 = data2[['Text', 'spam']]
data2.rename(columns={'Text':'Msg','spam':'Label'},inplace=True)
data2.head()

Unnamed: 0,Msg,Label
0,"Dear Customer,\nThis is a confirmation that th...",spam
1,"Hello ADEYINKA, present your CV for interview ...",spam
2,Edgerit Limited invites you for an interview f...,spam
3,You are invited for an aptitude test with UBA ...,spam
4,"Mag consult ltd, shortlisted (you/candidate no...",spam


In [4]:
# check dataset properties
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     440 non-null    object
 1   Label   440 non-null    object
dtypes: object(2)
memory usage: 7.0+ KB


In [6]:
# drop the last three axis, rename the first two columns
data1.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
data1.rename(columns={'v1':'Label','v2':'Msg'},inplace=True)
data1.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# merge both data
data = data1.append(data2,  ignore_index=True, sort=False)
data.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6012 entries, 0 to 6011
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   6012 non-null   object
 1   Msg     6012 non-null   object
dtypes: object(2)
memory usage: 94.1+ KB


# Data Preprocessing

In [9]:
# download stopwords from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Praise\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# replace symbols, convert all texts to lowercase, split the texts(tokenize) and stemmize the words(elder brother to lammetizing)
ps = PorterStemmer() 
content = []
for i in range(0, data.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', data['Msg'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    content.append(review)

In [11]:
# create two columns, one containing cleaned message content and the other cleaned content length
data['clean_msg']=np.empty((len(content),1))
for i in range(len(content)):
    data['clean_msg'][i]=content[i]
data['clean_msg_len']=data['clean_msg'].apply(len)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Label,Msg,clean_msg,clean_msg_len
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36


In [12]:
# Perform one-hot encoding on label column(ham = 1 and spam = 0)
data=pd.concat([data, pd.get_dummies(data['Label'])], axis=1)
data.drop(['Label'],axis=1,inplace=True)
data.drop(['spam'],axis=1,inplace=True)
data.rename(columns={'ham':'label'},inplace=True)
data.head()

Unnamed: 0,Msg,clean_msg,clean_msg_len,label
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76,1
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21,1
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99,0
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35,1
4,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36,1


In [13]:
# Vectorization
# Creating X and Y: Assign clean_msg column to X, label column to Y, perform countvectorization on both X and Y
X=data['clean_msg']
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(content).toarray()

Y = data['label']
Y = np.array(Y)

# preview countvectorization of X and Y
print('X = \n', X, '\nY = \n', Y)

X = 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 
Y = 
 [1 1 0 ... 0 0 0]


# Model Training

In [14]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)

# priview train and test set sizes
print("X_train shape: {}\n X_test shape: {}\nY_train shape: {}\nY_test shape: {}".format(X_train.shape,X_test.shape,y_train.shape,y_test.shape))

X_train shape: (4509, 2500)
 X_test shape: (1503, 2500)
Y_train shape: (4509,)
Y_test shape: (1503,)


In [15]:
# list for storing accuracy score of different algorithms
accuracies = []
precisions = []

**Generally for all models:**
 - Assign Model to its variable
 - Fit Model on train data
 - Test Model on test data
 - add accuracy to accuracies list
 - preview accuracy

**NAIVE BAYES CLASSIFIER** 

In [16]:
nb_model = MultinomialNB().fit(X_train, y_train)
nb_predict = nb_model.predict(X_test)
nb_accuracy = metrics.accuracy_score(y_test,nb_predict)
nb_precision = metrics.precision_score(y_test,nb_predict)
accuracies.append(nb_accuracy)
precisions.append(nb_precision)
print("Accuracy of Naive Bayes Classifier is: {}".format(nb_accuracy), "Precision of Naive Bayes Classifier is: {}".format(nb_precision))

Accuracy of Naive Bayes Classifier is: 0.9673985362608117 Precision of Naive Bayes Classifier is: 0.9822485207100592


**LOGISTIC REGRESSION**

In [17]:
lr_model=LogisticRegression(solver='liblinear')
lr_model.fit(X_train,y_train)
lr_predict = lr_model.predict(X_test)
lr_accuracy = metrics.accuracy_score(y_test,lr_predict)
lr_precision = metrics.precision_score(y_test,lr_predict)
accuracies.append(lr_accuracy)
precisions.append(lr_precision)
print("Accuracy of Logistic Rregression is: {}".format(lr_accuracy), "Precision of NLogistic Regression is: {}".format(lr_precision))

Accuracy of Logistic Rregression is: 0.9820359281437125 Precision of NLogistic Regression is: 0.9793899422918384


**SVM**

In [18]:
svm_model= SVC(kernel='rbf')
svm_model.fit(X_train, y_train) 
svm_predict = svm_model.predict(X_test)
svm_accuracy = metrics.accuracy_score(y_test,svm_predict)
svm_precision = metrics.precision_score(y_test,svm_predict)
accuracies.append(svm_accuracy)
precisions.append(svm_precision)
print("Accuracy of SVM is: {}".format(svm_accuracy), "Precision of SVM is: {}".format(svm_precision))

Accuracy of SVM is: 0.9767132401862941 Precision of SVM is: 0.9824561403508771


**RANDOM FOREST CLASSIFIER**

In [19]:
rf_model = RandomForestClassifier(n_estimators=50)
rf_model.fit(X_train,y_train)
rf_predict = rf_model.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test,rf_predict)
rf_precision = metrics.precision_score(y_test,rf_predict)
accuracies.append(rf_accuracy)
precisions.append(rf_precision)
print("Accuracy of Random Forest is: {}".format(rf_accuracy), "Precision of Random Forest is: {}".format(rf_precision))

Accuracy of Random Forest is: 0.9760479041916168 Precision of Random Forest is: 0.9776490066225165


**GRADIENT BOOSTING CLASSIFIER**

In [20]:
gb_model = GradientBoostingClassifier(random_state=100, n_estimators=150,min_samples_split=100, max_depth=6)
gb_model.fit(X_train, y_train)
gb_predict = gb_model.predict(X_test)
gb_accuracy = metrics.accuracy_score(y_test,gb_predict)
gb_precision = metrics.precision_score(y_test,gb_predict)
accuracies.append(gb_accuracy)
precisions.append(gb_precision)
print("Accuracy of Gradient Boost is: {}".format(gb_accuracy), "Precision of Gradient Boost is: {}".format(gb_precision))

Accuracy of Gradient Boost is: 0.9627411842980705 Precision of Gradient Boost is: 0.9670510708401977
