 # ML Model - SMS SPAM DETECTION MODEL IN NLP (BASIC)
 ###### READ IN TEXT

In [1]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

#### CREATE FUNTION TO REMOVE PUNCTUATIONS, TOKENIZE, REMOVE STOPWORDS, and STEM


In [3]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split(r'\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

#### APPLY COUNTVECTORIZER 

In [12]:
import numpy as np
np.set_printoptions(threshold=np.inf)

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['body_text'])
print(X_counts.shape)
print(count_vect.get_feature_names_out())
print(X_counts)

(5567, 8104)
['' '0' '008704050406' '0089mi' '0121' '01223585236' '01223585334'
 '0125698789' '02' '020603' '0207' '02070836089' '02072069400'
 '02073162414' '02085076972' '020903' '021' '050703' '0578' '06' '060505'
 '061104' '07008009200' '07046744435' '07090201529' '07090298926'
 '07099833605' '071104' '07123456789' '0721072' '07732584351'
 '07734396839' '07742676969' '07753741225' '0776xxxxxxx' '07786200117'
 '077xxx' '078' '07801543489' '07808' '07808247860' '07808726822'
 '07815296484' '07821230901' '0784987' '0789xxxxxxx'
 '0794674629107880867867' '0796xxxxxx' '07973788240' '07xxxxxxxxx' '0800'
 '08000407165' '08000776320' '08000839402' '08000930705' '08000938767'
 '08001950382' '08002888812' '08002986030' '08002986906' '08002988890'
 '08006344447' '0808' '08081263000' '08081560665' '0825' '0844'
 '08448350055' '08448714184' '0845' '08450542832' '08452810071'
 '08452810073' '08452810075over18' '0870' '08700621170150p' '08701213186'
 '08701237397' '08701417012' '08701417012150p' 

#### SEPERATING DEPENDENT AND INDEPENDENT VARIABLE

In [17]:
X = X_counts.toarray()
y = data.iloc[:,0].values
print(y)

['spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'spam' 'ham' 'spam' 'spam' 'spam'
 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'spam' 'ham' 'spam' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'spam' 'ham' 'ham' 'spam' 'spam'
 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'spam' 'spam' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'spam' 'spam' 'ham' 'ham' 'ham' 'spam

# ENCODING THE DEPENDENT VARIABLE 

In [19]:
#To deal with categorical data we had to convert it into numbers
from sklearn.preprocessing import LabelEncoder
labelencoder_y=LabelEncoder()
#Converting First Column into Integer Values
y=labelencoder_y.fit_transform(y)
print(y)


[1 0 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 

# SPLITTING THE DATASET

In [23]:
# SPLITING THE DATASET INTO THE TRAINING SET AND TEST SET
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)


# Using Naive Bayes Classifier

In [26]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
print(classifier.get_params())


{'priors': None, 'var_smoothing': 1e-09}


# PREDICTING RESULTs


In [30]:
# PREDICTING THE TEST SET RESULTS
y_pred = classifier.predict(X_test)

# CONFUSION MATRIX 

In [32]:
# MAKING THE CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)


In [33]:
cm

array([[841, 118],
       [ 12, 143]], dtype=int64)

In [34]:
(841+143)/(841+143+118+12)

0.8833034111310593