# SMS Spam Detection Model
---

Download the dataset here: [https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset]

In [4]:
import chardet

# Read a few bytes of the file
with open("E:/Downloads/SMS Spam.csv", 'rb') as file:
    raw_data = file.read(10000)

# Detect encoding
result = chardet.detect(raw_data)
file_encoding = result['encoding']

print(f"Detected file encoding: {file_encoding}")


Detected file encoding: Windows-1252


In [5]:
import pandas as pd

df =pd.read_csv("E:/Downloads/SMS Spam.csv",encoding = file_encoding)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.shape

(5572, 5)

In [8]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [9]:
df1 = df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis = 1)
df1.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df1['spam'] = df1.v1.apply(lambda x: 1 if x == "spam" else 0)
df1.head()

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
df1[df1.spam == 0].head(10)

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
10,ham,I'm gonna be home soon and i don't want to tal...,0
13,ham,I've been searching for the right words to tha...,0
14,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,0
16,ham,Oh k...i'm watching here:),0


In [12]:
import gensim
from gensim.models import Word2Vec

In [13]:
df1['message'] = df1.v2.apply(gensim.utils.simple_preprocess)

In [14]:
df1.head()

Unnamed: 0,v1,v2,spam,message
0,ham,"Go until jurong point, crazy.. Available only ...",0,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,0,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,0,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,"[nah, don, think, he, goes, to, usf, he, lives..."


In [16]:
df1.message[14]

['have', 'date', 'on', 'sunday', 'with', 'will']

In [17]:
df2 = df1.drop(["v1","v2"],axis =1)
df2.head()

Unnamed: 0,spam,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, oni]"
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,0,"[dun, say, so, early, hor, already, then, say]"
4,0,"[nah, don, think, he, goes, to, usf, he, lives..."


In [20]:
model = Word2Vec(
    window = 4,
    min_count= 3,
    workers = 4
)

In [21]:
model.build_vocab(df2.message)

In [22]:
model.train(df2.message, total_examples=model.corpus_count, epochs = 10)

(574090, 782800)

In [23]:
if "dun" in model.wv:
    print("hi")

hi


In [24]:
import numpy as np

def get_review_vector(message, model):
    vectors = [model.wv[word] for word in message if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [27]:
df2['vector_message'] = df2.message.apply(lambda x: get_review_vector(x,model))

In [28]:
df2.head()

Unnamed: 0,spam,message,vector_message
0,0,"[go, until, jurong, point, crazy, available, o...","[-0.06339701, 0.12121549, -0.12129851, 0.05115..."
1,0,"[ok, lar, joking, wif, oni]","[-0.056828756, 0.09122117, -0.071159996, 0.058..."
2,1,"[free, entry, in, wkly, comp, to, win, fa, cup...","[-0.21736501, 0.11041727, -0.11408372, -0.0838..."
3,0,"[dun, say, so, early, hor, already, then, say]","[-0.09818489, 0.15369236, -0.073994696, 0.1809..."
4,0,"[nah, don, think, he, goes, to, usf, he, lives...","[-0.09662848, 0.14509816, -0.06806231, 0.15636..."


In [45]:
df2.vector_message[0]

array([-6.33970127e-02,  1.21215492e-01, -1.21298507e-01,  5.11584617e-02,
        6.54142797e-02, -4.03773278e-01,  1.35574833e-01,  5.61147690e-01,
       -2.39614591e-01, -6.44417256e-02, -1.03792891e-01, -4.03651267e-01,
       -5.49292378e-02,  1.59431994e-01,  1.24830246e-01, -1.01524003e-01,
        1.45602271e-01, -2.65554398e-01, -2.78180372e-02, -5.85418165e-01,
        2.15287209e-01,  4.98317555e-02,  1.45230278e-01, -1.61346301e-01,
       -1.50103360e-01,  4.72275615e-02, -3.45349520e-01, -7.80605227e-02,
       -2.32036114e-01,  9.54860374e-02,  4.29678142e-01,  3.62078361e-02,
        4.92895655e-02, -3.32836539e-01, -7.85523504e-02,  3.43754411e-01,
        1.78527266e-01, -1.12821028e-01, -1.68095827e-01, -5.46133578e-01,
       -8.63664970e-03, -2.98398018e-01, -1.29696161e-01,  1.60578772e-01,
        2.31843188e-01,  4.40040982e-04, -2.11475924e-01, -2.93242000e-02,
        2.26912841e-01,  1.70476958e-01,  5.91622666e-02, -2.14972422e-01,
       -8.88851061e-02,  

In [46]:
df2.spam.value_counts()

spam
0    4825
1     747
Name: count, dtype: int64

In [47]:
from imblearn.over_sampling import SMOTE
sampler = SMOTE(random_state = 42)

In [51]:
x = np.stack(df2.vector_message.values)
y = df2.spam

In [52]:
x.shape,y.shape

((5572, 100), (5572,))

In [54]:
x_samp,y_samp = sampler.fit_resample(x,y)

In [55]:
x_samp[0]

array([-6.33970127e-02,  1.21215492e-01, -1.21298507e-01,  5.11584617e-02,
        6.54142797e-02, -4.03773278e-01,  1.35574833e-01,  5.61147690e-01,
       -2.39614591e-01, -6.44417256e-02, -1.03792891e-01, -4.03651267e-01,
       -5.49292378e-02,  1.59431994e-01,  1.24830246e-01, -1.01524003e-01,
        1.45602271e-01, -2.65554398e-01, -2.78180372e-02, -5.85418165e-01,
        2.15287209e-01,  4.98317555e-02,  1.45230278e-01, -1.61346301e-01,
       -1.50103360e-01,  4.72275615e-02, -3.45349520e-01, -7.80605227e-02,
       -2.32036114e-01,  9.54860374e-02,  4.29678142e-01,  3.62078361e-02,
        4.92895655e-02, -3.32836539e-01, -7.85523504e-02,  3.43754411e-01,
        1.78527266e-01, -1.12821028e-01, -1.68095827e-01, -5.46133578e-01,
       -8.63664970e-03, -2.98398018e-01, -1.29696161e-01,  1.60578772e-01,
        2.31843188e-01,  4.40040982e-04, -2.11475924e-01, -2.93242000e-02,
        2.26912841e-01,  1.70476958e-01,  5.91622666e-02, -2.14972422e-01,
       -8.88851061e-02,  

In [56]:
x_samp.shape,y_samp.shape

((9650, 100), (9650,))

In [57]:
y[0]

0

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [59]:
x_train,x_test,y_train,y_test = train_test_split(x_samp,y_samp,test_size = 0.2,random_state = 42,stratify = y_samp)

In [60]:
x_train.shape,y_train.shape

((7720, 100), (7720,))

# Using Decision Tree:

In [61]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)

In [62]:
from sklearn.metrics import classification_report

In [63]:
dt_pred = dt.predict(x_test)
print(classification_report(y_test,dt_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97       965
           1       0.96      0.97      0.97       965

    accuracy                           0.97      1930
   macro avg       0.97      0.97      0.97      1930
weighted avg       0.97      0.97      0.97      1930



# Using KNN:

In [72]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)

In [73]:
knn_pred = knn.predict(x_test)
print(classification_report(y_test,knn_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97       965
           1       0.95      1.00      0.98       965

    accuracy                           0.97      1930
   macro avg       0.98      0.97      0.97      1930
weighted avg       0.98      0.97      0.97      1930



# Using Random Forest:

In [74]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
rf = RandomForestClassifier(n_estimators=30)
rf.fit(x_train,y_train)

In [81]:
rf_pred = rf.predict(x_test)
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.99      0.99      0.99       965

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



# Using Logistic Regression:

In [78]:
from sklearn.linear_model import LogisticRegression

In [79]:
lg = LogisticRegression()
lg.fit(x_train,y_train)

In [80]:
lg_pred = lg.predict(x_test)
print(classification_report(y_test,lg_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       965
           1       0.94      0.92      0.93       965

    accuracy                           0.93      1930
   macro avg       0.93      0.93      0.93      1930
weighted avg       0.93      0.93      0.93      1930



# Clearly we can notice that, Random Forest Classifier out-performs the other models
---

# Predicting Output...

In [86]:
x[2]

array([-0.21736501,  0.11041727, -0.11408372, -0.08388928,  0.18595135,
       -0.58185953,  0.06382256,  0.66244829, -0.43932655, -0.11457279,
       -0.01341448, -0.37863654, -0.08937325,  0.08016786, -0.03664821,
       -0.2719681 ,  0.18873158, -0.49271825,  0.13345884, -0.67629057,
        0.11404082,  0.12075476, -0.13296176, -0.30322263,  0.04605274,
        0.20617606, -0.3466152 ,  0.15435708, -0.19058426, -0.02486001,
        0.27073762,  0.17129189,  0.07762408, -0.3004671 , -0.0434324 ,
        0.41938648,  0.01128063, -0.60650486, -0.01356026, -0.49773774,
       -0.04118476, -0.39504951, -0.40899503, -0.20313631,  0.2881591 ,
       -0.06267825, -0.38332352, -0.14642926, -0.06550975,  0.20823826,
        0.06503708, -0.16680822, -0.06198554,  0.34699544, -0.28452256,
        0.34974027, -0.05015042, -0.12618119, -0.04510152,  0.43246955,
        0.21938038, -0.05496861,  0.15055811, -0.05332213, -0.40355229,
        0.35722271,  0.17106892,  0.28771612, -0.33010057,  0.49

In [89]:
if rf.predict(x[[2]])==1:
    print("Spam")
else:
    print('Not Spam')

Spam


In [102]:
print(df.v2[2],"-"*10,">",df.v1[2])

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's ---------- > spam
