In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
cd /content/drive/MyDrive/SMS

/content/drive/MyDrive/SMS


In [6]:
!unzip smsdata.zip

Archive:  smsdata.zip
  inflating: spam.csv                


In [9]:
import pandas as pd
import numpy as np
import tensorflow
import seaborn as sns
import matplotlib.pyplot as plt
import keras
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [10]:
# Load the raw SMS dataset. ISO-8859-1 maps every byte to a character, so
# decoding cannot fail; presumably chosen because the file is not valid
# UTF-8 -- TODO confirm.
data=pd.read_csv(
    '/content/drive/MyDrive/SMS/spam.csv',
    sep=',',
    encoding="ISO-8859-1",
)

In [11]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [12]:
data.shape

(5572, 5)

In [15]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [16]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [17]:
# Remove the three overflow columns; only v1 (label) and v2 (text) are kept.
data=data.drop(labels=["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis="columns")

In [18]:
# Give the two remaining columns descriptive names.
data=data.rename(columns={"v1":"Category","v2":"Message"})

In [19]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Character count of every message, stored as an extra column.
data["Message Length"]=data["Message"].map(len)

In [21]:
# Summary statistics of message length, computed separately per class.
ham_desc=data.loc[data["Category"]=="ham","Message Length"].describe()
spam_desc=data.loc[data["Category"]=="spam","Message Length"].describe()

In [22]:
data.describe(include="all")

Unnamed: 0,Category,Message,Message Length
count,5572,5572,5572.0
unique,2,5169,
top,ham,"Sorry, I'll call later",
freq,4825,30,
mean,,,80.118808
std,,,59.690841
min,,,2.0
25%,,,36.0
50%,,,61.0
75%,,,121.0


In [24]:
# Balance the classes by random undersampling: keep every spam row and draw an
# equally sized sample of ham rows without replacement.
spam_mask=data["Category"]=="spam"
ham_mask=data["Category"]=="ham"
minority_indices=data[spam_mask].index
majority_indices=data[ham_mask].index
minority_len=len(minority_indices)
majority_len=len(majority_indices)
# Seeded generator so that Restart & Run All picks the same ham rows each time;
# the unseeded np.random.choice produced a different training set on every run.
rng=np.random.default_rng(42)
random_majority_indices=rng.choice(majority_indices,size=minority_len,replace=False)
undersampled_indices=np.concatenate([minority_indices,random_majority_indices])

In [25]:
# Select the balanced rows, shuffle them, and renumber the rows from 0.
# random_state pins the shuffle order so downstream splits are reproducible;
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an "index" column and then dropping that column.
df=(
    data.loc[undersampled_indices]
    .sample(frac=1,random_state=42)
    .reset_index(drop=True)
)

In [26]:
df["Category"].value_counts()

ham     747
spam    747
Name: Category, dtype: int64

In [27]:
# Numeric target column: ham -> 0, spam -> 1.
df=df.assign(Label=df["Category"].map({"ham":0,"spam":1}))

In [28]:
df.head()

Unnamed: 0,Category,Message,Message Length,Label
0,ham,I (Career Tel) have added u as a contact on IN...,124,0
1,spam,You have WON a guaranteed å£1000 cash or a å£2...,147,1
2,spam,it to 80488. Your 500 free text messages are v...,74,1
3,ham,Yeah if we do have to get a random dude we nee...,124,0
4,ham,Aight do you still want to get money,36,0


In [30]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [31]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [32]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()

Model Creation

In [33]:
# Build the cleaned training corpus: one normalised string per message.
# The stop-word set is created once up front; previously it was rebuilt for
# every single token inside the comprehension, which is loop-invariant work.
english_stopwords=set(stopwords.words("english"))
corpus=[]
for message in df["Message"]:
  # Keep letters only, lowercase, then split on whitespace.
  tokens=re.sub("[^a-zA-Z]"," ",message).lower().split()
  # Stop words are filtered on the raw token, and the survivors are stemmed.
  stems=[stemmer.stem(token) for token in tokens if token not in english_stopwords]
  corpus.append(" ".join(stems))

In [34]:
from tensorflow.keras.preprocessing.text import one_hot
# Number of hash buckets each token is mapped into.
vocab_size=10000
# Encode every cleaned message as a list of integer token ids.
# NOTE(review): per the Keras docs one_hot wraps hashing_trick with Python's
# built-in hash, which is not stable across interpreter runs -- ids produced in
# a new session may not match the ones the saved model was trained on. Verify
# before reusing the saved model outside this kernel.
oneHot_doc=[one_hot(document,n=vocab_size) for document in corpus]

In [35]:
df["Message Length"].describe()

count    1494.000000
mean      105.331995
std        57.291018
min         3.000000
25%        51.000000
50%       119.000000
75%       153.000000
max       461.000000
Name: Message Length, dtype: float64

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed sequence length fed to the network.
sentence_len=200
# Left-pad every encoded message so all rows have sentence_len entries.
embedded_doc=pad_sequences(
    oneHot_doc,
    padding="pre",
    maxlen=sentence_len,
)

In [37]:
# Padded token ids as a frame, with the label column appended on the right.
extract_features=pd.DataFrame(embedded_doc)
target=df["Label"]
df_final=pd.concat([extract_features,target],axis="columns")

In [38]:
# Separate the feature matrix from the target vector.
y=df_final["Label"]
X=df_final.drop(columns=["Label"])

In [39]:
from sklearn.model_selection import train_test_split
# First hold out 15% of all rows as the test set ...
X_trainval,X_test,y_trainval,y_test=train_test_split(
    X,y,test_size=0.15,random_state=42
)
# ... then hold out 15% of the remainder as the validation set.
X_train,X_val,y_train,y_val=train_test_split(
    X_trainval,y_trainval,test_size=0.15,random_state=42
)

In [41]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

In [42]:
model=Sequential()

In [43]:
# Width of the learned embedding vector for each token id.
feature_num=100
# Build the whole network in one expression so that re-running this cell
# replaces the model; calling model.add() on the existing Sequential appended
# a second Embedding/LSTM/Dense stack every time the cell was executed again.
model=Sequential([
    Embedding(input_dim=vocab_size,output_dim=feature_num,input_length=sentence_len),
    LSTM(units=128),
    # Single sigmoid unit: output is the predicted probability of label 1.
    Dense(units=1,activation="sigmoid"),
])

In [44]:
from tensorflow.keras.optimizers import Adam
# Binary classification set-up: cross-entropy loss, accuracy as the metric.
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=Adam(learning_rate=0.001),
)

In [45]:
# Train for 10 epochs, reporting metrics on the validation split as well.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_val,y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8813466410>

In [46]:
# Threshold the predicted probabilities at 0.5 to get boolean class labels.
y_pred=model.predict(X_test)>0.5



In [47]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [48]:
# Fraction of test rows classified correctly, printed as a percentage.
score=accuracy_score(y_true=y_test,y_pred=y_pred)
print("Test Score:{:.2f}%".format(score*100))

Test Score:96.00%


In [57]:
model.save('spam_classifier.h5')

In [49]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [50]:
def classify_message(model,message):
    """Classify one SMS text as spam or not spam and print the verdict.

    The text is normalised with the same steps used to build the training
    corpus (letters only, lowercased, English stop words removed, Porter
    stemming), hashed with ``one_hot`` into ``vocab_size`` buckets,
    pre-padded to ``sentence_len`` and passed to ``model.predict``.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            input row.
        message: Raw message text as a single string.

    Returns:
        None. Prints "It is a spam" when the predicted probability is above
        0.5, otherwise "It is not a spam".
    """
    # The earlier implementation looped over the *characters* of `message`,
    # compared whole sentences against the stop-word set, and encoded only the
    # last sentence with no stop-word removal or stemming, so the model was
    # fed text unlike its training data (and an empty message raised
    # NameError). The steps below mirror the training preprocessing instead.
    english_stopwords=set(stopwords.words('english'))
    tokens=re.sub("[^a-zA-Z]"," ",message).lower().split()
    cleaned=" ".join(
        stemmer.stem(token) for token in tokens if token not in english_stopwords
    )

    # Same encoder and padding as used for the training corpus, so token ids
    # and sequence layout line up with what the model was trained on.
    oneHot=[one_hot(cleaned,n=vocab_size)]
    text=pad_sequences(oneHot,maxlen=sentence_len,padding="pre")
    predict=model.predict(text)

    # One input row was passed, so read its single probability explicitly.
    probability=float(predict[0][0])
    if probability>0.5:
        print("It is a spam")
    else:
        print("It is not a spam")

In [55]:
message1="Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
message2="Thanks for your subscription to Ringtone UK your mobile will be charged å£5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"

In [53]:
classify_message(model,message1)

It is not a spam


In [56]:
classify_message(model,message2)

It is a spam
