**Import required libraries**

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [7]:
import string

In [8]:
from tensorflow.keras.preprocessing import sequence


In [10]:
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam, RMSprop
from keras.layers import Input, Embedding, LSTM, Dense, Flatten, Dropout


In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

3. Read dataset and do pre-processing

--2022-11-01 04:12:14--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip.1’


2022-11-01 04:12:15 (780 KB/s) - ‘smsspamcollection.zip.1’ saved [203415/203415]

Archive:  /content/smsspamcollection.zip
replace SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: SMSSpamCollection       
replace readme? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [16]:
df = pd.read_csv("/content/SMSSpamCollection", sep='\t', header=None, names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
df.shape

(5572, 2)

**Read duplicate and null data**

In [19]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [20]:
df.duplicated().sum()

403

In [21]:
df = df.drop_duplicates(keep='first')
df.duplicated().sum()

0

**Normalizing the case, Removing the unwanted punctuations, Remove Stopwords**

In [22]:
ps = PorterStemmer()

**Counting Words**

In [27]:
avg_words_len=round(sum([len(i.split()) for i in df['message']])/len(df['message']))
print(avg_words_len)

15


In [28]:
s = set()
for sent in df['message']:
  for word in sent.split():
    s.add(word)
total_words_length=len(s)
print(total_words_length)

15691


**4. Create Model**

In [26]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
x = df.message
y = df.label
le = LabelEncoder()
y = le.fit_transform(y)
y = y.reshape(-1,1)

In [31]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.18, random_state=10)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4238,), (4238, 1), (931,), (931, 1))

**5. Add Layers**

In [32]:
model = Sequential()

In [33]:
tokenizer = Tokenizer(num_words = total_words_length, lower = True)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
x_train = sequence.pad_sequences(sequences, maxlen = avg_words_len)

In [34]:
model.add(Embedding(total_words_length, 50, input_length = avg_words_len))

LSTM Layer

In [35]:
model.add(LSTM(64))

Hidden Layer

In [36]:
model.add(Dense(64, activation = "relu"))

In [37]:
model.add(Flatten())

In [38]:
model.add(Dropout(0.2))

In [39]:

model.add(Dense(32, activation = "relu"))

Output Layer

In [40]:
model.add(Dense(1, activation = 'sigmoid'))

Model Summary

In [42]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 50)            784550    
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 3

**6. Compile the Model**

In [43]:
adam = Adam(learning_rate = 0.001, beta_1 = 0.85, beta_2 = 0.97, epsilon = 1e-07)
model.compile(loss = "binary_crossentropy", optimizer = adam, metrics = ["accuracy"])

**7. Fit the Model**

In [44]:
epochs=5
history = model.fit(x_train, y_train, epochs = epochs, validation_steps=0.18, batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


**8. Save the Model**

In [45]:
model.save("spam_analysis.h5")

**9. Test the Model**

In [46]:
test_sequences = tokenizer.texts_to_sequences(x_test)
x_test = sequence.pad_sequences(test_sequences, maxlen=avg_words_len)


In [47]:
accuracy = model.evaluate(x_test, y_test)



In [48]:
def predict(message):
    txt = tokenizer.texts_to_sequences(message)
    txt = sequence.pad_sequences(txt, maxlen=avg_words_len)
    pred = model.predict(txt)
    if pred>0.5:
        print("spam")
    else:
        print("Harm")

In [49]:
review1 = ["think he goes"]
predict(review1)

Harm


In [50]:
review2 = ["Go until jurong point"]
predict(review2)

Harm


In [51]:
review3 = ["WINNER!! As a valued network"]
predict(review3)

spam


In [52]:
review4 = ["URGENT! You have won a 1 week FREE membership"]
predict(review4)

spam
