Tutorial: https://www.analyticsvidhya.com/blog/2021/05/sms-spam-detection-using-lstm-a-hands-on-guide/

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import warnings
warnings.simplefilter(action='ignore', category=Warning)
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [40]:
#data_path = "merged_cleaned_preprocessed.csv"      #Google Colab (upload csv to "Dateien")
#data_path = "merged_cleaned.csv"                   #Google Colab (upload csv to "Dateien")
#data_path = "Data/merged_cleaned_preprocessed.csv" #pycharm
data_path = "Data/merged_cleaned.csv"              #pycharm

df = pd.read_csv(data_path, encoding = "latin-1")
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
df.shape

(35193, 2)

#Drop Null values:
Somehow it did not work to drop NULL values in preprocessing... Out of any reason there are some null falues in the data after reading the CSV even though they have not been there before writing the CSV file in the preprocessing skript!

In [42]:
df.isnull().sum()

label    0
text     0
dtype: int64

In [43]:
df.dropna(inplace=True)
df.isnull().sum()

label    0
text     0
dtype: int64

#Convert tokens into a numerical sequence:

In [44]:
X=df["text"]

tokenizer = Tokenizer() #initializing the tokenizer
tokenizer.fit_on_texts(X)# fitting on the sms data
text_to_sequence = tokenizer.texts_to_sequences(X) # creating the numerical sequence

In [45]:
tokenizer.index_word

{1: 'the',
 2: 'to',
 3: 'and',
 4: 'of',
 5: 'a',
 6: 'in',
 7: 'for',
 8: 'you',
 9: "'",
 10: 'is',
 11: 'this',
 12: 'i',
 13: 'on',
 14: 'enron',
 15: 'that',
 16: 's',
 17: 'be',
 18: 'with',
 19: 'your',
 20: 'we',
 21: 'as',
 22: 'have',
 23: 'from',
 24: 'ect',
 25: 'it',
 26: 'will',
 27: 'are',
 28: 'or',
 29: 'at',
 30: 'by',
 31: 'not',
 32: 'our',
 33: 'if',
 34: 'com',
 35: '1',
 36: 'all',
 37: 'company',
 38: 'please',
 39: '2',
 40: 'an',
 41: 'has',
 42: 'hou',
 43: 'can',
 44: 'me',
 45: 'any',
 46: 'was',
 47: '3',
 48: '2001',
 49: 'would',
 50: 'e',
 51: 'no',
 52: '2000',
 53: 'am',
 54: 'subject',
 55: 'my',
 56: 'new',
 57: '10',
 58: 'more',
 59: 'its',
 60: 'but',
 61: 're',
 62: 'may',
 63: '00',
 64: '5',
 65: 'do',
 66: 't',
 67: 'information',
 68: 'time',
 69: 'up',
 70: 'which',
 71: 'gas',
 72: 'about',
 73: 'been',
 74: 'one',
 75: 'they',
 76: 'out',
 77: 'energy',
 78: 'get',
 79: '4',
 80: 'us',
 81: 'business',
 82: '0',
 83: 'said',
 84: 'http',

In [46]:
len(tokenizer.index_word)

159663

We do have 135711 unique words!

In [47]:
for i in range(5):
        print("Text               : ",X[i] )
        print("Numerical Sequence : ", text_to_sequence[i])

Text               :  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Numerical Sequence :  [175, 410, 80617, 459, 4140, 233, 104, 6, 32196, 342, 341, 314, 709, 50, 13349, 20649, 94, 563, 80618, 5663]
Text               :  Ok lar... Joking wif u oni...
Numerical Sequence :  [1074, 8559, 21780, 11705, 162, 24818]
Text               :  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
Numerical Sequence :  [161, 2184, 6, 39, 5, 26862, 5268, 2, 2007, 6429, 9091, 366, 62121, 62122, 62, 408, 980, 6429, 2, 62123, 2, 333, 2184, 951, 24819, 1051, 387, 66, 18758, 1776, 80619]
Text               :  U dun say so early hor... U c already then say...
Numerical Sequence :  [162, 7103, 538, 97, 783, 40386, 162, 151, 448, 241, 538]
Text               :  Nah I don't think he goes to usf, he lives around here though
Numerical Sequence :

#Normalization:
Since all the messages are of different lengths also the sequences are of different length, so we have to padding the sequences with 0 to get equal length. Otherwise the sequences would not be compatible for the model!

In [48]:
max_length_sequence = max([len(i) for i in text_to_sequence]) # finding the length of largest sequence
padded_sequence = pad_sequences(text_to_sequence, maxlen=max_length_sequence, padding = "pre")
padded_sequence

array([[     0,      0,      0, ...,    563,  80618,   5663],
       [     0,      0,      0, ...,  11705,    162,  24818],
       [     0,      0,      0, ...,  18758,   1776,  80619],
       ...,
       [     0,      0,      0, ...,     80, 159660, 159661],
       [     0,      0,      0, ...,    102,  10684,    883],
       [     0,      0,      0, ...,     85, 159663,    226]], dtype=int32)

In [49]:
print("shape",len(padded_sequence), "rows x ",len(padded_sequence[0]),"cols")

shape 35193 rows x  38067 cols


#Create LSTM Model:

In [50]:
TOT_SIZE = len(tokenizer.word_index)+1


def create_model():
      lstm_model = Sequential()
      lstm_model.add(Embedding(TOT_SIZE, 32, input_length=max_length_sequence))
      lstm_model.add(LSTM(100))
      lstm_model.add(Dropout(0.4))
      lstm_model.add(Dense(20, activation="relu"))
      lstm_model.add(Dropout(0.3))
      lstm_model.add(Dense(1, activation = "sigmoid"))
      return lstm_model


lstm_model = create_model()
lstm_model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 38067, 32)         5109248   
                                                                 
 lstm_1 (LSTM)               (None, 100)               53200     
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 20)                2020      
                                                                 
 dropout_3 (Dropout)         (None, 20)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 5,164,489
Trainable params: 5,164,489
No

#Train Test Split

In [51]:
from sklearn.model_selection import train_test_split
X=padded_sequence
y=df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [52]:
X_train

array([[    0,     0,     0, ...,   853, 63697,   853],
       [    0,     0,     0, ...,  2152,  1230,   853],
       [    0,     0,     0, ...,  3752,   442,   378],
       ...,
       [    0,     0,     0, ...,  1187,    66,   117],
       [    0,     0,     0, ..., 81029,  9369,  1173],
       [    0,     0,     0, ..., 69698, 69699,   916]], dtype=int32)

In [53]:
y_train

8114     0
12923    0
3978     0
19283    0
26255    1
        ..
16850    0
6265     0
11284    0
860      0
15795    1
Name: label, Length: 26394, dtype: int64

In [None]:
from keras.callbacks import EarlyStopping

lstm_model.fit(X_train ,
               y_train,
               epochs=1,
               callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

 50/825 [>.............................] - ETA: 10:31:49 - loss: 0.6671 - accuracy: 0.6006

In [None]:
scores = lstm_model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy: %.2f%%" % (scores[1]*100))

#RESULT:

Executed with google Colab:

Using the preprocessed file:    Test Accuracy: 97.40%

Using the unpreprocessed file:  Google Colabs 12GB of RAM are not enough - my machine uses 14GB but takes forever so I am running it with only 1 epoch...