In [126]:
import numpy as np
import pandas as pd
from keras import Sequential
from keras.src.callbacks import ModelCheckpoint
from keras.src.layers import Embedding, LSTM, GlobalMaxPooling1D, Dense
from keras.src.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import plotly.express as px

In [95]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
df.columns = ['labels', "data"]
df

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [96]:
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
Y = df['b_labels'].values

In [97]:
train_df, test_df, train_labels, test_labels = train_test_split(df['data'], Y, test_size=0.33)

In [98]:
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_df)
train_sequences = tokenizer.texts_to_sequences(train_df)
test_sequences = tokenizer.texts_to_sequences(test_df)
test_sequences

[[31, 169, 117, 12, 3, 161, 792, 22, 589],
 [209, 268, 123, 19, 209, 3106, 15, 19, 135, 1, 665, 591, 15],
 [24,
  31,
  22,
  4,
  3368,
  7,
  1,
  2220,
  118,
  129,
  199,
  14,
  2121,
  500,
  1398,
  31,
  219,
  22,
  4,
  3368],
 [94, 405, 30, 3, 415, 160, 10, 27, 5, 22, 6347, 15, 135, 33, 149, 22, 5, 144],
 [10, 1407, 1402, 3, 3285, 494, 11, 1565],
 [3, 16, 103, 6401, 203, 6507, 887, 412, 377],
 [35, 1, 95, 71, 2, 416, 175],
 [132, 5188, 6, 51, 413, 160, 43, 371, 27],
 [5692,
  107,
  28,
  10,
  4,
  5693,
  107,
  1147,
  11,
  409,
  1281,
  2,
  1853,
  107,
  872,
  818,
  15,
  82,
  9,
  112,
  409,
  5694],
 [1, 114, 13, 4791, 6381, 1133],
 [430, 3, 355, 3, 18, 23, 440],
 [6368, 1583, 2904, 971, 58, 630, 29, 49],
 [5766,
  440,
  1117,
  5767,
  62,
  714,
  14,
  1635,
  73,
  714,
  2,
  5768,
  95,
  186,
  53,
  57,
  214,
  313,
  4,
  3226,
  3227,
  43,
  134,
  616,
  133,
  3228,
  223,
  195,
  5769,
  871,
  313],
 [244, 2, 6],
 [27, 85, 181],
 [16, 3, 2422

In [99]:
word2idx = tokenizer.word_index
V = len(word2idx)
print(f"Found {V} unique words")

Found 7249 unique words


In [100]:
train_data = pad_sequences(train_sequences)
print(f"Shape of train data: {train_data.shape}")

T = train_data.shape[1]

Shape of train data: (3733, 121)


In [101]:
test_data = pad_sequences(test_sequences, maxlen=T)
print(f"Shape of test data: {test_data.shape}")

Shape of test data: (1839, 121)


In [103]:
D = 20

model = Sequential()
model.add(Embedding(V + 1, D, input_shape=(T,)))
model.add(LSTM(units=15, return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

mc = ModelCheckpoint(filepath="best_model.keras", monitor="val_loss", mode="min", save_best_only=True, verbose=1)

r = model.fit(train_data, train_labels, epochs=20, validation_data=(test_data, test_labels), callbacks=[mc])

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.35192, saving model to best_model.keras
Epoch 2/20
Epoch 2: val_loss improved from 0.35192 to 0.27615, saving model to best_model.keras
Epoch 3/20
Epoch 3: val_loss improved from 0.27615 to 0.20461, saving model to best_model.keras
Epoch 4/20
Epoch 4: val_loss improved from 0.20461 to 0.14602, saving model to best_model.keras
Epoch 5/20
Epoch 5: val_loss improved from 0.14602 to 0.10719, saving model to best_model.keras
Epoch 6/20
Epoch 6: val_loss improved from 0.10719 to 0.09020, saving model to best_model.keras
Epoch 7/20
Epoch 7: val_loss improved from 0.09020 to 0.07883, saving model to best_model.keras
Epoch 8/20
Epoch 8: val_loss improved from 0.07883 to 0.07861, saving model to best_model.keras
Epoch 9/20
Epoch 9: val_loss improved from 0.07861 to 0.07554, saving model to best_model.keras
Epoch 10/20
Epoch 10: val_loss improved from 0.07554 to 0.06201, saving model to best_model.keras
Epoch 11/20
Epoch 11: val_loss did not imp

In [104]:
def draw_history(history):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['accuracy'], name='accuracy'))
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['val_accuracy'], name='val_accuracy'))
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['loss'], name='loss'))
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['val_loss'], name='val_loss'))
    fig.show()


draw_history(r)

In [133]:
# model.load_weights("best_model.keras")
data = np.concatenate([train_data, test_data])
labels = np.concatenate([train_labels, test_labels])

preds = model.predict(data)

res = pd.DataFrame({
    "true": labels,
    "pred": preds.round().ravel()
})

print(f"Accuracy: {accuracy_score(res.true, res.pred)}")

cm = confusion_matrix(res.true, res.pred)
px.imshow(cm, text_auto=True, y=("t_true", "t_spam"), x=("p_true", "p_spam"))

Accuracy: 0.994256999282125
