In [2]:
# import libraries
import pandas as pd
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense
from tensorflow.keras.callbacks import EarlyStopping



In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-03-09 20:51:21--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.2.33, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-03-09 20:51:23 (747 KB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-03-09 20:51:23--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 172.67.70.149, 104.26.2.33, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|172.67.70.149|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-03-09 20:51:23 (20.0 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [4]:
# Column labels passed to pd.read_csv for both TSV files.
names = ["class", "message"]

In [5]:
# Load the tab-separated training split; the column labels come from `names`,
# so no row of the file is treated as a header.
train_file = pd.read_csv(
    train_file_path,
    delimiter="\t",
    header=None,
    names=names,
)
train_file

Unnamed: 0,class,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...
...,...,...
4174,ham,just woke up. yeesh its late. but i didn't fal...
4175,ham,what do u reckon as need 2 arrange transport i...
4176,spam,free entry into our £250 weekly competition ju...
4177,spam,-pls stop bootydelious (32/f) is inviting you ...


In [6]:
# Load the tab-separated validation split with the same column labels as the
# training split; no row of the file is treated as a header.
test_file = pd.read_csv(
    test_file_path,
    delimiter="\t",
    header=None,
    names=names,
)
test_file

Unnamed: 0,class,message
0,ham,i am in hospital da. . i will return home in e...
1,ham,"not much, just some textin'. how bout you?"
2,ham,i probably won't eat at all today. i think i'm...
3,ham,don‘t give a flying monkeys wot they think and...
4,ham,who are you seeing?
...,...,...
1387,ham,true dear..i sat to pray evening and felt so.s...
1388,ham,"what will we do in the shower, baby?"
1389,ham,where are you ? what are you doing ? are yuou ...
1390,spam,ur cash-balance is currently 500 pounds - to m...


In [7]:
# Split each frame into a list of raw message strings and a binary target
# array: "ham" maps to 0, every other class value maps to 1.
train_message = train_file["message"].tolist()
train_label = np.array([int(label != "ham") for label in train_file["class"]])
test_message = test_file["message"].tolist()
test_label = np.array([int(label != "ham") for label in test_file["class"]])

In [8]:
# Count occurrences of every whitespace-delimited token in the training messages.
vocabulary_dict = {}
for message in train_message:
  for vocabulary in message.split():
    # Missing tokens start from 0, so first sight stores 1.
    vocabulary_dict[vocabulary] = vocabulary_dict.get(vocabulary, 0) + 1

In [9]:
# Number of distinct whitespace tokens seen in the training messages.
VOCAB_SIZE = len(vocabulary_dict)
# Token count of the longest training message (whitespace split).
MAX_LENGTH = max(len(text.split()) for text in train_message)

In [10]:
# Hash each message into a sequence of integer ids (hash space VOCAB_SIZE),
# then pad at the end so every sequence has MAX_LENGTH entries.
encoded_train_message = [one_hot(text, VOCAB_SIZE) for text in train_message]
padded_train_message = pad_sequences(
    encoded_train_message,
    maxlen=MAX_LENGTH,
    padding='post',
)

encoded_test_message = [one_hot(text, VOCAB_SIZE) for text in test_message]
padded_test_message = pad_sequences(
    encoded_test_message,
    maxlen=MAX_LENGTH,
    padding='post',
)

In [11]:
# Classifier: embed every token id into 100 dimensions, flatten the sequence,
# and map it to a single sigmoid output (label 1 = spam, 0 = ham).
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument
# and takes the sequence length from the arrays given to fit().
embedding_layer = Embedding(VOCAB_SIZE, 100)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# Stop when val_acc has not improved by at least 1e-4 for 25 epochs, then
# restore the weights of the best epoch. `epochs=1000` is only an upper bound.
monitor = EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=25, verbose=1, mode='max', restore_best_weights=True)
# NOTE(review): the valid-data split drives early stopping here, so the
# reported val_acc is not an untouched hold-out score.
model.fit(padded_train_message, train_label, validation_data=(padded_test_message, test_label), callbacks=[monitor], epochs=1000, verbose=2)

Epoch 1/1000




131/131 - 4s - 29ms/step - acc: 0.9134 - loss: 0.2306 - val_acc: 0.9734 - val_loss: 0.0954
Epoch 2/1000
131/131 - 3s - 20ms/step - acc: 0.9856 - loss: 0.0565 - val_acc: 0.9835 - val_loss: 0.0554
Epoch 3/1000
131/131 - 1s - 5ms/step - acc: 0.9916 - loss: 0.0277 - val_acc: 0.9835 - val_loss: 0.0446
Epoch 4/1000
131/131 - 1s - 5ms/step - acc: 0.9966 - loss: 0.0146 - val_acc: 0.9871 - val_loss: 0.0383
Epoch 5/1000
131/131 - 0s - 4ms/step - acc: 0.9988 - loss: 0.0080 - val_acc: 0.9871 - val_loss: 0.0402
Epoch 6/1000
131/131 - 1s - 5ms/step - acc: 0.9995 - loss: 0.0051 - val_acc: 0.9885 - val_loss: 0.0350
Epoch 7/1000
131/131 - 1s - 5ms/step - acc: 0.9998 - loss: 0.0037 - val_acc: 0.9885 - val_loss: 0.0350
Epoch 8/1000
131/131 - 0s - 4ms/step - acc: 0.9998 - loss: 0.0025 - val_acc: 0.9864 - val_loss: 0.0389
Epoch 9/1000
131/131 - 0s - 3ms/step - acc: 0.9998 - loss: 0.0023 - val_acc: 0.9871 - val_loss: 0.0394
Epoch 10/1000
131/131 - 0s - 3ms/step - acc: 0.9998 - loss: 0.0019 - val_acc: 0.9871

<keras.src.callbacks.history.History at 0x7b5d610abfd0>

In [12]:
# Write the fitted model to spam_detector.keras in the working directory.
model.save('spam_detector.keras')

In [15]:
# function to predict messages based on model
def predict_message(pred_text):
  """Classify a single SMS text with the trained model.

  The text is encoded the same way as the training data: hashed with
  `one_hot` into VOCAB_SIZE buckets and padded at the end to MAX_LENGTH.

  Args:
    pred_text: the message string to classify.

  Returns:
    A two-element list `[score, label]`, where `score` is the model's sigmoid
    output for the message and `label` is "spam" when the score rounds to 1
    and "ham" when it rounds to 0.
  """
  class_dict = {
      0 : "ham",
      1 : "spam",
      }
  encoded_message = [one_hot(pred_text, VOCAB_SIZE)]
  padded_message = pad_sequences(encoded_message, maxlen=MAX_LENGTH, padding='post')
  # Run inference once and reuse the score for both list entries, instead of
  # calling model.predict twice on the same input.
  spam_score = model.predict(padded_message)[0][0]
  # np.round returns a float; convert it to a plain int so the dict lookup
  # does not depend on a float key matching an int key.
  label = class_dict[int(np.round(spam_score))]
  return [spam_score, label]

# Manual check: run one hand-written prize-offer message through
# predict_message and print the returned [score, label] list.
pred_text = "Congratulations! 🎉 You've been selected as the lucky winner of a brand-new iPhone 15 Pro! Click the link below to claim your prize now: fake-link.com 📱⚡ Hurry, offer expires in 24 hours!"
prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 179ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[0.99973553, 'spam']


In [None]:
# Colab-only cell: `google.colab` is imported here rather than in the top
# imports cell, so the rest of the notebook does not depend on it.
# Presumably this hands the saved model file to the browser as a download.
from google.colab import files
files.download('spam_detector.keras')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>