In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2025-08-01 22:48:43--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2025-08-01 22:48:43 (19.3 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2025-08-01 22:48:43--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2025-08-01 22:48:43 (7.15 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



In [None]:
# First attempt at loading: tab-separated, pandas' default header handling.
# NOTE: with the defaults the first record of each file is consumed as the
# column names (see the preview of train_df that follows).
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (train_file_path, test_file_path)
)

In [None]:
# Preview the first rows to check how the file was parsed
train_df.head()

Unnamed: 0,ham,"ahhhh...just woken up!had a bad dream about u tho,so i dont like u right now :) i didnt know anything about comedy night but i guess im up for it."
0,ham,you can never do nothing
1,ham,"now u sound like manky scouse boy steve,like! ..."
2,ham,mum say we wan to go then go... then she can s...
3,ham,never y lei... i v lazy... got wat? dat day ü ...
4,ham,in xam hall boy asked girl tell me the startin...


In [None]:
# Load both files again, this time telling pandas there is no header row and
# naming the two tab-separated columns explicitly
train_df, test_df = (
    pd.read_csv(tsv_path, sep="\t", header=None, names=['label', 'message'])
    for tsv_path in (train_file_path, test_file_path)
)

In [None]:
# Check that the columns are now `label` / `message` and that the first message is kept as data
train_df.head()

Unnamed: 0,label,message
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [None]:
# Pull the message texts (model inputs) and their string labels out of each frame
train_sentences, train_labels_str = (
    train_df['message'].tolist(),
    train_df['label'].tolist(),
)
test_sentences, test_labels_str = (
    test_df['message'].tolist(),
    test_df['label'].tolist(),
)

In [None]:
# Encode the string labels as integers: 'spam' -> 1, everything else ('ham') -> 0
train_labels = np.array([int(label == 'spam') for label in train_labels_str])
test_labels = np.array([int(label == 'spam') for label in test_labels_str])

In [None]:
# Preprocess the data using tokenization and padding
vocab_size = 10000       # passed to the tokenizer as num_words and to the Embedding layer as its input size
oov_token = "<OOV>"      # placeholder token for words not in the fitted vocabulary
padding_type = 'post'    # pad at the end of each sequence
trunc_type = 'post'      # truncate at the end of each sequence

# Fit the vocabulary on the training messages only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)

# Derive the padded length from the tokenizer's own output rather than from a
# whitespace split: the tokenizer also breaks words on punctuation
# (e.g. "up!had" becomes two tokens), so a whitespace word count can be smaller
# than the real sequence length and would silently truncate long messages.
max_len = max(len(seq) for seq in train_sequences)

train_padded = pad_sequences(train_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_len)

# The validation messages reuse the training vocabulary and the same padded length
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type, truncating=trunc_type, maxlen=max_len)

In [None]:
# Build the model
embedding_dim = 64
dropout_rate = 0.3

model = Sequential([
    # Declare the input shape with an explicit Input layer instead of the
    # Embedding layer's `input_length` argument, which Keras 3 deprecates
    # (it only triggers a warning there and is otherwise ignored).
    tf.keras.Input(shape=(max_len,)),
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(64)),
    Dropout(dropout_rate),
    Dense(32, activation='relu'),
    Dropout(dropout_rate),
    # Single sigmoid unit: output is read as the probability of "spam" (label 1)
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model
# The second data file is supplied as validation data, so each epoch also
# reports val_loss / val_accuracy next to the training metrics.
history = model.fit(
    train_padded,
    train_labels,
    validation_data=(test_padded, test_labels),
    epochs=5,
    batch_size=32,
)


Epoch 1/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 203ms/step - accuracy: 0.8881 - loss: 0.3483 - val_accuracy: 0.9828 - val_loss: 0.0635
Epoch 2/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 206ms/step - accuracy: 0.9909 - loss: 0.0418 - val_accuracy: 0.9856 - val_loss: 0.0580
Epoch 3/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 224ms/step - accuracy: 0.9975 - loss: 0.0118 - val_accuracy: 0.9820 - val_loss: 0.0827
Epoch 4/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 188ms/step - accuracy: 0.9963 - loss: 0.0091 - val_accuracy: 0.9849 - val_loss: 0.0669
Epoch 5/5
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 191ms/step - accuracy: 0.9987 - loss: 0.0066 - val_accuracy: 0.9871 - val_loss: 0.0799


In [None]:
# Define predict_message function
def predict_message(message):
    """Classify a single message as "ham" or "spam".

    The text is converted to a sequence with the notebook's fitted
    ``tokenizer``, padded/truncated to ``max_len`` with the same settings used
    for the training data, and scored by ``model``.

    Args:
        message: The message text to classify.

    Returns:
        A two-element list ``[probability, label]``: ``probability`` is the
        model output as a Python float, and ``label`` is ``"spam"`` when that
        value is greater than 0.5, otherwise ``"ham"``.
    """
    sequence = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(sequence, maxlen=max_len, padding=padding_type, truncating=trunc_type)
    # verbose=0 stops Keras from printing a progress bar for every single call;
    # [0][0] picks the one score out of the (1, 1) batch result.
    prediction = float(model.predict(padded, verbose=0)[0][0])
    label = "spam" if prediction > 0.5 else "ham"
    return [prediction, label]

In [None]:
# Test the model using predict_message function
# Hand-written sample texts to spot-check the classifier.
messages = [
    "how are you doing today",
    "sale today! to stop texts call 98912460324",
    "i dont want to go. can we try it a different day? available sat",
    "our new mobile video service is live. just install on your phone to start watching.",
    "you have won £1000 cash! call to claim your prize.",
    "i'll bring it tomorrow. don't forget"
]

# Print each message followed by the [probability, label] list returned for it
for msg in messages:
    pred = predict_message(msg)
    print(f"Message: {msg}\nPrediction: {pred}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 409ms/step
Message: how are you doing today
Prediction: [5.398636858444661e-05, 'ham']

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Message: sale today! to stop texts call 98912460324
Prediction: [0.970049262046814, 'spam']

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Message: i dont want to go. can we try it a different day? available sat
Prediction: [1.2719850019493606e-05, 'ham']

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Message: our new mobile video service is live. just install on your phone to start watching.
Prediction: [0.9997934699058533, 'spam']

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Message: you have won £1000 cash! call to claim your prize.
Prediction: [0.999940037727356, 'spam']

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Message: i'll bring it tomorrow. don't forg