In [2]:
# import libraries
# try:
#   # %tensorflow_version only exists in Colab.
#   !pip install tf-nightly
# except Exception:
#   pass

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from gensim.utils import tokenize

# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

# Notebook tooling: rich display helper plus automatic Black formatting of cells.
from IPython.display import display
import black
import jupyter_black

# Format cells for a Python 3.10 target.
# NOTE(review): lab=False presumably selects the classic Notebook front end
# rather than JupyterLab -- confirm against the jupyter_black documentation.
jupyter_black.load(lab=False, target_version=black.TargetVersion.PY310)

2.9.1


In [3]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [4]:
# Load both TSV files into pandas data frames.
# Work-around: the separator is the regular expression 'am\t' (which is why
# engine='python' is required), so each line is split where "am" is followed
# by a tab. The label column therefore holds only the text before that "am"
# (for example 'h').
read_options = dict(sep='am\t', engine='python', names=('label', 'text'))
train = pd.read_csv(train_file_path, **read_options)
test = pd.read_csv(test_file_path, **read_options)

# For each frame show the first rows, its shape and the null count per column.
for title, frame in (('Train:', train), ('Test:', test)):
    print(title)
    display(frame.head())
    print(frame.shape)
    display(frame.isnull().sum())

Train:


Unnamed: 0,label,text
0,h,ahhhh...just woken up!had a bad dream about u ...
1,h,you can never do nothing
2,h,"now u sound like manky scouse boy steve,like! ..."
3,h,mum say we wan to go then go... then she can s...
4,h,never y lei... i v lazy... got wat? dat day ü ...


(4179, 2)


label    0
text     0
dtype: int64

Test:


Unnamed: 0,label,text
0,h,i am in hospital da. . i will return home in e...
1,h,"not much, just some textin'. how bout you?"
2,h,i probably won't eat at all today. i think i'm...
3,h,don‘t give a flying monkeys wot they think and...
4,h,who are you seeing?


(1392, 2)


label    0
text     0
dtype: int64

In [5]:
# Encode the label strings as integers. Because the files are read with the
# 'am\t' separator, the raw labels are 'h' (ham) and 'sp' (spam):
# 'h' -> 0, 'sp' -> 1.
replace_rules = dict(h=0, sp=1)
for frame in (train, test):
    frame['label'] = frame['label'].replace(replace_rules)

# Inspect the result: first rows, column dtypes and the class distribution.
display(train.head(), train.dtypes, train['label'].value_counts())

Unnamed: 0,label,text
0,0,ahhhh...just woken up!had a bad dream about u ...
1,0,you can never do nothing
2,0,"now u sound like manky scouse boy steve,like! ..."
3,0,mum say we wan to go then go... then she can s...
4,0,never y lei... i v lazy... got wat? dat day ü ...


label     int64
text     object
dtype: object

0    3619
1     560
Name: label, dtype: int64

In [6]:
# Look at how many tokens the training messages contain.
def num_tokens(text):
    """Return the number of tokens ``tokenize`` yields for the lower-cased ``text``."""
    return sum(1 for _token in tokenize(text, lower=True))

# Per-message token counts for the train data.
tokenized = train['text'].map(num_tokens)

# Summary statistics of the token counts.
display(tokenized.describe())

count    4179.000000
mean       15.740847
std        11.513394
min         0.000000
25%         7.000000
50%        13.000000
75%        23.000000
max       190.000000
Name: text, dtype: float64

In [7]:
# The two label classes are strongly imbalanced, so duplicate randomly chosen
# minority-class rows with imblearn's RandomOverSampler until the
# minority/majority ratio given by sampling_strategy (0.5) is reached.
# NOTE(review): this runs before the train/validation split in the next cell,
# so copies of the same message can land in both sets -- confirm that the
# resulting validation scores are still meaningful.
oversampler = RandomOverSampler(random_state=10, sampling_strategy=0.5)
train, _ = oversampler.fit_resample(train, train['label'])

# New shape and class distribution after oversampling.
display(train.shape, train['label'].value_counts())

(5428, 2)

0    3619
1    1809
Name: label, dtype: int64

In [8]:
# Hold out 9 % of the train rows as a small validation set
# (shuffled split with a fixed seed).
train, validation = train_test_split(
    train,
    test_size=0.09,
    random_state=10,
    shuffle=True,
)

# Remember the number of rows per data set; used later to derive step counts.
train_len, test_len, val_len = (len(split) for split in (train, test, validation))

In [45]:
# Transform data into tf.data.Dataset format
# Copied from TF website and adapted to personal needs:
def df_to_dataset(dataframe, shuffle=True, batch_size=128, repeat=True):
    """Build a batched ``tf.data.Dataset`` of ``(text, label)`` pairs from a data frame.

    Args:
        dataframe: frame with a ``'text'`` and a ``'label'`` column; it is not modified.
        shuffle: if true, shuffle the rows with a buffer as large as the frame.
        batch_size: number of rows per batch.
        repeat: if true, repeat the batched dataset indefinitely.

    Returns:
        The resulting dataset, with prefetching (``tf.data.AUTOTUNE``) enabled.
    """
    texts, labels = dataframe['text'], dataframe['label']
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(dataframe))
    dataset = dataset.batch(batch_size)
    if repeat:
        dataset = dataset.repeat()
    return dataset.prefetch(tf.data.AUTOTUNE)

batch_size = 128
# Row order is kept for all three datasets: train was already shuffled by
# train_test_split, and the evaluation data does not need to be shuffled.
train_data = df_to_dataset(train, shuffle=False, batch_size=batch_size)
test_data = df_to_dataset(test, shuffle=False, batch_size=batch_size)
val_data = df_to_dataset(validation, shuffle=False, batch_size=batch_size)

# The datasets repeat forever, so every consumer needs an explicit step count.
# One pass over n rows is ceil(n / batch_size) batches (the last one may be
# partial). `n // batch_size + 1` over-counts by one whenever n is an exact
# multiple of batch_size, which would pull in a batch from the next repetition;
# -(-n // batch_size) is integer ceiling division and is exact in every case.
train_max_steps = -(-train_len // batch_size)
test_max_steps = -(-test_len // batch_size)
val_max_steps = -(-val_len // batch_size)

In [46]:
# Set up the text vectorizer: at most `vocab_size` vocabulary entries, and
# every message is turned into exactly `message_length` token ids.
vocab_size = 5000
message_length = 35
vectorizer = keras.layers.TextVectorization(
    output_sequence_length=message_length,
    max_tokens=vocab_size,
)

# Learn the vocabulary from the training texts only (the labels are dropped).
# The dataset repeats indefinitely, hence the explicit number of steps.
train_texts = train_data.map(lambda text, _label: text)
vectorizer.adapt(train_texts, steps=train_max_steps)
print(len(vectorizer.get_vocabulary()))

5000


In [47]:
# Build the model: raw string -> token ids -> embeddings -> LSTM -> dense
# layer -> single sigmoid output.
model_layers = [
    # One string per sample.
    keras.Input(dtype=tf.string, shape=(1,)),
    # Adapted TextVectorization layer; emits `message_length` token ids.
    vectorizer,
    # mask_zero=True makes the embedding treat id 0 as padding to be masked.
    keras.layers.Embedding(
        vocab_size,
        32,
        mask_zero=True,
        input_length=message_length,
    ),
    keras.layers.LSTM(32),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
]
model = keras.Sequential(model_layers)

model.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Evaluate the untrained model on the train data; returns [loss, accuracy].
model.evaluate(train_data, steps=train_max_steps)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 35)               0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, 35, 32)            160000    
                                                                 
 lstm_5 (LSTM)               (None, 32)                8320      
                                                                 
 dense_12 (Dense)            (None, 32)                1056      
                                                                 
 dense_13 (Dense)            (None, 1)                 33        
                                                                 
Total params: 169,409
Trainable params: 169,409
Non-trainable params: 0
________________________________________________

[0.6929824352264404, 0.517918586730957]

In [48]:
# TRAIN THE MODEL
# Early stopping: stop once val_accuracy has not improved for 6 consecutive
# epochs and restore the weights of the best epoch seen.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=6,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

# Fit
# Each epoch consumes train_max_steps // 2 batches of the repeating train
# dataset, i.e. about half a pass over the train rows; validation runs
# val_max_steps batches after every epoch.
# `use_multiprocessing=True` was removed: Keras only honours it for generator /
# keras.utils.Sequence inputs, so it had no effect on a tf.data.Dataset, and
# Keras 3 no longer accepts the argument in fit().
history = model.fit(
    x=train_data,
    epochs=20,
    steps_per_epoch=train_max_steps // 2,
    validation_data=val_data,
    validation_steps=val_max_steps,
    callbacks=[early_stopping_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
# Smoke test: run the trained model on a single message.
test_text = 'I just want to eat pizza.'
# The model input has shape (batch, 1), so wrap the string in a 1x1 array.
test_text_input_array = np.array([[test_text]])
model(test_text_input_array).numpy()

array([[0.0003281]], dtype=float32)

In [50]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
    """Classify one SMS text as ham or spam with the trained ``model``.

    Args:
        pred_text: the message text (a single string).

    Returns:
        A two-element list ``[probability, label]``: ``probability`` is the
        model's sigmoid output as a Python float, and ``label`` is ``"ham"``
        when that value is below 0.5 and ``"spam"`` otherwise.
    """
    # The model takes a (batch, 1) array of strings; one message gives (1, 1).
    batch = np.array(pred_text).reshape((-1, 1))
    probability = model(batch).numpy()[0, 0].item()
    if probability < 0.5:
        label_name = "ham"
    else:
        label_name = "spam"
    return [probability, label_name]


# Try the function on an everyday message.
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0.0024778672959655523, 'ham']


In [51]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
    """Check ``predict_message`` against seven fixed messages and print the verdict.

    Each message is classified with ``predict_message`` and the returned label
    (element 1 of the result) is compared with the expected answer. A success
    message is printed only if all seven labels match; otherwise a
    "keep trying" message is printed. Nothing is returned.
    """
    test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

    # Expected labels, in the same order as test_messages.
    test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
    passed = True

    for msg, ans in zip(test_messages, test_answers):
        prediction = predict_message(msg)
        # Only the label is checked, not the probability.
        if prediction[1] != ans:
            passed = False

    if passed:
        print("You passed the challenge. Great job!")
    else:
        print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
