In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, Dense, Dropout #type: ignore
from sklearn.preprocessing import LabelEncoder
# import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

2024-12-14 10:31:01.482935: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734172261.498236   10083 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734172261.502995   10083 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-14 10:31:01.519070: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# get data files
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [3]:
headers = ['type', 'message']
train_ds = pd.read_csv(train_file_path, sep = '\t', names = headers)
test_ds = pd.read_csv(test_file_path, sep = '\t', names = headers)

In [4]:
words = ' '.join(train_ds['message']).split()
unique_words = len(set(words))
message_lenghts = train_ds['message'].apply(lambda x: len(x.split()))
max_length = int(message_lenghts.max())

vectorizer = TextVectorization(max_tokens = unique_words, output_mode = 'int', output_sequence_length = max_length)
vectorizer.adapt(train_ds['message'])

X_train = vectorizer(train_ds['message'])
X_test = vectorizer(test_ds['message'])

I0000 00:00:1734172263.532772   10083 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8218 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:2d:00.0, compute capability: 8.9


In [5]:
encoder = LabelEncoder()

y_train = encoder.fit_transform(train_ds['type'])
y_test = encoder.transform(test_ds['type'])

In [12]:
model = tf.keras.Sequential([Embedding(max_length, 64),
                             Bidirectional(tf.keras.layers.LSTM(64)),
                             Dense(128, activation = 'relu'),
                             Dense(16, activation = 'relu'),
                             Dropout(0.3),
                             Dense(1, activation = 'sigmoid')])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', 'precision', 'auc'])
model.fit(X_train, y_train, epochs = 30, validation_data=(X_test, y_test), batch_size = 16)

Epoch 1/30
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.8875 - auc: 0.7750 - loss: 0.3245 - precision: 0.5792 - val_accuracy: 0.9468 - val_auc: 0.9733 - val_loss: 0.2360 - val_precision: 0.7325
Epoch 2/30
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9697 - auc: 0.9774 - loss: 0.1070 - precision: 0.9068 - val_accuracy: 0.9648 - val_auc: 0.9779 - val_loss: 0.1054 - val_precision: 0.9259
Epoch 3/30
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.9772 - auc: 0.9795 - loss: 0.0801 - precision: 0.9272 - val_accuracy: 0.9698 - val_auc: 0.9786 - val_loss: 0.1031 - val_precision: 0.8877
Epoch 4/30
[1m262/262[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9826 - auc: 0.9857 - loss: 0.0702 - precision: 0.9608 - val_accuracy: 0.9727 - val_auc: 0.9881 - val_loss: 0.0868 - val_precision: 0.8942
Epoch 5/30
[1m262/262[0m [32m━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fdbd015ffe0>

In [None]:
message = vectorizer(["how are you doing today?"])
pred = model.predict(message)

print(pred[0][0])

In [13]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  vectorized_text = vectorizer([pred_text])
  pred = model.predict(vectorized_text)
  if pred[0][0] > 0.5:
    prediction = 'spam'
  else:
    prediction = 'ham'

  return (['', prediction])

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
['', 'ham']


In [14]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(f'pred: {prediction[1]} --- ans: {ans}')
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
pred: ham --- ans: ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
pred: spam --- ans: spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
pred: ham --- ans: ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
pred: spam --- ans: spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
pred: spam --- ans: spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
pred: ham --- ans: ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
pred: ham --- ans: ham
You passed the challenge. Great job!
