In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build through pip (IPython shell escape).
  !pip install tf-nightly
except Exception:
  # Any Python exception raised above is ignored on purpose — presumably so the
  # notebook can carry on with a TensorFlow that is already installed.
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)


In [None]:
from keras.preprocessing import sequence
import math
from keras.preprocessing.text import Tokenizer

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
#Load the data
# Both files are tab-separated with no header row: first column is the label,
# second column is the message text.
column_names = ['status', 'text']
# The raw frames must be named *_raw: every later cell reads train_data_raw /
# test_data_raw, while train_data / test_data are reserved for the padded arrays.
train_data_raw = pd.read_csv(train_file_path, sep='\t', names=column_names)
test_data_raw = pd.read_csv(test_file_path, sep='\t', names=column_names)

In [None]:
# Display the last rows of the raw training frame as a quick sanity check.
train_data_raw.tail()

In [None]:
# Length of every training message, followed by a one-line summary of the
# distribution (mean / median / max).
lengths = list(map(len, train_data_raw['text']))
print(
    'mean = ', np.mean(lengths),
    ', median = ', np.median(lengths),
    ', max = ', np.max(lengths),
)

In [None]:
# Maximum sequence length: twice the median of `lengths`, rounded down.
# NOTE(review): `lengths` is computed with len() on each message, while MAXLEN
# is later used to pad token sequences — confirm the unit mismatch is intended.
MAXLEN = math.floor(2*np.median(lengths))
# NOTE(review): this is the number of entries in `lengths` (one per training
# message), not a count of distinct tokens — confirm it is large enough for
# every index the tokenizer can emit before using it as an embedding size.
VOCAB_SIZE = len(lengths)

In [None]:
#Tokenize the data using keras' tokenizer-function
# The tokenizer is fitted on the training texts only; the same fitted instance
# then converts both splits to lists of integer token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data_raw['text'])
train_sequences = tokenizer.texts_to_sequences(train_data_raw['text'])
test_sequences = tokenizer.texts_to_sequences(test_data_raw['text'])

# Size the vocabulary from the fitted tokenizer rather than from the number of
# messages: word ids run from 1 to len(word_index) (0 is never assigned to a
# word and is what padding uses), so the embedding table needs
# len(word_index) + 1 rows to cover every id that can appear in a sequence.
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [None]:
# Bring every token sequence to the common length MAXLEN so the splits can be
# fed to the model as rectangular arrays.
train_data = sequence.pad_sequences(train_sequences, maxlen=MAXLEN)
test_data = sequence.pad_sequences(test_sequences, maxlen=MAXLEN)

In [None]:
# Encode the label column of each split as a numpy array: ham -> 0, spam -> 1
mapping = dict(ham=0, spam=1)
train_status, test_status = (
    frame['status'].map(mapping).to_numpy()
    for frame in (train_data_raw, test_data_raw)
)
print(train_status)

In [None]:
# Binary text classifier: token embedding -> LSTM -> small ReLU layer ->
# single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=32))
model.add(tf.keras.layers.LSTM(units=32))
model.add(tf.keras.layers.Dense(units=16, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Compile with binary cross-entropy + Adam, then fit for 10 epochs while
# holding out 20% of the training data for validation.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['acc'])
history = model.fit(
    x=train_data,
    y=train_status,
    epochs=10,
    validation_split=0.2,
)

In [None]:
# Score the trained model on the padded test split and its 0/1 labels.
loss, accuracy = model.evaluate(x=test_data, y=test_status)

In [None]:
def preprocessor(text):
  """Turn raw message text into padded token-id sequences for the model.

  Args:
    text: A list (or other iterable) of message strings, or a single message
      string, which is treated as a one-element batch.

  Returns:
    The result of `sequence.pad_sequences` on the tokenized messages, padded
    or truncated to `MAXLEN` entries per message.
  """
  # texts_to_sequences iterates over its argument, so a bare string would be
  # tokenized one character at a time; wrap it into a single-message batch.
  if isinstance(text, str):
    text = [text]
  text_tok = tokenizer.texts_to_sequences(text)
  return sequence.pad_sequences(text_tok, MAXLEN)

In [None]:
# Smoke-test the preprocessor on a single-message batch.
print(preprocessor(['Hello World']))

In [None]:
# function to predict messages based on model
#spam means spam, ham means not spam
def predict_message(pred_text):
  """Classify one or more SMS messages as 'spam' or 'ham'.

  Args:
    pred_text: A list of message strings, or a single message string
      (treated as a one-element list).

  Returns:
    A list with one label per message: 'spam' when the model output is
    >= 0.5, otherwise 'ham'. An empty input yields an empty list.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(pred_text, str):
    pred_text = [pred_text]
  # Nothing to score: skip preprocessing and the model call entirely.
  if len(pred_text) == 0:
    return []

  pred_text_proc = preprocessor(pred_text)
  prediction_num = model.predict(pred_text_proc)
  # The model ends in a single sigmoid unit, so predictions come back with one
  # value per message wrapped in its own row; flatten them so every threshold
  # check is a plain scalar comparison instead of a truth test on an array.
  scores = np.asarray(prediction_num).ravel()
  prediction = ['spam' if value >= 0.5 else 'ham' for value in scores]

  return prediction

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven fixed messages and print the verdict.

  Passes the whole list of messages to predict_message in one call and
  compares the returned list with the expected 'ham'/'spam' labels; prints a
  success message only when the two lists are equal.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True
  # A single mismatch anywhere in the list fails the whole check.
  if predict_message(test_messages) != test_answers:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
