In [None]:
# import libraries
try:
  # `!pip` is an IPython shell escape (Colab/Jupyter); this installs the TensorFlow nightly build.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

print(tf.__version__)

In [None]:
# get data files
# Download the two TSV files into the current working directory.
# These are IPython shell escapes, so they need a notebook kernel and `wget`.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
valid_file_path = "valid-data.tsv"

# Column names are supplied explicitly: 'type' holds the message label
# (later mapped from 'ham'/'spam') and 'text' holds the message body.
train_data = pd.read_table(train_file_path, names=['type', 'text'])
# NOTE: the validation file is stored under the name `test_data`.
test_data = pd.read_table(valid_file_path, names=['type', 'text'])

# Preview the first rows of the training frame.
train_data.head()

In [None]:
#let's create a function where we can transform arrays of characters to an array with number
# NOTE: despite its name this helper performs integer (index) encoding, not
# one-hot encoding: each distinct lower-cased word gets its own positive id.
vocab = {}         # word -> integer id; shared and extended by every call
word_encoding = 1  # next id to hand out; starts at 1 so 0 is never given to a word

def one_hot_encoding(array):
  """Encode each message of a pandas Series as a list of integer word ids.

  Args:
    array: pandas Series of message strings. Missing entries (None/NaN) are
      encoded as an empty list; any other non-string value is converted
      with str() first.

  Returns:
    A new object-dtype Series with the same index and name as `array`
    whose elements are lists of ids. The input Series is not modified.

  Side effects:
    Words that were not seen before are added to the module-level `vocab`
    and `word_encoding` is advanced by one for each of them.
  """
  global word_encoding
  encoded_rows = []
  for text in array:
    # `text != text` is only true for NaN, which pandas uses for missing text.
    is_missing = text is None or (isinstance(text, float) and text != text)
    # split() without an argument splits on any run of whitespace, so
    # repeated spaces do not produce empty-string "words" in the vocabulary.
    words = [] if is_missing else str(text).lower().split()
    encoding = []
    for word in words:
      if word not in vocab:
        vocab[word] = word_encoding
        word_encoding += 1
      encoding.append(vocab[word])
    encoded_rows.append(encoding)
  # Build a fresh Series instead of overwriting the caller's data in place.
  return pd.Series(encoded_rows, index=array.index, name=array.name, dtype=object)

In [None]:
#let's create the train data and valid data
# NOTE: this cell rebinds `train_data` / `test_data` from DataFrames to numpy
# arrays, so it cannot be re-run without re-running the data loading cell.
LABEL_TO_ID = {'ham': 0, 'spam': 1}

def _encode_labels(labels):
  """Map a Series of 'ham'/'spam' strings to a numpy int array of 0/1.

  An explicit mapping is used instead of Series.replace, which relies on
  implicit downcasting that newer pandas versions deprecate.

  Raises:
    ValueError: if a label other than 'ham' or 'spam' (including a missing
      value) is present, instead of letting it through silently.
  """
  encoded = labels.map(LABEL_TO_ID)
  unknown = labels[encoded.isna()]
  if not unknown.empty:
    raise ValueError(f"Unexpected label value(s): {sorted(set(unknown.astype(str)))}")
  return encoded.to_numpy(dtype=int)

# pop() removes the label column from the frame and returns it.
train_label = _encode_labels(train_data.pop('type'))
# Object array with one list of word ids per message.
train_data = np.array(one_hot_encoding(train_data['text']))

test_label = _encode_labels(test_data.pop('type'))
test_data = np.array(one_hot_encoding(test_data['text']))

In [None]:
# Display the encoded training messages together with their 0/1 labels.
(train_data, train_label)

In [None]:
#padding
# Bring every encoded message to exactly MAXLEN ids so that all messages
# have the same length when they are fed to the model.
MAXLEN = 40
train_data = keras.utils.pad_sequences(train_data, maxlen=MAXLEN)
test_data = keras.utils.pad_sequences(test_data, maxlen=MAXLEN)

In [None]:
#the vocabulary_size should be big enough so the training, valid and predictions data can use the model
# `word_encoding` is the next id one_hot_encoding() would hand out, so every
# id assigned so far (training AND validation words, plus 0 used for padding)
# is strictly smaller than it. Deriving the size from the training array alone
# would ignore the higher ids given to words that only occur in the
# validation data.
OOV_HEADROOM = 4000  # spare ids for words first seen at prediction time
vocabulary_size = int(word_encoding) + OOV_HEADROOM
vocabulary_size

In [None]:
#model
# An Embedding layer turns each word id into a 64-dimensional vector and an
# LSTM reads the resulting sequence; two Dense layers then produce a single
# sigmoid output in [0, 1].
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=64))
model.add(tf.keras.layers.LSTM(units=64))
model.add(tf.keras.layers.Dense(units=64, activation="sigmoid"))
model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
#training
# Fit on the padded training messages; `history` keeps the per-epoch metrics.
history = model.fit(
    x=train_data,
    y=train_label,
    epochs=6,
    steps_per_epoch=100,
)

In [None]:
# Measure loss and accuracy on the held-out data loaded from valid-data.tsv.
results = model.evaluate(x=test_data, y=test_label)

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
#the function transform the text into numeric data, then applies the padding and make the prediction
def predict_message(text, threshold=0.5):
  """Classify a single message as 'ham' or 'spam'.

  Args:
    text: the message to classify (converted with str() if not a string).
    threshold: scores greater than or equal to this value are labelled
      'spam'. Defaults to 0.5.

  Returns:
    A two-element list [score, label] where score is the model output as a
    Python float and label is 'spam' or 'ham'.
  """
  # Words are only looked up in `vocab`; the vocabulary is deliberately NOT
  # extended here. Going through one_hot_encoding() would register every
  # unseen word under a new id that the model never trained on, and enough
  # predictions could push ids past the size of the Embedding layer.
  # Unknown words are mapped to 0, the id that is never assigned to a word.
  ids = [vocab.get(word, 0) for word in str(text).lower().split()]
  encoded_text = keras.utils.pad_sequences([ids], maxlen=MAXLEN)
  # predict() returns one row per input message with a single score in it;
  # float() turns the numpy scalar into a plain Python float.
  score = float(model.predict(encoded_text)[0][0])
  label = 'spam' if score >= threshold else 'ham'
  return [score, label]

pred_text = "sale today! to stop texts call 98912460324"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
#Cell to know if the project passes the test
# Calls predict_message() on seven fixed messages and compares the returned
# label (element 1 of each result) with the expected 'ham'/'spam' answer.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single wrong label fails the whole challenge; the loop does not stop
  # early, so every message is still sent through predict_message().
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
