In [None]:
# Text without uppercase characters, SMS Spam dataset.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
import numpy as np
import io
import os
import matplotlib.pyplot as plt

In [None]:
# Sentinel handed to tf.data calls below (parallel map, prefetch) so that
# those values are tuned at runtime instead of being hard-coded.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Download the dataset.
# (Only a part of the training data of the original dataset is taken into consideration.)
Data_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Fetch the archive as "sms.zip" and extract it.
file_path = tf.keras.utils.get_file(fname="sms.zip", origin=Data_URL, extract=True)

# The data file is looked up in the same directory as the downloaded archive.
text_file = os.path.join(os.path.dirname(file_path), 'SMSSpamCollection')

In [None]:
# Insert spaces around punctuation and replace uppercase characters with lowercase ones.
def lowercase(text):
  """Separate punctuation from words and lowercase a string tensor.

  Args:
    text: string tensor holding the message text.

  Returns:
    A string tensor in which each of the characters . , ! ? ( ) ; is
    surrounded by spaces, runs of two to four dots are glued back together
    ("..", "...", "...."), and all characters are lowercase.
  """
  # Surround every punctuation mark with spaces so it becomes its own token.
  text=tf.strings.regex_replace(text,"([.,!?();])"," \\1 ")
  # Collapse the double spaces created between adjacent punctuation marks.
  text=tf.strings.regex_replace(text,"  "," ")
  # Re-join ellipses that the first step split apart, longest pattern first.
  # Raw strings keep the regex escapes valid Python (a plain "\." is an
  # invalid escape sequence and triggers a warning on recent interpreters).
  text=tf.strings.regex_replace(text,r"\. \. \. \.","....")
  text=tf.strings.regex_replace(text,r"\. \. \.","...")
  text=tf.strings.regex_replace(text,r"\. \.","..")
  # Replace uppercase characters.
  text=tf.strings.lower(text)
  return text

# Each line of the dataset starts with "ham" (if the message is not spam) or with "spam" (if it is spam).
# The label associated with spam is 1, while the one associated with ham is 0.
def process_line_lowercase(line_text):
  """Split one raw dataset line into (normalized text, label).

  Args:
    line_text: scalar string tensor containing one line of the data file.

  Returns:
    A tuple (text, label): the line without its leading tag (the first 4
    characters for lines matching 'ham.*', the first 5 otherwise), passed
    through `lowercase`, and an int32 label that is 0 for ham lines and 1
    for all other lines.
  """
  is_ham = tf.strings.regex_full_match(line_text, 'ham.*')
  # Number of leading characters to drop: 4 for ham lines, 5 for the rest.
  prefix_len = tf.where(is_ham, 4, 5)
  message = tf.strings.substr(line_text,
                              prefix_len,
                              tf.strings.length(line_text) - prefix_len)
  label = tf.where(is_ham, 0, 1)
  return lowercase(message), label

In [None]:
# Create the dataset, made of (text, associated label) pairs.
# Each element of the raw dataset is one line of the data file.
lines_dataset = tf.data.TextLineDataset(filenames=text_file)

# Dataset with the text in lowercase, paired with its label.
labeled_dataset = lines_dataset.map(map_func=process_line_lowercase,
                                    num_parallel_calls=AUTOTUNE)

In [None]:
# Create the vocabulary and check its size.
tokenizer = tfds.features.text.Tokenizer()

# Gather every distinct token produced by tokenizing each message once.
vocabulary = {
    tok
    for message, _label in labeled_dataset
    for tok in tokenizer.tokenize(message.numpy())
}

VOCABULARY_SIZE = len(vocabulary)

In [None]:
# Define the encoder that maps each vocabulary token to an integer id.
encoder = tfds.features.text.TokenTextEncoder(vocab_list=vocabulary)

In [None]:
#Encode.
def encode(text, label):
  """Encode one message with the module-level `encoder`.

  Args:
    text: eager string tensor; its `.numpy()` value is what gets encoded.
    label: label of the message, passed through unchanged.

  Returns:
    A tuple (encoded text, label).
  """
  return encoder.encode(text.numpy()), label

def encode_map_fn(text, label):
  """Dataset-map wrapper that runs the Python `encode` function on one example.

  Args:
    text: string tensor with the message text.
    label: label tensor of the message.

  Returns:
    A tuple (encoded_text, label) where encoded_text is an int64 tensor of
    rank 1 with unknown length and label is an int32 scalar.
  """
  # tf.py_function leaves the static shapes of its outputs unset.
  token_ids, label_out = tf.py_function(func=encode,
                                        inp=[text, label],
                                        Tout=(tf.int64, tf.int32))

  # tf.data works best when every component has a known static shape,
  # so declare them here: a variable-length vector and a scalar.
  token_ids.set_shape([None])
  label_out.set_shape([])

  return token_ids, label_out

# Dataset of (token ids, label) pairs obtained by encoding every message.
encoded_dataset = labeled_dataset.map(map_func=encode_map_fn)

In [None]:
# Count the examples with one full pass over the dataset.
n = sum(1 for _ in encoded_dataset.as_numpy_iterator())

# Define the number of elements in the training and in the test set.
DATASET_SIZE = n  # the dataset description reports 5574 messages
BATCH_SIZE = 25
BATCH_NUMBER = int(round(DATASET_SIZE / BATCH_SIZE))
TEST_NUMBER = int(BATCH_NUMBER / 4)
TRAIN_SIZE = (BATCH_NUMBER - TEST_NUMBER) * BATCH_SIZE

# Fixed seed so the train/test partition is the same on every iteration and every run.
SHUFFLE_SEED = 42

# Shuffle the messages once, then divide them into the training and test sets.
# reshuffle_each_iteration=False is essential: with the default (True) the
# dataset is reshuffled every time it is iterated, so take()/skip() would
# select different examples on each epoch and test messages would leak into
# the training set.
dataset_training_test = encoded_dataset.shuffle(
    DATASET_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
ds_train = dataset_training_test.take(TRAIN_SIZE)
ds_test = dataset_training_test.skip(TRAIN_SIZE)

# Reshuffle only the training examples at each epoch (the split itself stays fixed).
ds_train = ds_train.shuffle(TRAIN_SIZE)

# The sequences must be batched and padded to the length of the longest sequence in the batch.
ds_train = ds_train.padded_batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
ds_test = ds_test.padded_batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
EMBEDDING_DIM=32

# Define the model.
# The embedding table must have one row for every id the encoder can emit.
# encoder.vocab_size counts the vocabulary tokens plus the padding id (0) and
# the out-of-vocabulary bucket, whereas VOCABULARY_SIZE+1 leaves the
# out-of-vocabulary id outside the table.
model = keras.Sequential([
    keras.layers.Embedding(encoder.vocab_size, EMBEDDING_DIM),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(16)),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(0.25),
    # Single output unit without activation: the model produces a logit.
    keras.layers.Dense(1),
])

# Compile the model.
# The model outputs logits, so the decision boundary is at 0.0. The plain
# 'accuracy' string would threshold the raw outputs at 0.5 and under-count
# positive predictions. The metric keeps the name 'accuracy' so the history
# keys ('accuracy', 'val_accuracy') are unchanged.
model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

# Train the model, evaluating on the test set after every epoch.
history_original = model.fit(ds_train, validation_data=ds_test, epochs=15)

In [None]:
# Accuracy per epoch on the training data and on the validation (test) data.
fig, ax = plt.subplots()
ax.plot(history_original.history['accuracy'])
ax.plot(history_original.history['val_accuracy'], '')
ax.set_xlabel("Epoche")
ax.set_ylabel('Numero assegnameti corretti normalizzato')
ax.legend(['Training', 'Test'])
plt.show()

In [None]:
# Loss per epoch on the training data and on the validation (test) data.
fig, ax = plt.subplots()
ax.plot(history_original.history['loss'])
ax.plot(history_original.history['val_loss'], '')
ax.set_xlabel("Epoche")
ax.set_ylabel('Errore calcolato tramite loss function')
ax.legend(['Training', 'Test'])
plt.show()