In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets #comment it if you already have it installed
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fetch the two tab-separated SMS files (train and validation splits) with the
# shell's wget.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Relative paths matching the file names at the end of each URL above.
train_file_path = "train-data.tsv"
# The "valid" split is what the rest of the notebook refers to as the test data.
test_file_path = "valid-data.tsv"

In [None]:
# Both TSV files are read without a header row, so columns are addressed by
# position: column 0 holds the "ham"/"spam" label and column 1 holds the
# message text.
df_train = pd.read_csv(train_file_path, sep="\t", header=None)
df_test = pd.read_csv(test_file_path, sep="\t", header=None)

# Encode the labels as int64 targets (ham -> 0, spam -> 1) with one explicit
# mapping instead of chained `replace` calls, which depend on pandas
# downcasting an object column (flagged with a FutureWarning by newer pandas
# releases). Any unexpected label becomes NaN and makes the int64 cast fail
# loudly instead of slipping through.
for frame in (df_train, df_test):
  frame[0] = frame[0].map({"ham": 0, "spam": 1}).astype("int64")

In [None]:
def _frame_to_dataset(frame):
  """Wrap a loaded frame as a tf.data.Dataset of (column 1, column 0) pairs.

  Column 1 is the message text and column 0 is the integer label, so each
  dataset element is a (text, label) pair.
  """
  messages, labels = frame[1], frame[0]
  return tf.data.Dataset.from_tensor_slices((messages, labels))

train_dataset = _frame_to_dataset(df_train)
test_dataset = _frame_to_dataset(df_test)

In [None]:
# Collect every distinct token that occurs in either split, then build the
# encoder that turns tokens into integer ids.
tokenizer=tfds.deprecated.text.Tokenizer()
vocabulary_set = set()
for text_tensor, _ in train_dataset.concatenate(test_dataset):
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)
vocab_size = len(vocabulary_set)
# A set of strings has no stable iteration order across interpreter sessions
# (string hashing is randomized), so hand the encoder a sorted list: the same
# data then always yields the same token -> id assignment.
encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary_set))

In [None]:
def encode(text_tensor,label):
  """Encode one message with the module-level `encoder`.

  Runs eagerly (it calls `.numpy()` on the text tensor), so it is meant to be
  invoked through `tf.py_function`. The label is passed through unchanged.
  """
  raw_text = text_tensor.numpy()
  return encoder.encode(raw_text), label

def encoded_map_fn(text,label):
  """Dataset-map wrapper around `encode`.

  `encode` is called through `tf.py_function`, with both outputs declared as
  int64. The static shapes are then set explicitly: a 1-D sequence of unknown
  length for the token ids and a scalar for the label.
  """
  token_ids, label_out = tf.py_function(
      func=encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )
  token_ids.set_shape([None])
  label_out.set_shape([])
  return token_ids, label_out
# Apply the text encoding to both splits.
train_dataset_encoded, test_dataset_encoded = (
    split.map(encoded_map_fn) for split in (train_dataset, test_dataset)
)

In [None]:
# Sanity check: show the first ten token ids and the label of two examples.
preview = train_dataset_encoded.take(2)
for train_example, train_label in preview:
  leading_ids = train_example[:10].numpy()
  label_value = train_label.numpy()
  print(f"Encoded Text: {leading_ids}")
  print(f"Label: {label_value}")

In [None]:
BUFFER_SIZE = 1000
BATCH_SIZE = 32

# Only the training split is shuffled; both splits are padded per batch so
# that messages of different lengths can be stacked together.
shuffled_train = train_dataset_encoded.shuffle(BUFFER_SIZE)
train_batches = shuffled_train.padded_batch(BATCH_SIZE)
test_batches = test_dataset_encoded.padded_batch(BATCH_SIZE)

In [None]:
# Bag-of-embeddings classifier: embed each token id, average the embeddings
# over the sequence, and map the average to a single sigmoid score.
model = tf.keras.Sequential([
    # `padded_batch` fills short messages with id 0 (which the text encoder
    # reserves for padding). Masking that id keeps the padding vectors out of
    # the average; otherwise the pooled value depends on how long the other
    # messages in the batch happen to be, and unpadded inputs at prediction
    # time are treated differently from padded training inputs.
    keras.layers.Embedding(encoder.vocab_size, 16, mask_zero=True),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

In [None]:
# Train for ten epochs; each epoch validates on 30 batches of the test split.
history = model.fit(
    train_batches,
    validation_data=test_batches,
    validation_steps=30,
    epochs=10,
)

In [None]:
def predict_message(pred_text):
  """Classify a single SMS message with the trained model.

  Args:
    pred_text: Raw message text to classify.

  Returns:
    A two-element list `[score, label]`: `score` is the model's sigmoid
    output for the message, and `label` is "ham" when the score is below 0.5
    and "spam" otherwise.
  """
  # Token ids are integer indices into the embedding table, so keep them as
  # int64 (the dtype the training pipeline produces) instead of casting them
  # to float32.
  token_ids = tf.cast(encoder.encode(pred_text), tf.int64)
  # The model expects a leading batch dimension: (sequence,) -> (1, sequence).
  batch = tf.expand_dims(token_ids, 0)
  score = model.predict(batch).tolist()[0][0]
  label = "ham" if score < 0.5 else "spam"
  return [score, label]

In [None]:
# Measure accuracy on the held-out batches. The previous version printed
# `prediction[0]`, which is only defined by a later cell (so it failed on a
# fresh kernel) and is a single message's score rather than an accuracy.
# `evaluate` returns the loss followed by the compiled metrics, here accuracy.
_, test_accuracy = model.evaluate(test_batches, verbose=0)
print(f"Model Accuracy: {test_accuracy*100}")

In [None]:
# Ask for one message, classify it, and report only the label (the second
# element of the list returned by predict_message).
pred_text = input("Enter SMS: ")
prediction = predict_message(pred_text)
print("Ham means genuine message, Spam means Scam!")
print("SMS is:", prediction[1])