<a href="https://colab.research.google.com/github/Gourdy09/machine_learning_certification_code/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string

from nltk.corpus import stopwords
from wordcloud import WordCloud

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import warnings
warnings.filterwarnings('ignore')

try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow_datasets as tfds
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
trdf = pd.read_csv(train_file_path, sep="\t")
tedf = pd.read_csv(test_file_path, sep="\t")

trdf.columns = ['target', 'msg']
tedf.columns = ['target', 'msg']

trdf['target'] = trdf['target'].map({'ham': 0, 'spam': 1})

In [None]:
trdf.sample(5)



In [None]:
trdf.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.countplot(x='target', data=trdf)
plt.show()

In [None]:
# balance data because too many ham not enough spam
ham_msg = trdf[trdf.target == 0]
spam_msg = trdf[trdf.target == 1]
ham_msg = ham_msg.sample(n=len(spam_msg), random_state=42)

balanced_data = pd.concat([ham_msg, spam_msg]).reset_index(drop=True)
sns.countplot(data = balanced_data, x='target')

In [None]:
# clean up punctuation
def remove_punc(text : string):
  temp = str.maketrans('', '', string.punctuation)
  return text.translate(temp)

balanced_data['msg'] = balanced_data['msg'].apply(lambda x: remove_punc(x))
balanced_data.head()

In [None]:
# clean stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text : string):
  imp_words = []

  for word in str(text).split():
    word = word.lower()
    if word not in stopwords.words('english'):
      imp_words.append(word)
  output = " ".join(imp_words)
  return output

balanced_data['msg'] = balanced_data['msg'].apply(lambda text: remove_stopwords(text))
balanced_data.head()

In [None]:
X = trdf.msg
y = trdf.target

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

train_sequences = tokenizer.texts_to_sequences(X)

train_sequences = pad_sequences(train_sequences, maxlen=50, padding='post', truncating='post')

In [None]:
# model
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                                    output_dim=32,
                                    input_length=50))
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'],
              optimizer = 'adam')
es = EarlyStopping(patience=3,
                   monitor = 'val_accuracy',
                   restore_best_weights = True)

lr = ReduceLROnPlateau(patience = 3,
                       monitor = 'val_loss',
                       factor = 0.8,
                       verbose = 0)

history = model.fit(train_sequences, y,
                    epochs=20,
                    batch_size=32,
                    callbacks = [lr, es],
                    shuffle = True
                   )

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  # Tokenize the string into a sequence of integers
  text_input = tokenizer.texts_to_sequences([pred_text])  # Wrapping pred_text in a list

# Pad the sequence to make sure it's the right length
  text_input = pad_sequences(text_input, maxlen=50, padding='post', truncating='post')

# Predict using the model
  prediction = model.predict(text_input)

  return [prediction[0][0], 'spam' if prediction >= 0.0013670842 else 'ham']

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    print(prediction)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
