<a href="https://colab.research.google.com/github/GhodbaneMohammedHani/RNN-SMS-Text-Classifier/blob/main/sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Use TensorFlow 2.x in Colab
!pip install nbformat==5.9.2 nbconvert==7.14.0
%tensorflow_version 2.x
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras

# Install tensorflow-datasets if not already
!pip install -q tensorflow-datasets

print(tf.__version__)


In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Load the data, map the labels to 0 (ham) / 1 (spam) and oversample the spam class
# with WordNet synonym augmentation.
import nlpaug.augmenter.word as naw
import nltk

# Fetch the NLTK resources the WordNet synonym augmenter needs. These calls used to be
# disabled inside a string literal, which breaks a fresh runtime; nltk.download skips
# packages that are already present and up to date, so they are cheap to keep enabled.
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

aug = naw.SynonymAug(aug_src='wordnet')


def load_split(path):
    """Read a tab-separated (label, message) file and encode the label column.

    Args:
        path: Path to a headerless TSV file whose first column is 'ham' or 'spam'
            and whose second column is the message text.

    Returns:
        A DataFrame with columns 'label' (0 for ham, 1 for spam) and 'message'.

    Raises:
        ValueError: If a label other than 'ham' or 'spam' is present; such rows would
            otherwise silently become NaN and corrupt the training targets.
    """
    frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'message'])
    encoded = frame['label'].map({'ham': 0, 'spam': 1})
    if encoded.isna().any():
        unknown = sorted(frame.loc[encoded.isna(), 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label(s) in {path}: {unknown}")
    frame['label'] = encoded.astype(int)
    return frame


def augment_once(text):
    """Return a single synonym-augmented variant of `text` as a string.

    Depending on the nlpaug version, `augment` returns either a list of strings or a
    plain string; indexing a plain string with [0] would keep only its first
    character, so both shapes are handled. If the augmenter yields nothing, the
    original text is returned unchanged.
    """
    result = aug.augment(text)
    if isinstance(result, str):
        return result
    return result[0] if result else text


train_df = load_split(train_file_path)
valid_df = load_split(test_file_path)

# Filter spam messages
spam_df = train_df[train_df['label'] == 1]

# 5x augmentation per spam message (five independent draws per message).
augmented_texts = [
    augment_once(msg)
    for msg in spam_df['message']
    for _ in range(5)
]
aug_df = pd.DataFrame({
    'message': augmented_texts,
    'label': 1
})

# Combine and shuffle
train_df = (
    pd.concat([train_df, aug_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# Inspect
print(train_df['label'].value_counts())
print(train_df[train_df['label']==1].head())

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary and sequence settings; max_len and tokenizer are reused at inference time.
vocab_size = 10000
max_len = 100
oov_token = "<OOV>"

# Build the word index from the training messages only; words not seen there
# are encoded with the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_df['message'])

# Encode each split as integer sequences, then pad / truncate at the end to max_len.
X_train, X_valid = (
    pad_sequences(
        tokenizer.texts_to_sequences(split['message']),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )
    for split in (train_df, valid_df)
)

# Targets as NumPy arrays, in the same row order as the encoded messages.
y_train, y_valid = (split['label'].values for split in (train_df, valid_df))


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional

embedding_dim = 128

# Two stacked bidirectional LSTMs over learned word embeddings, followed by a small
# dense head; the single sigmoid unit outputs a value in (0, 1) that is trained
# against the 0 = ham / 1 = spam labels.
model = Sequential([
    # The deprecated `input_length` argument is omitted: the sequence length is
    # supplied by model.build() below, and newer Keras releases warn about it.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # return_sequences=True so the second recurrent layer receives every time step.
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.4),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
# Fix the input shape (any batch size, max_len tokens) so summary() can report
# output shapes and parameter counts before training.
model.build(input_shape=(None,max_len))
model.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),metrics=['accuracy'])
model.summary()

In [None]:
# Train the model and evaluate on the validation split after every epoch.
# `history` keeps the per-epoch metrics; `epochs` is read again by the plotting cell.
epochs = 10
batch_size = 32
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_valid, y_valid),
)

In [None]:
# Plot the learning curves recorded by model.fit().
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
# Derive the x-axis from what was actually recorded rather than from the `epochs`
# setting, so the plot stays valid if training ran for a different number of epochs.
epochs_range = range(len(acc))

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

# Keep the axis labels of the two panels from overlapping.
fig.tight_layout()
plt.show()


In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single SMS message as ham or spam.

  Args:
    pred_text: The raw message text.

  Returns:
    A two-element list [probability, label]: `probability` is the model's sigmoid
    output as a Python float and `label` is 'spam' when that value is above 0.5,
    otherwise 'ham'.
  """
  # Apply the same encoding and padding that was used for the training data.
  tokenized_text = tokenizer.texts_to_sequences([pred_text])
  padded_text = pad_sequences(tokenized_text,maxlen=max_len,padding='post',truncating='post')
  # One message in, one sigmoid unit out -> take the single value of the result.
  # verbose=0 suppresses the progress bar Keras would otherwise print on every call.
  prob = float(model.predict(padded_text, verbose=0)[0][0])
  label = 'spam' if prob > 0.5 else 'ham'
  # Return the probability itself (not a hard 0/1), as the header comment specifies.
  return [prob, label]

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven hand-labelled messages and print the outcome.

  Each message is passed to predict_message and the label at index 1 of the result
  is compared with the expected 'ham' / 'spam' answer. A success message is printed
  only if every prediction matches; otherwise a "keep trying" message is printed.
  Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected labels, in the same order as test_messages.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    # Only the label (index 1) is checked; the value at index 0 is ignored here.
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")
test_predictions()
