## 1.Import Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Run tf.function-wrapped code eagerly instead of as compiled graphs.
# This was added as a workaround for an error hit in the training section
# (the error itself is not recorded in this notebook).
# NOTE(review): presumably this slows training noticeably — confirm whether the
# workaround is still required and drop it if the error no longer reproduces.
tf.config.run_functions_eagerly(True)

## 2.Exploring Data

In [3]:
# Reading the dataset
# NOTE(review): the absolute '/content/...' path looks like a Google Colab location;
# it has to be adjusted when the notebook runs anywhere else.
# The file is decoded as ISO-8859-1 — presumably it is not valid UTF-8; TODO confirm.
df = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# Count missing values per column
# (v1 and v2 are complete; the "Unnamed" columns are almost entirely empty)
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [4]:
# Total number of rows in the raw dataset
len(df)

5572

In [6]:
# Dropping unnamed columns: keep only the label column (v1, ham/spam)
# and the message text column (v2)
df = df[['v1', 'v2']]

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:

# Flag rows whose message text is made up entirely of whitespace characters
mask = df['v2'].str.isspace()

# Report and discard such rows, but only when at least one was found
if mask.sum() > 0:
    print(f'There are {mask.sum()} empty space strings in the dataset')
    df = df.loc[~mask]


In [8]:
# Check for number of unique values per column
# (v2 has fewer distinct texts than there are rows, so some messages repeat)
df.nunique()

v1       2
v2    5169
dtype: int64

In [9]:
# Check the class balance: ham messages heavily outnumber spam
df['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [10]:
# Get the length of the longest message.
# Note: .str.len() counts characters per string, so this value is a character
# count, not a number of tokens.
max_seq_length = df['v2'].str.len().max()

print("Max sequence length:", max_seq_length)

Max sequence length: 910


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a lower-cased bag-of-words vocabulary, ignoring English stop words
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X = vectorizer.fit_transform(df['v2'])

# The vocabulary size is the number of feature names the vectorizer learned
n_unique_words = vectorizer.get_feature_names_out().shape[0]
print("Unique words:", n_unique_words)


Unique words: 8404


## 3.Preprocessing

In [12]:
# Separating X (message text, column v2) and y (ham/spam label, column v1)
X = df['v2']
y = df['v1']

display(X, y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [13]:
# Encoding the Labels (ham -> 0, spam -> 1)
from sklearn.preprocessing import LabelEncoder

# Encode straight from the label column rather than from `y` itself, so that
# re-running this cell never refits the encoder on already-encoded integers
# (which would replace le.classes_ ['ham', 'spam'] with [0, 1]).
le = LabelEncoder()
y = le.fit_transform(df['v1'])

display(y)

array([0, 0, 1, ..., 0, 0, 0])

In [14]:
# Preparing the data
max_len = 150  # length every tokenized message is padded/truncated to (pad_sequences maxlen)
num_words = 1000  # vocabulary cap given to the Tokenizer; also the Embedding input size

In [15]:
# Tokenizing: build the word index from the message texts, capped at num_words
# NOTE(review): the tokenizer is fitted on every message, including those that
# later end up in the test split — consider fitting on the training texts only.
tok = Tokenizer(num_words=num_words)
tok.fit_on_texts(df['v2'])

In [16]:
# Train and Test Split: hold out 30% of the messages for testing, with a fixed seed
from sklearn.model_selection import train_test_split

# NOTE(review): the split is not stratified although the classes are imbalanced
# — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [17]:
# Convert each message into a list of integer word indices using the fitted tokenizer
X_train_mat = tok.texts_to_sequences(X_train)
X_test_mat = tok.texts_to_sequences(X_test)

In [18]:
# Pad sequences to the same length: every sequence comes out exactly max_len long
# (shorter ones are padded, longer ones are truncated)
X_train_padded = pad_sequences(X_train_mat, maxlen=max_len)
X_test_padded = pad_sequences(X_test_mat, maxlen=max_len)

In [19]:
# Reshape the label arrays into column vectors with a single column per row
y_train = y_train.reshape((-1,1))
y_test = y_test.reshape((-1,1))

## 4.Modeling

In [20]:


def _build_recurrent_classifier(layer_cls, num_words, max_len, units,
                                dropout_rate, embedding_dim=100):
    """Assemble the binary classifier shared by the RNN, LSTM and GRU variants.

    The network is: Embedding -> recurrent layer (returning the full sequence)
    -> Dropout -> recurrent layer (returning only the last output) -> Dropout
    -> single sigmoid unit.

    Args:
        layer_cls: Recurrent layer class to stack (SimpleRNN, LSTM or GRU).
        num_words: Size of the embedding vocabulary.
        max_len: Length of the padded input sequences.
        units: Number of units in each recurrent layer.
        dropout_rate: Dropout rate applied after each recurrent layer.
        embedding_dim: Width of the embedding vectors.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    return Sequential([
        Embedding(num_words, embedding_dim, input_length=max_len),
        # First recurrent layer must emit every timestep to feed the second one
        layer_cls(units, activation='relu', return_sequences=True),
        Dropout(dropout_rate),
        layer_cls(units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])


def create_rnn_model(num_words, max_len, units, dropout_rate, embedding_dim=100):
    """Build the uncompiled two-layer SimpleRNN classifier.

    See ``_build_recurrent_classifier`` for the architecture and arguments.
    """
    return _build_recurrent_classifier(SimpleRNN, num_words, max_len, units,
                                       dropout_rate, embedding_dim)


def create_lstm_model(num_words, max_len, units, dropout_rate, embedding_dim=100):
    """Build the uncompiled two-layer LSTM classifier.

    See ``_build_recurrent_classifier`` for the architecture and arguments.
    """
    return _build_recurrent_classifier(LSTM, num_words, max_len, units,
                                       dropout_rate, embedding_dim)


def create_gru_model(num_words, max_len, units, dropout_rate, embedding_dim=100):
    """Build the uncompiled two-layer GRU classifier.

    See ``_build_recurrent_classifier`` for the architecture and arguments.
    """
    return _build_recurrent_classifier(GRU, num_words, max_len, units,
                                       dropout_rate, embedding_dim)


## 5.Compiling

In [24]:
# Hyperparameters shared by all three architectures
units = 64
dropout_rate = 0.2

# Build the three networks (RNN, LSTM, GRU) with identical settings
rnn_model = create_rnn_model(num_words, max_len, units, dropout_rate)
lstm_model = create_lstm_model(num_words, max_len, units, dropout_rate)
gru_model = create_gru_model(num_words, max_len, units, dropout_rate)

# Compile each one the same way: Adam optimizer, binary cross-entropy loss, accuracy metric
for net in (rnn_model, lstm_model, gru_model):
    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [25]:
# Early-stopping callback used by the training runs below:
# stop after 3 epochs without improvement and restore the best weights seen.
# No `monitor` is passed, so the Keras default quantity is tracked
# (val_loss according to the Keras documentation).
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)

## 6.Training

In [26]:
# Train the models
# Every run uses the same settings: up to 10 epochs, batches of 128,
# 20% of the training data held out for validation, and early stopping.
fit_options = {
    'epochs': 10,
    'batch_size': 128,
    'validation_split': 0.2,
    'callbacks': [early_stopping],
}

# RNN
rnn_history = rnn_model.fit(X_train_padded, y_train, **fit_options)

# LSTM
lstm_history = lstm_model.fit(X_train_padded, y_train, **fit_options)

# GRU
gru_history = gru_model.fit(X_train_padded, y_train, **fit_options)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


## 7.Metrics

In [27]:
# Best (highest) validation accuracy each model reached over its epochs
rnn_val_acc, lstm_val_acc, gru_val_acc = (
    max(run.history['val_accuracy'])
    for run in (rnn_history, lstm_history, gru_history)
)

# Highest score across the three models
max_val_acc = max(rnn_val_acc, lstm_val_acc, gru_val_acc)

print("RNN Validation Accuracy:", rnn_val_acc)
print("LSTM Validation Accuracy:", lstm_val_acc)
print("GRU Validation Accuracy:", gru_val_acc)
print('-' * 50)
print("Max Validation Accuracy:", max_val_acc)

RNN Validation Accuracy: 0.9820512533187866
LSTM Validation Accuracy: 0.9679487347602844
GRU Validation Accuracy: 0.985897421836853
--------------------------------------------------
Max Validation Accuracy: 0.985897421836853


## 8.Saving the Models

In [28]:
# Define a function to save the models
from tensorflow.keras.models import save_model
from datetime import datetime

def save_model(model, prefix=''):
    """Save ``model`` to a timestamped ``.h5`` file and report where it went.

    The file name has the form ``{prefix}model_{YYYY-MM-DD_HH-MM-SS}.h5``.

    NOTE(review): this definition rebinds the name ``save_model`` that is
    imported from ``tensorflow.keras.models`` just above, so that import is
    effectively unused.

    Args:
        model: Object exposing a ``save(path)`` method.
        prefix: Text prepended verbatim to the file name (e.g. ``'rnn_'``).

    Returns:
        The file name that was passed to ``model.save``.
    """
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"{prefix}model_{current_datetime}.h5"

    model.save(filename)
    print(f"Model saved to {filename}")
    # Hand the generated name back so callers can reload or log the file
    return filename