<a href="https://colab.research.google.com/github/MishraShardendu22/RNN-Deep-Learning-Project-Implementation/blob/main/RNN_LSTM_GRU_Project_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install tensorflow



In [4]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense

In [14]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import kagglehub

# Download latest version of the dataset to ensure it's available
path = kagglehub.dataset_download("jcblaise/imdb-sentiments")
print("Path to dataset files:", path)

# List the contents of the downloaded directory to inspect its structure
print("Contents of downloaded directory:")
for root, dirs, files in os.walk(path):
    for name in files:
        print(os.path.join(root, name))
    for name in dirs:
        print(os.path.join(root, name))

# Load the datasets
# The dataset files are directly in the path returned by kagglehub.dataset_download
base_path = path # Corrected base_path
train_df = pd.read_csv(os.path.join(base_path, 'train.csv')) # Corrected filename
test_df = pd.read_csv(os.path.join(base_path, 'test.csv'))   # Corrected filename

# Handle potential NaN values in the 'text' columns by filling them with empty strings
train_df['text'] = train_df['text'].fillna('')
test_df['text'] = test_df['text'].fillna('')

# Combine train and test reviews for tokenizer fitting to ensure all words are covered
all_reviews = pd.concat([train_df['text'], test_df['text']], axis=0)

# Initialize tokenizer
num_words = 10000  # Consider the top 10,000 most frequent words
tokenizer = Tokenizer(num_words=num_words, oov_token='<unk>')
tokenizer.fit_on_texts(all_reviews)

# Get word index and vocabulary size
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1 # +1 for padding token

# Convert text to sequences
X_train_sequences = tokenizer.texts_to_sequences(train_df['text'])
X_test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Determine max sequence length (e.g., 250 words)
max_sequence_len = 250

# Pad sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_len, padding='post', truncating='post')

# Prepare labels
y_train = train_df['sentiment'].values # Changed 'label' to 'sentiment'
# y_test is not available in test_df and not used in subsequent training/validation steps

# Define embedding dimension
embedding_dim = 100

# Split training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_padded, y_train, test_size=0.2, random_state=42
)

print(f"Vocabulary Size: {vocab_size}")
print(f"Max Sequence Length: {max_sequence_len}")
print(f"Embedding Dimension: {embedding_dim}")
print(f"X_train_split shape: {X_train_split.shape}")
print(f"y_train_split shape: {y_train_split.shape}")
print(f"X_val_split shape: {X_val_split.shape}")
print(f"y_val_split shape: {y_val_split.shape}")
print(f"X_test_padded shape: {X_test_padded.shape}")
# print(f"y_test shape: {y_test.shape}") # Removed as y_test is not available

Using Colab cache for faster access to the 'imdb-sentiments' dataset.
Path to dataset files: /kaggle/input/imdb-sentiments
Contents of downloaded directory:
/kaggle/input/imdb-sentiments/valid.txt
/kaggle/input/imdb-sentiments/test.txt
/kaggle/input/imdb-sentiments/train.txt
/kaggle/input/imdb-sentiments/train.csv
/kaggle/input/imdb-sentiments/test.csv
Vocabulary Size: 105890
Max Sequence Length: 250
Embedding Dimension: 100
X_train_split shape: (20000, 250)
y_train_split shape: (20000,)
X_val_split shape: (5000, 250)
y_val_split shape: (5000,)
X_test_padded shape: (11001, 250)


In [15]:
# Build the Simple RNN model
rnn_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_len),
    SimpleRNN(128, return_sequences=False),
    Dense(1, activation='sigmoid')
])

# Compile the Simple RNN model
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
rnn_model.summary()

# Train the Simple RNN model
rnn_history = rnn_model.fit(
    X_train_split, y_train_split,
    epochs=5,
    batch_size=32,
    validation_data=(X_val_split, y_val_split)
)
print("Simple RNN Training Complete!")

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 238ms/step - accuracy: 0.5003 - loss: 0.7031 - val_accuracy: 0.5000 - val_loss: 0.6968
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 238ms/step - accuracy: 0.5116 - loss: 0.6961 - val_accuracy: 0.5150 - val_loss: 0.6936
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 239ms/step - accuracy: 0.5134 - loss: 0.6939 - val_accuracy: 0.4904 - val_loss: 0.6978
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 237ms/step - accuracy: 0.5467 - loss: 0.6797 - val_accuracy: 0.5028 - val_loss: 0.7091
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 240ms/step - accuracy: 0.5633 - loss: 0.6534 - val_accuracy: 0.4958 - val_loss: 0.7279
Simple RNN Training Complete!


In [16]:
# Build the LSTM model
lstm_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_len),
    LSTM(128, return_sequences=False),
    Dense(1, activation='sigmoid')
])

# Compile the LSTM model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
lstm_model.summary()

# Train the LSTM model
lstm_history = lstm_model.fit(
    X_train_split, y_train_split,
    epochs=3,
    batch_size=32,
    validation_data=(X_val_split, y_val_split)
)
print("LSTM Training Complete!")

Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 554ms/step - accuracy: 0.5088 - loss: 0.6940 - val_accuracy: 0.5116 - val_loss: 0.6928
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 546ms/step - accuracy: 0.5277 - loss: 0.6893 - val_accuracy: 0.5088 - val_loss: 0.6997
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 553ms/step - accuracy: 0.5612 - loss: 0.6608 - val_accuracy: 0.5300 - val_loss: 0.7163
LSTM Training Complete!


In [17]:
# Build the GRU model
gru_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_len),
    GRU(128, return_sequences=False),
    Dense(1, activation='sigmoid')
])

# Compile the GRU model
gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
gru_model.summary()

# Train the GRU model
gru_history = gru_model.fit(
    X_train_split, y_train_split,
    epochs=3,
    batch_size=32,
    validation_data=(X_val_split, y_val_split)
)
print("GRU Training Complete!")

Epoch 1/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 441ms/step - accuracy: 0.5142 - loss: 0.6926 - val_accuracy: 0.5046 - val_loss: 0.6934
Epoch 2/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 455ms/step - accuracy: 0.5599 - loss: 0.6781 - val_accuracy: 0.5108 - val_loss: 0.6988
Epoch 3/3
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 455ms/step - accuracy: 0.5992 - loss: 0.6154 - val_accuracy: 0.5624 - val_loss: 0.6824
GRU Training Complete!
