# Classifying Movie Reviews Using RNN

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

### Loading and Preprocessing the Dataset

Load the dataset using pd.read_csv() and assign column names.<br>
Lowercase and tokenize the text using pandas string methods.<br>
Encode labels into numeric form with LabelEncoder().<br>
Split the data into training and testing sets using train_test_split().<br>
Create a vocabulary set from all unique words in the dataset.<br>
Map each unique word to a unique index, starting at 1 so that index 0 stays free for padding.<br>
Define an encode_and_pad() function that converts tokenized sentences into sequences of indices and pads them to the maximum sequence length.<br>
Process training and testing texts with encode_and_pad() to prepare data for modeling.<br>

In [2]:
# Load the reviews and name the two columns explicitly.
df = pd.read_csv('IMDB-Dataset.csv', names=["text","label"], header=0)

# Lowercase, then split on whitespace: each review becomes a list of tokens.
df['text'] = df['text'].str.lower().str.split()

# Turn the label column into integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

# Fixed random_state keeps the 80/20 split identical between runs.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Vocabulary = every distinct token in the whole frame (train and test alike).
vocab = {word for phrase in df['text'] for word in phrase}

# Number the words in sorted order. Iterating the set directly would give a
# different word -> index mapping on every kernel restart, because the
# iteration order of a set of strings depends on per-process hash
# randomisation. Indices start at 1; 0 is left for padding.
word_to_idx = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}

# Every review is later padded to the token count of the longest review.
max_length = df['text'].str.len().max()

def encode_and_pad(text):
    """Convert a token list into vocabulary indices, right-padded with zeros.

    Each token is looked up in the module-level ``word_to_idx`` mapping
    (a token missing from it raises ``KeyError``). Zeros are then appended
    until the list holds ``max_length`` entries; a list that is already
    that long or longer is returned without padding.
    """
    indices = []
    for token in text:
        indices.append(word_to_idx[token])
    indices.extend([0] * (max_length - len(indices)))
    return indices

# Replace the token lists in both splits with their padded index sequences.
train_data['text'], test_data['text'] = (
    train_data['text'].apply(encode_and_pad),
    test_data['text'].apply(encode_and_pad),
)

In [3]:
# Inspect the frame after tokenization and label encoding.
df

Unnamed: 0,text,label
0,"[one, of, the, other, reviewers, has, mentione...",1
1,"[a, wonderful, little, production., <br, /><br...",1
2,"[i, thought, this, was, a, wonderful, way, to,...",1
3,"[basically, there's, a, family, where, a, litt...",0
4,"[petter, mattei's, ""love, in, the, time, of, m...",1
...,...,...
49995,"[i, thought, this, movie, did, a, down, right,...",1
49996,"[bad, plot,, bad, dialogue,, bad, acting,, idi...",0
49997,"[i, am, a, catholic, taught, in, parochial, el...",0
49998,"[i'm, going, to, have, to, disagree, with, the...",0


### Creating Dataset and Data Loader

Define a custom SentimentDataset class inheriting from PyTorch’s Dataset.<br>
Store the texts and labels from the input data within the class.<br>
Implement the \_\_len\_\_ method to return the total number of samples.<br>
Implement the \_\_getitem\_\_ method to retrieve a single sample by index, converting text and label to PyTorch tensors with the correct data types.<br>
Create dataset instances for the training and testing data.<br>
Wrap the datasets in DataLoaders with a batch size of 32.<br>
Shuffle training data in DataLoader for randomness, keep test data ordered.<br>
This prepares the data for efficient batch loading during model training and evaluation.<br>

In [4]:
class SentimentDataset(Dataset):
    """Map-style dataset over a frame with ``'text'`` and ``'label'`` columns.

    Indexing yields a ``(text, label)`` pair of ``torch.long`` tensors.
    """

    def __init__(self, data):
        """Keep the underlying arrays of the ``'text'`` and ``'label'`` columns."""
        self.texts = data['text'].values
        self.labels = data['label'].values

    def __len__(self):
        """Number of samples, taken from the stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of int64 tensors."""
        as_long = dict(dtype=torch.long)
        sample = (
            torch.tensor(self.texts[idx], **as_long),
            torch.tensor(self.labels[idx], **as_long),
        )
        return sample
    
train_dataset = SentimentDataset(train_data)
test_dataset = SentimentDataset(test_data)

# Only the training batches are shuffled. The test loader keeps the dataset
# order so evaluation is reproducible and predictions line up with test_data.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Defining the RNN Model

Define a SentimentRNN class inheriting from PyTorch’s nn.Module.<br>
Initialize an embedding layer to convert word indices into dense vectors.<br>
Add an RNN layer to process the input sequences.<br>
Include a fully connected layer to map RNN outputs to the final output size.<br>
In the forward method, pass the input sequences through the embedding layer.<br>
Create an initial hidden state of zeros and process the sequence using the RNN layer.<br>
Take the output from the last time step and pass it through the fully connected layer to produce predictions.<br>
Set the vocabulary size, embedding size, hidden size and output size parameters.<br>
Instantiate the SentimentRNN model with these parameters.<br>

In [5]:
class SentimentRNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each word embedding.
        hidden_size: dimensionality of the RNN hidden state.
        output_size: number of values produced per input sequence.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return a ``(batch, output_size)`` tensor for ``(batch, seq_len)`` indices.

        NOTE(review): the prediction is read from the final time step. The
        notebook right-pads sequences with index 0, so for most inputs that
        step is padding; consider packed sequences or gathering the last
        non-pad step.
        """
        x = self.embedding(x)
        # Size the initial state from the layer itself rather than from a
        # notebook-level global, so the module works no matter which
        # variables exist (or have been changed) outside the class.
        h0 = torch.zeros(
            self.rnn.num_layers,
            x.size(0),
            self.rnn.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters. The embedding table gets one row more than the vocabulary
# because word indices start at 1 and index 0 is used for padding.
vocab_size = 1 + len(vocab)
embed_size, hidden_size = 128, 128
output_size = 2

model = SentimentRNN(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    output_size=output_size,
)


### Training the Model

Define the loss function as cross-entropy loss.<br>
Set up the Adam optimizer with a learning rate of 0.001.<br>
Specify the number of training epochs.<br>
For each epoch, set the model to training mode.<br>
Initialize the epoch loss to zero.<br>
For each batch of texts and labels from the training loader: compute the model outputs, calculate the loss and zero the optimizer gradients.<br>
Perform backpropagation to compute the gradients, update the model weights with the optimizer, and accumulate the batch loss into the epoch loss.<br>

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Batches must live on the same device as the model's weights. Reading the
# device from the parameters keeps this loop correct if the model is moved
# (e.g. model.to('cuda')) before this cell runs.
device = next(model.parameters()).device

num_epochs = 10
for epoch in range(num_epochs):
    print(f"\n🚀 Starting Epoch {epoch+1}/{num_epochs}", flush=True)

    model.train()
    epoch_loss = 0
    for i, (texts, labels) in enumerate(train_loader):
        # Lightweight progress report every 50 batches.
        if i % 50 == 0:
            print(f"Batch {i}/{len(train_loader)} loaded")

        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        loss = criterion(outputs, labels)

        # Clear gradients from the previous step, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Mean of the per-batch losses over this epoch.
    avg_loss = epoch_loss / len(train_loader)

    print(f'✅ Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}', flush=True)



🚀 Starting Epoch 1/10
Batch 0/1250 loaded
