In [None]:
## Speech and Natural Language Processing (SNLP)

## Lab03 (Part01) 
## RNNs and LSTMs

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove-6b-100-d/glove.6B.100d.txt
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torch.optim import Adam

In [14]:
# Load the IMDB dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [15]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [16]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# Preprocess text (remove HTML tags, punctuation, lowercase)
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

In [18]:
df['review'] = df['review'].apply(preprocess_text)

In [21]:
# Convert sentiment labels to binary (positive=1, negative=0)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])  # 0 for negative, 1 for positive

df.head()

Unnamed: 0,review,sentiment,label
0,one of the other reviewers has mentioned that ...,1,1
1,a wonderful little production the filming tech...,1,1
2,i thought this was a wonderful way to spend ti...,1,1
3,basically theres a family where a little boy j...,0,0
4,petter matteis love in the time of money is a ...,1,1


In [27]:
df = df.drop('label', axis=1)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [28]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [29]:
## Load GloVe embeddings

embedding_dim = 100
glove_path = '/kaggle/input/glove-6b-100-d/glove.6B.100d.txt'
embedding_index = {}

with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

# Tokenize the text
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1

# Convert texts to sequences and pad them
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_length = 100  # Define max length for padding
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in embedding_index:
        embedding_matrix[idx] = embedding_index[word]

In [30]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = torch.tensor(self.reviews[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return review, label

In [31]:
# Create DataLoader for training and testing
train_dataset = IMDBDataset(X_train_pad, y_train.values)
test_dataset = IMDBDataset(X_test_pad, y_test.values)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False) 

In [32]:
## Vanilla RNNs Model

class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(VanillaRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Model parameters
embedding_dim = 100
hidden_dim = 128
output_dim = 1

# Initialize model, loss function, and optimizer
RNN_Model = VanillaRNN(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(RNN_Model.parameters(), lr=0.001)

In [33]:
# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for reviews, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(reviews)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}')

# Train the Vanilla RNN model
train_model(RNN_Model, train_loader, criterion, optimizer)

Epoch 1/5, Loss: 0.6664
Epoch 2/5, Loss: 0.6616
Epoch 3/5, Loss: 0.6607
Epoch 4/5, Loss: 0.6648
Epoch 5/5, Loss: 0.6616


In [34]:
## LSTM Model

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        c0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
LSTM_Model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(LSTM_Model.parameters(), lr=0.001)

# Train the LSTM model
train_model(LSTM_Model, train_loader, criterion, optimizer, num_epochs=7)

Epoch 1/7, Loss: 0.5990
Epoch 2/7, Loss: 0.4310
Epoch 3/7, Loss: 0.3682
Epoch 4/7, Loss: 0.3296
Epoch 5/7, Loss: 0.2978
Epoch 6/7, Loss: 0.2730
Epoch 7/7, Loss: 0.2464


In [35]:
## Vanilla RNNS with on-the-fly embeddings

class VanillaRNNOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(VanillaRNNOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
RNN_withOTF_Model = VanillaRNNOnTheFly(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(RNN_withOTF_Model.parameters(), lr=0.001)

# Train the Vanilla RNN model with on-the-fly embeddings
train_model(RNN_withOTF_Model, train_loader, criterion, optimizer, num_epochs=7)

Epoch 1/7, Loss: 0.6713
Epoch 2/7, Loss: 0.6198
Epoch 3/7, Loss: 0.5754
Epoch 4/7, Loss: 0.5487
Epoch 5/7, Loss: 0.5365
Epoch 6/7, Loss: 0.5140
Epoch 7/7, Loss: 0.5366


In [36]:
## LSTM with on-the-fly embeddings

class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        h0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial hidden state
        c0 = torch.zeros(1, x.size(0), hidden_dim).to(x.device)  # Initial cell state
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Use the output of the last time step
        return out

# Initialize model, loss function, and optimizer
LSTM_withOTF_Model = LSTMModelOnTheFly(vocab_size, embedding_dim, hidden_dim, output_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = Adam(LSTM_withOTF_Model.parameters(), lr=0.001)

# Train the LSTM model with on-the-fly embeddings
train_model(LSTM_withOTF_Model, train_loader, criterion, optimizer, num_epochs=5) 

Epoch 1/5, Loss: 0.6073
Epoch 2/5, Loss: 0.4935
Epoch 3/5, Loss: 0.2962
Epoch 4/5, Loss: 0.2001
Epoch 5/5, Loss: 0.1279


In [40]:
# Function to evaluate the model
def Evaluate_Model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for reviews, labels in test_loader:
            outputs = model(reviews)
            loss = criterion(outputs.squeeze(), labels)
            total_loss += loss.item()
            
            # Convert logits to probabilities and then to binary predictions
            predictions = torch.round(torch.sigmoid(outputs.squeeze()))
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)
    
    avg_loss = total_loss / len(test_loader)
    accuracy = correct_predictions / total_predictions
    return avg_loss, accuracy

In [47]:
# Evaluating the Vanilla RNN model with GloVe embeddings
RNN_Loss, RNN_Accuracy = Evaluate_Model(Model_RNN, test_loader, criterion)
print(f"Vanilla RNN with GloVe Embeddings - Loss: {RNN_Loss:.4f}, Accuracy: {RNN_Accuracy:.4f}")

# Evaluating the Vanilla RNN model with on-the-fly embeddings
RNN_WithOTF_Loss, RNN_WithOTF_Accuracy = Evaluate_Model(RNN_withOTF_Model, test_loader, criterion)
print(f"Vanilla RNN with On-the-Fly Embeddings - Loss: {RNN_WithOTF_Loss:.4f}, Accuracy: {RNN_WithOTF_Accuracy:.4f}") 

Vanilla RNN with GloVe Embeddings - Loss: 0.6612, Accuracy: 0.5826
Vanilla RNN with On-the-Fly Embeddings - Loss: 0.6238, Accuracy: 0.6944


In [49]:
# Evaluating the LSTM model with GloVe embeddings
LSTM_Loss, LSTM_Accuracy = Evaluate_Model(LSTM_Model, test_loader, criterion)
print(f"LSTM with GloVe Embeddings - Loss: {LSTM_Loss:.4f}, Accuracy: {LSTM_Accuracy:.4f}")

# Evaluating the LSTM model with on-the-fly embeddings
LSTM_withOTF_Loss, LSTM_withOTF_Accuracy = Evaluate_Model(LSTM_withOTF_Model, test_loader, criterion)
print(f"LSTM with On-the-Fly Embeddings - Loss: {LSTM_withOTF_Loss:.4f}, Accuracy: {LSTM_withOTF_Accuracy:.4f}")

LSTM with GloVe Embeddings - Loss: 0.3163, Accuracy: 0.8675
LSTM with On-the-Fly Embeddings - Loss: 0.3922, Accuracy: 0.8507
