In [5]:
# %pip install ipywidgets
# %pip install datasets
# %pip install sklearn
# %pip install numpy
# %pip install torch==2.0.1
# %pip install gensim
# %pip install scikit-learn

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import gensim.downloader as api
from sklearn.metrics import accuracy_score
import numpy as np

In [7]:
# Part 0: Dataset Preparation
# Loading the Rotten Tomatoes movie review dataset
dataset = load_dataset("rotten_tomatoes")
train_data, val_data, test_data = dataset['train'], dataset['validation'], dataset['test']

In [8]:
# Data preparation function
def prepare_data(data):
    sentences = [item['text'] for item in data]
    labels = [item['label'] for item in data]
    return sentences, labels

train_sentences, train_labels = prepare_data(train_data)
val_sentences, val_labels = prepare_data(val_data)
test_sentences, test_labels = prepare_data(test_data)

In [11]:
# Part 1: Preparing Word Embeddings
# Load pre-trained word vectors (e.g., GloVe or Word2Vec)
embedding_dim = 100  # You can choose other dimensions if available
glove_vectors = api.load("glove-wiki-gigaword-100")

# Build vocabulary from training data
def build_vocab(sentences, vectors):
    vocab = set()
    for sentence in sentences:
        for word in sentence.split():
            vocab.add(word)
    oov_words = [word for word in vocab if word not in vectors]
    return vocab, oov_words

vocab, oov_words = build_vocab(train_sentences, glove_vectors)

# Q1(a) Answer: Size of vocabulary
print("Vocabulary Size:", len(vocab))

# Q1(b) Answer: Number of OOV words
print("Number of OOV words:", len(oov_words))

# Q1(c) Handling OOV words by using random embeddings for missing words
def get_embedding_matrix(vocab, vectors, embedding_dim):
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    word_to_idx = {}
    for idx, word in enumerate(vocab):
        word_to_idx[word] = idx
        if word in vectors:  # gensim uses `in` to check if a word exists in the vocabulary
            embedding_matrix[idx] = vectors[word]
        else:
            # For OOV words, initialize random embeddings
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32), word_to_idx

embedding_matrix, word_to_idx = get_embedding_matrix(vocab, glove_vectors, embedding_dim)


Vocabulary Size: 18951
Number of OOV words: 3036


In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Define device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Part 2: Model Training & Evaluation (RNN Model)
# Define a simple RNN model
class RNNModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix, freeze=True
        )
        self.rnn = nn.RNN(embedding_dim, 64, batch_first=True)
        self.fc = nn.Linear(64, 2)  # 2 output classes for sentiment

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        # Using mean pooling over the sequence for sentence representation
        sentence_representation = torch.mean(output, dim=1)
        return self.fc(sentence_representation)

# Convert sentences to index sequences
def sentence_to_idx(sentence, word_to_idx):
    return [word_to_idx.get(word, 0) for word in sentence.split()]

train_indices = [sentence_to_idx(sentence, word_to_idx) for sentence in train_sentences]
val_indices = [sentence_to_idx(sentence, word_to_idx) for sentence in val_sentences]
test_indices = [sentence_to_idx(sentence, word_to_idx) for sentence in test_sentences]

# Convert data into PyTorch datasets with padding for consistent sequence length
def pad_sequences(sequences, max_len=None, padding_value=0):
    if max_len is None:
        max_len = max(len(seq) for seq in sequences)
    return [seq + [padding_value] * (max_len - len(seq)) for seq in sequences]

# Pad and convert to tensors
train_indices = pad_sequences(train_indices)
val_indices = pad_sequences(val_indices, max_len=len(train_indices[0]))  # same max_len as train
test_indices = pad_sequences(test_indices, max_len=len(train_indices[0]))

train_dataset = TensorDataset(torch.tensor(train_indices), torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_indices), torch.tensor(val_labels))
test_dataset = TensorDataset(torch.tensor(test_indices), torch.tensor(test_labels))

# DataLoader Preparation
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model initialization, loss, and optimizer
model = RNNModel(embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the RNN model
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        train_loss = 0
        for sentences, labels in train_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation accuracy after each epoch
        model.eval()  # Set model to evaluation mode
        all_preds, all_labels = [], []
        with torch.no_grad():
            for sentences, labels in val_loader:
                sentences, labels = sentences.to(device), labels.to(device)
                outputs = model(sentences)
                preds = outputs.argmax(dim=1).cpu().tolist()  # Detach and move to CPU
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().tolist())  # Detach and move to CPU

        val_acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}: Validation Accuracy = {val_acc * 100:.2f}%")
        model.train()  # Set model back to training mode

train(model, train_loader, val_loader, criterion, optimizer)

Epoch 1: Validation Accuracy = 65.85%
Epoch 2: Validation Accuracy = 63.41%
Epoch 3: Validation Accuracy = 70.17%
Epoch 4: Validation Accuracy = 72.42%
Epoch 5: Validation Accuracy = 71.48%
Epoch 6: Validation Accuracy = 72.70%
Epoch 7: Validation Accuracy = 73.45%
Epoch 8: Validation Accuracy = 73.92%
Epoch 9: Validation Accuracy = 73.83%
Epoch 10: Validation Accuracy = 74.02%


# Part 3: Enhancement (Using BiLSTM, BiGRU, and CNN)

In [16]:
# Define BiLSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.bilstm = nn.LSTM(embedding_dim, 64, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(64 * 2, 2)  # 2 output classes

    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, _) = self.bilstm(x)
        sentence_representation = torch.mean(output, dim=1)
        return self.fc(sentence_representation)

In [17]:
class BiGRUModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(BiGRUModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.bigru = nn.GRU(embedding_dim, 64, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(64 * 2, 2)  # 2 output classes

    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.bigru(x)
        sentence_representation = torch.mean(output, dim=1)
        return self.fc(sentence_representation)

In [18]:
class CNNModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.conv1 = nn.Conv2d(1, 100, (3, embedding_dim))  # 3-gram filter
        self.conv2 = nn.Conv2d(1, 100, (4, embedding_dim))  # 4-gram filter
        self.conv3 = nn.Conv2d(1, 100, (5, embedding_dim))  # 5-gram filter
        self.fc = nn.Linear(100 * 3, 2)  # 2 output classes

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # Add channel dimension
        x1 = torch.relu(self.conv1(x)).squeeze(3)
        x2 = torch.relu(self.conv2(x)).squeeze(3)
        x3 = torch.relu(self.conv3(x)).squeeze(3)

        # Apply max pooling over time
        x1 = torch.max(x1, dim=2)[0]
        x2 = torch.max(x2, dim=2)[0]
        x3 = torch.max(x3, dim=2)[0]

        x = torch.cat((x1, x2, x3), dim=1)
        return self.fc(x)

In [19]:
# Training function for different models
def train_and_evaluate(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for sentences, labels in train_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation accuracy after each epoch
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for sentences, labels in val_loader:
                sentences, labels = sentences.to(device), labels.to(device)
                outputs = model(sentences)
                preds = outputs.argmax(dim=1).cpu().tolist()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().tolist())

        val_acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}: Validation Accuracy = {val_acc * 100:.2f}%")

Training BiLSTM Model
Epoch 1: Validation Accuracy = 70.36%
Epoch 2: Validation Accuracy = 73.64%
Epoch 3: Validation Accuracy = 75.05%
Epoch 4: Validation Accuracy = 76.74%
Epoch 5: Validation Accuracy = 76.08%
Epoch 6: Validation Accuracy = 77.49%
Epoch 7: Validation Accuracy = 75.98%
Epoch 8: Validation Accuracy = 75.14%
Epoch 9: Validation Accuracy = 74.77%
Epoch 10: Validation Accuracy = 74.39%
Training biGRU Model
Epoch 1: Validation Accuracy = 76.83%
Epoch 2: Validation Accuracy = 76.55%
Epoch 3: Validation Accuracy = 75.98%
Epoch 4: Validation Accuracy = 75.70%
Epoch 5: Validation Accuracy = 75.52%
Epoch 6: Validation Accuracy = 75.42%
Epoch 7: Validation Accuracy = 75.14%
Epoch 8: Validation Accuracy = 75.42%
Epoch 9: Validation Accuracy = 75.52%
Epoch 10: Validation Accuracy = 75.52%
Training CNN Model
Epoch 1: Validation Accuracy = 75.33%
Epoch 2: Validation Accuracy = 75.61%
Epoch 3: Validation Accuracy = 75.14%
Epoch 4: Validation Accuracy = 75.14%
Epoch 5: Validation Accu

In [None]:
# Instantiate models
embedding_dim = 100  # Ensure this matches your embedding matrix dimensions
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)  # Ensure embedding matrix is a tensor
bilstm_model = BiLSTMModel(embedding_matrix)
bigru_model = BiGRUModel(embedding_matrix)
cnn_model = CNNModel(embedding_matrix)

# Prepare loss function and optimizers
criterion = nn.CrossEntropyLoss()
optimizer_bilstm = optim.Adam(bilstm_model.parameters(), lr=0.001)
optimizer_bigru = optim.Adam(bigru_model.parameters(), lr=0.001)
optimizer_cnn = optim.Adam(cnn_model.parameters(), lr=0.001)

# Train and evaluate each model (example for BiLSTM)
print("Training BiLSTM Model")
train_and_evaluate(bilstm_model, train_loader, val_loader, criterion, optimizer_bilstm)

print("Training biGRU Model")
train_and_evaluate(bigru_model, train_loader, val_loader, criterion, optimizer_bigru)

print("Training CNN Model")
train_and_evaluate(cnn_model, train_loader, val_loader, criterion, optimizer_cnn)

In [22]:
# Part 3(e)
# BiLSTM with Attention Model
class BiLSTMAttentionModel(nn.Module):
    def __init__(self, embedding_matrix):
        super(BiLSTMAttentionModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.bilstm = nn.LSTM(embedding_dim, 64, bidirectional=True, batch_first=True)
        self.attention = nn.Linear(64 * 2, 1)  # Attention layer
        self.fc = nn.Linear(64 * 2, 2)  # 2 output classes for sentiment

    def forward(self, x):
        x = self.embedding(x)
        lstm_output, (hidden, _) = self.bilstm(x)  # lstm_output: [batch_size, seq_len, hidden_dim*2]

        # Attention mechanism
        attention_weights = torch.softmax(self.attention(lstm_output), dim=1)  # [batch_size, seq_len, 1]
        weighted_output = torch.sum(lstm_output * attention_weights, dim=1)  # [batch_size, hidden_dim*2]

        # Pass through fully connected layer
        return self.fc(weighted_output)

In [23]:
# Instantiate the model
embedding_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)
attention_model = BiLSTMAttentionModel(embedding_matrix_tensor).to(device)

# Prepare loss function and optimizer with specified configurations
criterion = nn.CrossEntropyLoss()
optimizer_attention = optim.Adagrad(attention_model.parameters(), lr=0.01)

# Updated DataLoader with batch_size=64
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training and evaluation function with specified 100 epochs
def train_and_evaluate_final_model(model, train_loader, val_loader, test_loader, criterion, optimizer, num_epochs=100):
    # Train the model
    for epoch in range(num_epochs):
        model.train()
        for sentences, labels in train_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation after each epoch
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for sentences, labels in val_loader:
                sentences, labels = sentences.to(device), labels.to(device)
                outputs = model(sentences)
                preds = outputs.argmax(dim=1).cpu().tolist()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().tolist())
        val_acc = accuracy_score(all_labels, all_preds)
        print(f"Epoch {epoch+1}: Validation Accuracy = {val_acc * 100:.2f}%")

    # Test set evaluation
    model.eval()
    all_test_preds, all_test_labels = [], []
    with torch.no_grad():
        for sentences, labels in test_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            outputs = model(sentences)
            preds = outputs.argmax(dim=1).cpu().tolist()
            all_test_preds.extend(preds)
            all_test_labels.extend(labels.cpu().tolist())
    test_acc = accuracy_score(all_test_labels, all_test_preds)
    print(f"Test Accuracy with Final Improved Model = {test_acc * 100:.2f}%")
    return test_acc

# Train and evaluate the final model with the specified configurations
train_and_evaluate_final_model(attention_model, train_loader, val_loader, test_loader, criterion, optimizer_attention, num_epochs=100)

  embedding_matrix_tensor = torch.tensor(embedding_matrix, dtype=torch.float32)


Epoch 1: Validation Accuracy = 76.92%
Epoch 2: Validation Accuracy = 75.98%
Epoch 3: Validation Accuracy = 76.64%
Epoch 4: Validation Accuracy = 76.36%
Epoch 5: Validation Accuracy = 75.98%
Epoch 6: Validation Accuracy = 76.08%
Epoch 7: Validation Accuracy = 76.08%
Epoch 8: Validation Accuracy = 75.89%
Epoch 9: Validation Accuracy = 75.89%
Epoch 10: Validation Accuracy = 75.89%
Epoch 11: Validation Accuracy = 75.98%
Epoch 12: Validation Accuracy = 75.98%
Epoch 13: Validation Accuracy = 75.89%
Epoch 14: Validation Accuracy = 75.80%
Epoch 15: Validation Accuracy = 75.42%
Epoch 16: Validation Accuracy = 75.52%
Epoch 17: Validation Accuracy = 75.61%
Epoch 18: Validation Accuracy = 75.52%
Epoch 19: Validation Accuracy = 75.52%
Epoch 20: Validation Accuracy = 74.77%
Epoch 21: Validation Accuracy = 75.52%
Epoch 22: Validation Accuracy = 75.61%
Epoch 23: Validation Accuracy = 75.33%
Epoch 24: Validation Accuracy = 75.05%
Epoch 25: Validation Accuracy = 75.23%
Epoch 26: Validation Accuracy = 74

0.7664165103189493