In [5]:
# %pip install ipywidgets
# %pip install datasets
# (do not `%pip install sklearn`: that PyPI name is deprecated; scikit-learn below is the actual package)
# %pip install numpy
# %pip install torch==2.0.1
# %pip install gensim
# %pip install scikit-learn

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import gensim.downloader as api
from sklearn.metrics import accuracy_score
import numpy as np

In [7]:
# Part 0: Dataset Preparation
# Fetch the Rotten Tomatoes movie review dataset and pull out its three splits.
dataset = load_dataset("rotten_tomatoes")
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

In [8]:
# Data preparation function
def prepare_data(data):
    """Split a dataset into parallel lists of texts and labels.

    Args:
        data: Iterable of mapping-like records, each with a 'text' and a
            'label' entry.

    Returns:
        A ``(sentences, labels)`` tuple of two lists of equal length, in the
        same order as ``data``.

    Raises:
        KeyError: If a record lacks the 'text' or 'label' entry.
    """
    sentences, labels = [], []
    # Single pass: each record is read once, and one-shot iterables
    # (generators, streaming datasets) are handled correctly.
    for item in data:
        sentences.append(item['text'])
        labels.append(item['label'])
    return sentences, labels

# Materialise (sentences, labels) for the train, validation and test splits.
train_sentences, train_labels = prepare_data(data=train_data)
val_sentences, val_labels = prepare_data(data=val_data)
test_sentences, test_labels = prepare_data(data=test_data)

In [None]:
# Part 1: Preparing Word Embeddings
# Load pre-trained word vectors (e.g., GloVe or Word2Vec)
embedding_dim = 100  # You can choose other dimensions if available
# Derive the model name from embedding_dim so the two cannot drift apart.
glove_vectors = api.load(f"glove-wiki-gigaword-{embedding_dim}")
# Fail early if the downloaded vectors do not have the dimension the rest of
# the notebook is going to assume.
if glove_vectors.vector_size != embedding_dim:
    raise ValueError(
        f"Loaded vectors have dimension {glove_vectors.vector_size}, "
        f"expected {embedding_dim}"
    )

# Build vocabulary from training data
def build_vocab(sentences, vectors):
    """Collect the whitespace-token vocabulary and its out-of-vocabulary words.

    Args:
        sentences: Iterable of strings; each is tokenised with ``str.split()``.
        vectors: Any container supporting ``word in vectors`` (e.g. gensim
            ``KeyedVectors``); membership decides whether a word counts as
            in-vocabulary.

    Returns:
        ``(vocab, oov_words)`` where ``vocab`` is the set of distinct tokens
        and ``oov_words`` is the sorted list of tokens missing from
        ``vectors``.
    """
    vocab = {word for sentence in sentences for word in sentence.split()}
    # Sort so the OOV list does not depend on set iteration order, which can
    # differ between interpreter runs because of string hash randomisation.
    oov_words = sorted(word for word in vocab if word not in vectors)
    return vocab, oov_words

vocab, oov_words = build_vocab(sentences=train_sentences, vectors=glove_vectors)

# Q1(a): number of distinct tokens found in the training sentences
print("Vocabulary Size:", len(vocab))

# Q1(b): number of those tokens that are missing from the pre-trained vectors
print("Number of OOV words:", len(oov_words))

# Q1(c) Handling OOV words by using random embeddings for missing words
def get_embedding_matrix(vocab, vectors, embedding_dim):
    """Build the embedding matrix and the word-to-row lookup for a vocabulary.

    Row 0 is reserved (all zeros) and is never assigned to a vocabulary word,
    so index 0 can safely stand for padding / unknown tokens instead of
    aliasing a real word. Words are numbered from 1 in sorted order, which
    makes the mapping identical from run to run.

    Args:
        vocab: Iterable of words (duplicates are ignored).
        vectors: Mapping-like object supporting ``word in vectors`` and
            ``vectors[word]`` (e.g. gensim ``KeyedVectors``).
        embedding_dim: Length of every embedding vector.

    Returns:
        ``(embedding_matrix, word_to_idx)``: a float32 tensor of shape
        ``(number_of_distinct_words + 1, embedding_dim)`` and a dict mapping
        each word to its row index (always >= 1).
    """
    ordered_words = sorted(set(vocab))
    embedding_matrix = np.zeros((len(ordered_words) + 1, embedding_dim))
    word_to_idx = {}
    for idx, word in enumerate(ordered_words, start=1):
        word_to_idx[word] = idx
        if word in vectors:  # gensim uses `in` to check if a word exists in the vocabulary
            embedding_matrix[idx] = vectors[word]
        else:
            # OOV word: no pre-trained vector available, so draw a random one.
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32), word_to_idx

embedding_matrix, word_to_idx = get_embedding_matrix(
    vocab=vocab,
    vectors=glove_vectors,
    embedding_dim=embedding_dim,
)


AttributeError: 'KeyedVectors' object has no attribute 'stoi'

In [None]:
# Part 2: Model Training & Evaluation (RNN Model)
# Define a simple RNN model
class RNNModel(nn.Module):
    """Sentence classifier: frozen pre-trained embeddings -> RNN -> mean pool -> linear.

    Args:
        embedding_matrix: 2-D float tensor of pre-trained embeddings; its
            second dimension sets the RNN input size.
        hidden_size: Width of the RNN hidden state (default 64).
        num_classes: Number of output logits (default 2, for sentiment).
    """

    def __init__(self, embedding_matrix, hidden_size=64, num_classes=2):
        super().__init__()
        # from_pretrained reuses the storage of the tensor it is given. Work on
        # a copy so these frozen embeddings stay fixed even if another model
        # fine-tunes the shared matrix in place.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.clone(), freeze=True)
        # Read the input size from the matrix itself rather than from a
        # notebook-level global, so the two can never disagree.
        self.rnn = nn.RNN(embedding_matrix.size(1), hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of logits, shape ``(batch, num_classes)``.
        """
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        # Using mean pooling over the sequence for sentence representation
        sentence_representation = torch.mean(output, dim=1)
        return self.fc(sentence_representation)

# Convert sentences to index sequences


def sentence_to_idx(sentence, word_to_idx, unk_idx=0):
    """Map a sentence to a list of vocabulary indices.

    Args:
        sentence: String, tokenised with ``str.split()``.
        word_to_idx: Dict from word to integer index.
        unk_idx: Index emitted for words missing from ``word_to_idx``.
            Defaults to 0; pass another value if 0 is assigned to a real word
            in ``word_to_idx``.

    Returns:
        List of integer indices, one per whitespace-separated token.
    """
    return [word_to_idx.get(word, unk_idx) for word in sentence.split()]


train_indices = [sentence_to_idx(sentence, word_to_idx)
                 for sentence in train_sentences]
val_indices = [sentence_to_idx(sentence, word_to_idx)
               for sentence in val_sentences]
test_indices = [sentence_to_idx(sentence, word_to_idx)
                for sentence in test_sentences]

# DataLoader Preparation
batch_size = 32
PAD_IDX = 0  # same index sentence_to_idx falls back to for unknown words


def pad_collate(batch):
    """Collate ``(index_list, label)`` pairs into two long tensors.

    The sentences have different lengths, which the DataLoader's default
    collate function cannot stack, so each batch is right-padded with
    ``PAD_IDX`` up to its longest sentence.

    Args:
        batch: List of ``(index_list, label)`` pairs.

    Returns:
        ``(padded, labels)``: a ``(batch, max_len)`` long tensor of token
        indices and a ``(batch,)`` long tensor of labels.
    """
    sequences, labels = zip(*batch)
    # Keep at least one column so a batch of empty sentences still yields a
    # tensor the model can pool over.
    max_len = max(1, max(len(seq) for seq in sequences))
    padded = torch.full((len(sequences), max_len), PAD_IDX, dtype=torch.long)
    for row, seq in enumerate(sequences):
        if seq:
            padded[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded, torch.tensor(labels, dtype=torch.long)


train_loader = DataLoader(
    list(zip(train_indices, train_labels)), batch_size=batch_size, shuffle=True,
    collate_fn=pad_collate)
val_loader = DataLoader(list(zip(val_indices, val_labels)),
                        batch_size=batch_size, shuffle=False,
                        collate_fn=pad_collate)
test_loader = DataLoader(list(zip(test_indices, test_labels)),
                         batch_size=batch_size, shuffle=False,
                         collate_fn=pad_collate)

# Model initialization, loss, and optimizer
model = RNNModel(embedding_matrix)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the RNN model


def _as_index_tensor(sequences):
    """Convert a batch of token indices, in any form a loader may yield, to a long tensor."""
    if isinstance(sequences, torch.Tensor):
        return sequences.long()
    if len(sequences) > 0 and isinstance(sequences[0], torch.Tensor):
        # The default collate turns equal-length index lists into one tensor
        # per time step; stack them back into (batch, seq_len).
        return torch.stack(list(sequences), dim=1).long()
    return torch.tensor(sequences, dtype=torch.long)


def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` and report validation accuracy after every epoch.

    Args:
        model: ``nn.Module`` mapping a ``(batch, seq_len)`` long tensor of
            token indices to ``(batch, num_classes)`` logits.
        train_loader: Iterable of ``(sentences, labels)`` training batches.
        val_loader: Iterable of ``(sentences, labels)`` validation batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one validation accuracy (a fraction in [0, 1]) per epoch.
    """
    val_accuracies = []
    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: the validation pass at the end
        # of the previous epoch left the model in eval mode.
        model.train()
        train_loss, num_batches = 0.0, 0
        for sentences, labels in train_loader:
            sentences = _as_index_tensor(sentences)
            labels = torch.as_tensor(labels, dtype=torch.long)
            optimizer.zero_grad()
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            num_batches += 1

        # Validation accuracy after each epoch
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for sentences, labels in val_loader:
                sentences = _as_index_tensor(sentences)
                labels = torch.as_tensor(labels, dtype=torch.long)
                preds = model(sentences).argmax(dim=1)
                # Count matches on tensors directly instead of collecting
                # per-example 0-d tensors into Python lists.
                correct += (preds == labels).sum().item()
                total += labels.numel()

        # Guard the averages so an empty loader does not divide by zero.
        avg_loss = train_loss / num_batches if num_batches else 0.0
        val_acc = correct / total if total else 0.0
        print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}")
        print(f"Epoch {epoch+1}: Validation Accuracy = {val_acc * 100:.2f}%")
        val_accuracies.append(val_acc)
    return val_accuracies


# Run the training loop for the RNN model with the default number of epochs.
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
)

In [None]:
# Part 3: Enhancement (Using BiLSTM, BiGRU, and CNN)
class BiLSTMModel(nn.Module):
    """Sentence classifier: trainable embeddings -> BiLSTM -> mean pool -> linear.

    Args:
        embedding_matrix: 2-D float tensor of pre-trained embeddings used to
            initialise the (fine-tuned) embedding layer; its second dimension
            sets the LSTM input size.
        hidden_size: Hidden width of each LSTM direction (default 64).
        num_classes: Number of output logits (default 2).
    """

    def __init__(self, embedding_matrix, hidden_size=64, num_classes=2):
        super().__init__()
        # from_pretrained reuses the storage of the tensor it is given, and
        # freeze=False makes that storage trainable. Clone first so that
        # fine-tuning does not modify the caller's matrix in place.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix.clone(), freeze=False)
        # Read the input size from the matrix itself rather than from a
        # notebook-level global, so the two can never disagree.
        self.bilstm = nn.LSTM(embedding_matrix.size(1), hidden_size,
                              bidirectional=True, batch_first=True)
        # Forward and backward outputs are concatenated, hence the factor 2.
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of logits, shape ``(batch, num_classes)``.
        """
        embedded = self.embedding(x)
        output, _ = self.bilstm(embedded)
        # Mean-pool the per-step outputs into one vector per sentence.
        sentence_representation = torch.mean(output, dim=1)
        return self.fc(sentence_representation)

# Instantiate and train the BiLSTM model similarly

# Implement biGRU and CNN with similar structure and report accuracy scores