In [1]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128
BATCH_SIZE = 16
NUM_EPOCHS = 3

# 1. Data Preparation
dataset = load_dataset("imdb")
tokenizer_bert = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, for_bert=True):
        self.data = data
        self.tokenizer = tokenizer
        self.for_bert = for_bert

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]

        if self.for_bert:
            encoding = self.tokenizer(
                text,
                max_length=MAX_LEN,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            return {
                "input_ids": encoding["input_ids"].flatten(),
                "attention_mask": encoding["attention_mask"].flatten(),
                "label": torch.tensor(label, dtype=torch.long)
            }
        else:
            encoding = self.tokenizer(
                text,
                max_length=MAX_LEN,
                truncation=True,
                add_special_tokens=False,
                return_tensors="pt"
            )
            return encoding["input_ids"].flatten(), torch.tensor(label)

In [4]:
class BertSentiment(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("google-bert/bert-base-uncased")
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 2)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]
        return self.classifier(pooled)

In [5]:
class GRUSentiment(nn.Module):
    def __init__(self, vocab_size=30522, embedding_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=128,
            num_layers=2,
            bidirectional=True,
            dropout=0.3,
            batch_first=True
        )
        self.classifier = nn.Sequential(
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        return self.classifier(output[:, -1, :])

In [6]:
def train_model(model, train_loader, val_loader, model_type="bert"):
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=2e-5) if model_type == "bert" else \
        torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(NUM_EPOCHS):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()

            if model_type == "bert":
                input_ids = batch["input_ids"].to(DEVICE)
                attention_mask = batch["attention_mask"].to(DEVICE)
                outputs = model(input_ids, attention_mask)
            else:
                inputs = batch[0].to(DEVICE)
                outputs = model(inputs)

            labels = batch["label"].to(DEVICE) if model_type == "bert" else batch[1].to(DEVICE)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        val_preds = []
        val_labels = []
        with torch.no_grad():
            for batch in val_loader:
                if model_type == "bert":
                    input_ids = batch["input_ids"].to(DEVICE)
                    attention_mask = batch["attention_mask"].to(DEVICE)
                    outputs = model(input_ids, attention_mask)
                else:
                    inputs = batch[0].to(DEVICE)
                    outputs = model(inputs)

                labels = batch["label"].to(DEVICE) if model_type == "bert" else batch[1].to(DEVICE)
                val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(val_labels, val_preds)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Val Acc: {acc:.4f}")

In [7]:
def evaluate_model(model, data_loader, model_type="bert"):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            try:
                if model_type == "bert":
                    inputs = {
                        "input_ids": batch["input_ids"].to(DEVICE),
                        "attention_mask": batch["attention_mask"].to(DEVICE)
                    }
                    labels = batch["label"].to(DEVICE)
                    outputs = model(**inputs)
                else:
                    inputs = batch[0].to(DEVICE)
                    labels = batch[1].to(DEVICE)
                    outputs = model(inputs)

                preds = torch.argmax(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

            except Exception as e:
                print(f"Error during evaluation: {str(e)}")
                continue
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"\n{'='*50}")
    print("Final Evaluation Metrics")
    print(f"{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds))
    print("\nConfusion Matrix:")
    print(confusion_matrix(all_labels, all_preds))

    return accuracy


In [8]:
if __name__ == "__main__":
    # BERT Training
    bert_train = SentimentDataset(dataset["train"], tokenizer_bert)
    bert_val = SentimentDataset(dataset["test"], tokenizer_bert)

    bert_loader = DataLoader(bert_train, batch_size=BATCH_SIZE, shuffle=True)
    bert_val_loader = DataLoader(bert_val, batch_size=BATCH_SIZE)

    bert_model = BertSentiment()
    print("Training BERT model:")
    train_model(bert_model, bert_loader, bert_val_loader)
    bert_accuracy = evaluate_model(bert_model, bert_val_loader)

    # GRU Training
    tokenizer_gru = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
    gru_train = SentimentDataset(dataset["train"], tokenizer_gru, for_bert=False)
    gru_val = SentimentDataset(dataset["test"], tokenizer_gru, for_bert=False)

    def collate_fn(batch):
        try:
            inputs = [item[0] for item in batch]
            labels = torch.stack([item[1] for item in batch])
            inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
            return inputs[:, :MAX_LEN], labels
        except Exception as e:
            print(f"Collate error: {str(e)}")
            actual_batch_size = len(batch)
            return torch.zeros((actual_batch_size, MAX_LEN), torch.zeros(actual_batch_size))

    gru_loader = DataLoader(gru_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    gru_val_loader = DataLoader(gru_val, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    gru_model = GRUSentiment()
    print("\nTraining GRU model:")
    train_model(gru_model, gru_loader, gru_val_loader, model_type="gru")
    gru_accuracy = evaluate_model(gru_model, gru_val_loader, model_type="gru")

    print("\n\nModel Comparison:")
    print(f"BERT Accuracy: {bert_accuracy:.4f}")
    print(f"GRU Accuracy: {gru_accuracy:.4f}")
    print(f"Accuracy Difference: {abs(bert_accuracy - gru_accuracy):.4f}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Training BERT model:




Epoch 1/3 | Val Acc: 0.8856
Epoch 2/3 | Val Acc: 0.8892
Epoch 3/3 | Val Acc: 0.8750

Final Evaluation Metrics
Accuracy: 0.8750

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.81      0.87     12500
           1       0.83      0.94      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.87     25000
weighted avg       0.88      0.88      0.87     25000


Confusion Matrix:
[[10152  2348]
 [  777 11723]]

Training GRU model:
Epoch 1/3 | Val Acc: 0.7942
Epoch 2/3 | Val Acc: 0.8278
Epoch 3/3 | Val Acc: 0.8216

Final Evaluation Metrics
Accuracy: 0.8216

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.82     12500
           1       0.81      0.84      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82     