In [1]:
import random
import time
from pprint import pprint

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

SEED = 42
MAX_LEN = 256
BATCH_SIZE = 32
MODEL_NAME = "distilbert-base-uncased"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

print("Device: ", DEVICE)
print("Seed: ", SEED)
print("Max Length: ", MAX_LEN)
print("Batch Size: ", BATCH_SIZE)
print("Tokenizer: ", MODEL_NAME)
print("Torch Version: ", torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


Device:  cuda
Seed:  42
Max Length:  256
Batch Size:  32
Tokenizer:  distilbert-base-uncased
Torch Version:  2.9.1+cu128


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

CSV_PATH = "/home/manmath/Desktop/MyProjects/BiLSTM-IMDB-Sentiment-Classsfier/Data/IMDB Dataset.csv"
TEXT_COL = "review"
LABEL_COL = "sentiment"

df = pd.read_csv(CSV_PATH)

print("Columns:", df.columns)
print("Initial size: ", len(df))

df = df[[TEXT_COL, LABEL_COL]].dropna()

df[LABEL_COL] = df[LABEL_COL].map({
    "positive":1,
    "negative":0
})

df[TEXT_COL] = df[TEXT_COL].astype(str)
df[LABEL_COL] = df[LABEL_COL].astype(int)

print("Label Distribution")
print(df[LABEL_COL].value_counts())

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df[LABEL_COL]
)

train_texts = train_df[TEXT_COL].tolist()
train_labels = train_df[LABEL_COL].tolist()

test_texts = test_df[TEXT_COL].tolist()
test_labels = test_df[LABEL_COL].tolist()

print("\nTrain samples:", len(train_texts))
print("Test samples:", len(test_texts))

print("\nSample Example: ")
print({
    "text": train_texts[0][:200] + "...",
    "label": train_labels[0]
})

Columns: Index(['review', 'sentiment'], dtype='object')
Initial size:  50000
Label Distribution
sentiment
1    25000
0    25000
Name: count, dtype: int64

Train samples: 40000
Test samples: 10000

Sample Example: 


In [3]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import torch

MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 32

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Tokenizer vocab size: ", len(tokenizer.get_vocab()))

class IMDBHFDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length: int = MAX_LEN):
        self.texts = list(texts)
        self.labels = [int(i) for i in labels]
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        txt = str(self.texts[idx])
        label = self.labels[idx]

        encoded = self.tokenizer(
            txt, 
            truncation = True,
            max_length = self.max_length,
            padding = False,
            return_attention_mask = True,
            return_tensors = None
        )

        encoded["labels"] = label
        return encoded

collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
    return_tensors="pt"
)

train_dataset = IMDBHFDataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
test_dataset = IMDBHFDataset(test_texts, test_labels, tokenizer, max_length=MAX_LEN)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collator
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collator
)

batch = next(iter(train_loader))
print("Batch Keys:", batch.keys())
print("input_ids shape:", batch["input_ids"].shape)
print("attention_mask shape:", batch["attention_mask"].shape)
print("labels shape:", batch["labels"].shape)
print("First 8 Labels: ", batch["labels"][:8])

Tokenizer vocab size:  30522
Batch Keys: KeysView({'input_ids': tensor([[ 101, 2568, 2017,  ..., 2007, 2784,  102],
        [ 101, 1045, 2228,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2026, 6898,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 1, 1])})
input_ids shape: torch.Size([32, 256])
attention_mask shape: torch.Size([32, 256])
labels shape: torch.Size([32])
First 8 Labels:  tensor([0, 0, 1, 1, 1, 0, 1, 0])


In [4]:
import torch
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=200, hidden_dim=128, num_layers=1, pad_idx=0, dropout=0.3):
        super().__init__()

        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=pad_idx
        )

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(hidden_dim*2, 1)
    
    def forward(self, input_ids, attention_mask=None):
        emb = self.embedding(input_ids)

        outputs, (h_n, c_n) = self.lstm(emb)

        forward_final = h_n[-2]
        backward_final = h_n[-1]

        hidden = torch.cat([forward_final, backward_final], dim=1)
        x = self.dropout(hidden)

        logits = self.fc(x).squeeze(-1)

        return logits

vocab_size = tokenizer.vocab_size
pad_id = tokenizer.pad_token_id or 0

model = BiLSTMClassifier(
    vocab_size=vocab_size,
    embed_dim=128,
    hidden_dim=256,
    num_layers=1,
    pad_idx=pad_id,
    dropout=0.3,
).to(DEVICE)

batch = next(iter(train_loader))
input_ids = batch["input_ids"].to(DEVICE)
attention_mask = batch["attention_mask"].to(DEVICE)

with torch.no_grad():
    logits = model(input_ids, attention_mask)

print("Batch input_ids shape: ", input_ids.shape)
print("Logits Shape: ", logits.shape)


Batch input_ids shape:  torch.Size([32, 256])
Logits Shape:  torch.Size([32])


In [5]:
import torch
import torch.nn as nn
import time

EPOCHS = 5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

best_val_loss = float("inf")
best_path = "bilstm_imdb_best.pt"

print(f"Training on device: {DEVICE}")
print(f"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}")

for epoch in range(1, EPOCHS + 1):
    t0 = time.time()
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # -------- TRAIN LOOP --------
    for step, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].float().to(DEVICE)   # shape [B]

        logits = model(input_ids, attention_mask)     # [B]
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * labels.size(0)
        preds = (torch.sigmoid(logits) >= 0.5).long()
        correct += (preds == labels.long()).sum().item()
        total += labels.size(0)

        if step % 100 == 0:
            avg_loss = running_loss / total
            acc = correct / total
            print(f"  step {step:04d}  avg_loss={avg_loss:.4f}  acc={acc:.4f}")

    train_loss = running_loss / total
    train_acc = correct / total

    # -------- VALIDATION LOOP --------
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].float().to(DEVICE)

            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)

            val_loss += loss.item() * labels.size(0)
            preds = (torch.sigmoid(logits) >= 0.5).long()
            val_correct += (preds == labels.long()).sum().item()
            val_total += labels.size(0)

    val_loss = val_loss / val_total
    val_acc = val_correct / val_total
    dt = time.time() - t0

    print(f"Epoch {epoch:02d}  TrainLoss={train_loss:.4f}  TrainAcc={train_acc:.4f}  "
          f"ValLoss={val_loss:.4f}  ValAcc={val_acc:.4f}  Time={dt:.1f}s")

    # save best
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_path)
        print(f"  -> Saved best model to {best_path}")

Training on device: cuda
Train batches: 1250, Test batches: 313
  step 0100  avg_loss=0.6913  acc=0.5294
  step 0200  avg_loss=0.6875  acc=0.5475
  step 0300  avg_loss=0.6822  acc=0.5602
  step 0400  avg_loss=0.6769  acc=0.5754
  step 0500  avg_loss=0.6728  acc=0.5839
  step 0600  avg_loss=0.6668  acc=0.5919
  step 0700  avg_loss=0.6617  acc=0.6017
  step 0800  avg_loss=0.6597  acc=0.6042
  step 0900  avg_loss=0.6630  acc=0.5978
  step 1000  avg_loss=0.6643  acc=0.5954
  step 1100  avg_loss=0.6630  acc=0.5971
  step 1200  avg_loss=0.6571  acc=0.6043
Epoch 01  TrainLoss=0.6510  TrainAcc=0.6105  ValLoss=0.5281  ValAcc=0.7541  Time=64.5s
  -> Saved best model to bilstm_imdb_best.pt
  step 0100  avg_loss=0.5401  acc=0.7178
  step 0200  avg_loss=0.5447  acc=0.7166
  step 0300  avg_loss=0.5418  acc=0.7189
  step 0400  avg_loss=0.5222  acc=0.7340
  step 0500  avg_loss=0.5034  acc=0.7514
  step 0600  avg_loss=0.4853  acc=0.7646
  step 0700  avg_loss=0.4711  acc=0.7744
  step 0800  avg_loss=0.4