In [2]:
import os
import random
from pathlib import Path
from pprint import pprint
import numpy as np
import torch

# Probe for the optional HuggingFace `datasets` package and remember whether
# the import worked instead of aborting the cell.
try:
    from datasets import load_dataset
except Exception:
    HAS_DATASETS = False
else:
    HAS_DATASETS = True

# Seed Python, NumPy and PyTorch (CPU, plus every CUDA device when present)
# with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Project folders, all anchored at the directory the kernel was started in.
ROOT = Path.cwd()
DATA_DIR = ROOT.joinpath("data", "imdb")
MODELS_DIR = ROOT.joinpath("models", "ann_imdb")
SCRIPTS_DIR = ROOT.joinpath("scripts")

# Create the folders (and any missing parents); a no-op when they already exist.
for d in (DATA_DIR, MODELS_DIR, SCRIPTS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Environment summary.
print("Working directory:", ROOT)
print("Data directory   :", DATA_DIR)
print("Models directory :", MODELS_DIR)
print("Scripts directory:", SCRIPTS_DIR)
print("Has HuggingFace datasets library?:", HAS_DATASETS)
print("PyTorch version  :", torch.__version__)
print("Device           :", DEVICE)

Working directory: /home/manmath/Desktop/MyProjects/End-to-End-ANN-for-IMDB-Sentiment/Notebook
Data directory   : /home/manmath/Desktop/MyProjects/End-to-End-ANN-for-IMDB-Sentiment/Notebook/data/imdb
Models directory : /home/manmath/Desktop/MyProjects/End-to-End-ANN-for-IMDB-Sentiment/Notebook/models/ann_imdb
Scripts directory: /home/manmath/Desktop/MyProjects/End-to-End-ANN-for-IMDB-Sentiment/Notebook/scripts
Has HuggingFace datasets library?: False
PyTorch version  : 2.5.1+cu121
Device           : cuda


In [4]:
import sys
import subprocess
from pathlib import Path
import random

# Upper bound on the number of reviews kept per sentiment class for training.
MAX_PER_LABEL = 1000


# Import `datasets`; when that fails, pip-install it with the interpreter that
# runs this kernel and import again.
try:
    from datasets import load_dataset
except Exception:
    print("datasets not found — installing 'datasets' via pip inside the environment. This may take 20-60s.")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "--quiet", "datasets"]
    )
    from datasets import load_dataset
    print("datasets installed.")
else:
    print("datasets already present.")

print(f"\nLoading IMDB dataset (max {MAX_PER_LABEL} pos + {MAX_PER_LABEL} neg for train)...")
ds = load_dataset("imdb")
train, test = ds["train"], ds["test"]

def subsample_by_label(hf_dataset, max_per_label=MAX_PER_LABEL, seed=42):
    """Draw a shuffled subsample with at most ``max_per_label`` texts per class.

    Args:
        hf_dataset: Iterable of mappings that each provide a ``"text"`` and a
            ``"label"`` entry. A label equal to ``1`` is treated as positive;
            every other label is treated as negative.
        max_per_label: Maximum number of texts kept for each of the two classes.
        seed: Seed for the private random generator that drives all shuffling.

    Returns:
        ``(texts, labels)`` — two equally long lists in matching, shuffled
        order, where each label is ``1`` (positive) or ``0`` (negative).
        Both lists are empty when ``hf_dataset`` yields nothing.
    """
    pos, neg = [], []
    for ex in hf_dataset:
        if ex["label"] == 1:
            pos.append(ex["text"])
        else:
            neg.append(ex["text"])

    # A dedicated generator yields the same shuffles as seeding the module-level
    # RNG would, without silently resetting the global random state that other
    # code may rely on.
    rng = random.Random(seed)
    rng.shuffle(pos)
    rng.shuffle(neg)

    pos = pos[:max_per_label]
    neg = neg[:max_per_label]

    combined = [(text, 1) for text in pos] + [(text, 0) for text in neg]
    rng.shuffle(combined)

    # Built with comprehensions rather than ``zip(*combined)`` so an empty
    # selection returns two empty lists instead of raising on unpacking.
    texts = [text for text, _ in combined]
    labels = [label for _, label in combined]
    return texts, labels


import json


def _show_examples(texts, labels, wanted_label, limit=5, width=200):
    """Print the first ``limit`` texts labelled ``wanted_label``, clipped to ``width`` chars."""
    shown = 0
    for text, label in zip(texts, labels):
        if shown >= limit:
            break
        if label == wanted_label:
            # Newlines are flattened so every example stays on one output line.
            print("-", text[:width].replace("\n", " "))
            shown += 1


def _write_jsonl(path, texts, labels):
    """Write one ``{"text": ..., "label": ...}`` JSON object per line to ``path``."""
    with open(path, "w", encoding="utf8") as fw:
        for txt, lab in zip(texts, labels):
            fw.write(json.dumps({"text": txt, "label": int(lab)}) + "\n")


# Train keeps up to MAX_PER_LABEL reviews per class; test is capped at 500 per class.
train_texts, train_labels = subsample_by_label(train)
test_texts, test_labels = subsample_by_label(test, min(500, MAX_PER_LABEL))

print("Train size:", len(train_texts), " Test size:", len(test_texts))

print("\n--- Sample Positive examples (5) ---")
_show_examples(train_texts, train_labels, wanted_label=1)

print("\n--- Sample Negative examples (5) ---")
_show_examples(train_texts, train_labels, wanted_label=0)

# Relative path: resolved against the kernel's current working directory.
DATA_DIR = Path("data/imdb")
DATA_DIR.mkdir(parents=True, exist_ok=True)

_write_jsonl(DATA_DIR / "train_small.jsonl", train_texts, train_labels)
_write_jsonl(DATA_DIR / "test_small.jsonl", test_texts, test_labels)

print("\nSaved small train/test to:", DATA_DIR)

datasets already present.

Loading IMDB dataset (max 1000 pos + 1000 neg for train)...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train size: 2000  Test size: 1000

--- Sample Positive examples (5) ---
- Ok, so it's an adult movie. But it really is very tastefully done. It's obvious that the producers spent a lot of time and money into making a classy sort of movie. I was pleasantly surprised at just 
- "Silverlake Life" is a documentary and it was plain and straightforward. Actually, it was more like a home movie, and if you want dramatic illuminations, see something else. And it's by no means a tea
- This early sci-fi masterwork by Herbert George Wells with music by Arthur Bliss is a powerful piece of film-making. Adapted from Wells' somewhat different work by the author, it presents a look at the
- Of the three remakes on W. Somerset Maughan's novel, this one is the best one, and not particularly because what John Cromwell brought to the film. The film is worth a look because of the break throug
- if you are like me then you will love this great coming of age teen movie.i think it is up there with mischief/boo

In [9]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch

# Name of the pretrained tokenizer handed to AutoTokenizer.from_pretrained.
TOKENIZER_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Token length used by IMDBDataset below as `max_length` (with
# padding="max_length" and truncation=True).
MAX_LEN = 200

class IMDBDataset(Dataset):
    """Map-style dataset returning fixed-length ``(input_ids, attention_mask, label)`` tensors.

    Every text is tokenized on access with ``padding="max_length"`` and
    ``truncation=True``, so all examples share the same sequence length.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of labels aligned with ``texts``.
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up at access time.
        max_length: Optional token length. When omitted, the module-level
            ``MAX_LEN`` is looked up at access time.
    """

    # The special methods must be spelled with double underscores. With single
    # underscores Python never calls them, so the class could not be
    # constructed with arguments, measured with len(), or indexed.
    def __init__(self, texts, labels, tokenizer=None, max_length=None):
        self.texts = texts
        self.labels = labels
        self._tokenizer = tokenizer
        self._max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``(input_ids, attention_mask, label)``."""
        text = self.texts[idx]

        # Fall back to the notebook-level defaults only when nothing was injected.
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        max_length = self._max_length if self._max_length is not None else MAX_LEN

        encoded = tok(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # return_tensors="pt" on a single text yields a leading batch axis of
        # size 1; squeeze(0) drops it so a DataLoader can stack examples.
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)
        label = torch.tensor(self.labels[idx])

        return input_ids, attention_mask, label

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding
import torch

MAX_LEN = 200


class IMDBHFDataset(Dataset):
    """Dataset whose items are unpadded tokenizer outputs plus a ``"labels"`` entry.

    Padding is deliberately left off here (``padding=False``) so it can be
    applied per batch by a collator.
    """

    def __init__(self, texts, labels, tokenizer, max_length=MAX_LEN):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from the text sequence."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` (truncated to ``max_length``) and attach its integer label."""
        review = self.texts[idx]
        tokenize_options = dict(
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_attention_mask=True,
            return_tensors=None,
        )
        features = self.tokenizer(review, **tokenize_options)
        features["labels"] = int(self.labels[idx])
        return features

# Wrap the subsampled texts/labels; tokenization happens lazily per item.
train_dataset_hf = IMDBHFDataset(train_texts, train_labels, tokenizer, max_length=MAX_LEN)
test_dataset_hf  = IMDBHFDataset(test_texts, test_labels, tokenizer, max_length=MAX_LEN)

# Collator configured with padding="longest" and PyTorch tensor output.
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors="pt")

BATCH_SIZE = 32
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset_hf, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
test_loader  = DataLoader(test_dataset_hf, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)

# Sanity check: pull one training batch and show its keys, tensor shapes and
# the first few labels.
batch = next(iter(train_loader))
print("Batch keys:", list(batch.keys()))
print(" input_ids shape:", batch["input_ids"].shape)
print(" attention_mask shape:", batch["attention_mask"].shape)
print(" labels shape:", batch["labels"].shape)
print(" sample labels (first 8):", batch["labels"][:8].tolist())

Batch keys: ['input_ids', 'attention_mask', 'labels']
 input_ids shape: torch.Size([32, 200])
 attention_mask shape: torch.Size([32, 200])
 labels shape: torch.Size([32])
 sample labels (first 8): [1, 0, 0, 0, 0, 1, 0, 0]


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm

class SimpleTextANN(nn.Module):
    """
    Bag-of-embeddings binary classifier.

    Token embeddings are averaged over the positions marked by the attention
    mask, passed through one hidden ReLU layer with dropout, and projected to
    a single raw logit per example.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, pad_id=0, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """
        Args:
            input_ids: token ids, [batch, seq_len]
            attention_mask: [batch, seq_len], 1 for real tokens and 0 for padding
        Returns:
            Raw logits, [batch] (one per example, no sigmoid applied).
        """
        token_vecs = self.embedding(input_ids)

        # Broadcastable 0/1 weights in the embedding dtype: [batch, seq_len, 1].
        keep = attention_mask.unsqueeze(-1).to(token_vecs.dtype)

        # Clamp the token count to at least 1 so a fully masked row divides
        # by 1 instead of 0.
        token_count = torch.clamp(keep.sum(dim=1), min=1.0)
        pooled = (token_vecs * keep).sum(dim=1) / token_count

        hidden = self.dropout(F.relu(self.fc1(pooled)))
        return self.fc2(hidden).squeeze(-1)

def train_epoch(model, dataloader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``dataloader`` and report epoch metrics.

    Each batch must expose ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"``; all three are moved to ``device`` and the labels are cast
    to float before being handed to ``loss_fn``.

    Returns:
        ``(avg_loss, accuracy)``. The loss is weighted by batch size
        (presumably ``loss_fn`` returns a per-batch mean — confirm against the
        loss actually passed in); accuracy thresholds ``sigmoid(logits)`` at 0.5.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for batch in tqdm(dataloader, desc="train", leave=False):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device).float()
        batch_size = ids.size(0)

        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(ids, mask)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * batch_size
        predicted = (torch.sigmoid(logits) >= 0.5).long()
        n_correct += (predicted == targets.long()).sum().item()
        n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen

def eval_epoch(model, dataloader, loss_fn, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()``. Batches must expose ``"input_ids"``,
    ``"attention_mask"`` and ``"labels"``.

    Returns:
        ``(avg_loss, accuracy)``, computed the same way as during training:
        batch-size-weighted loss and a 0.5 threshold on ``sigmoid(logits)``.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="eval", leave=False):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device).float()
            batch_size = ids.size(0)

            logits = model(ids, mask)
            batch_loss = loss_fn(logits, targets)

            loss_sum += batch_loss.item() * batch_size
            predicted = (torch.sigmoid(logits) >= 0.5).long()
            n_correct += (predicted == targets.long()).sum().item()
            n_seen += batch_size

    return loss_sum / n_seen, n_correct / n_seen

from pathlib import Path

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# Size the embedding table from the tokenizer. The fallback covers tokenizer
# objects that expose get_vocab_size() instead of a vocab_size attribute.
vocab_size = tokenizer.vocab_size if hasattr(tokenizer, "vocab_size") else tokenizer.get_vocab_size()
# Use id 0 for padding when the tokenizer defines no pad token.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

model = SimpleTextANN(vocab_size=vocab_size, embed_dim=128, hidden_dim=128, pad_id=pad_id, dropout=0.3).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
# The model emits raw logits, so the sigmoid is folded into the loss.
loss_fn = nn.BCEWithLogitsLoss()

EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, loss_fn, DEVICE)
    # The "val" figures are measured on test_loader.
    val_loss, val_acc = eval_epoch(model, test_loader, loss_fn, DEVICE)
    print(f"Epoch {epoch:02d}  train_loss={train_loss:.4f} train_acc={train_acc:.4f}  val_loss={val_loss:.4f} val_acc={val_acc:.4f}")

MODEL_PATH = "models/ann_imdb_simple.pt"
# Create the target folder first so saving does not depend on an earlier cell
# having created "models/" in the same working directory.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), MODEL_PATH)
print(f"Saved model -> {MODEL_PATH}")

Using device: cuda


train:   0%|          | 0/63 [00:00<?, ?it/s]

eval:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 01  train_loss=0.6934 train_acc=0.5110  val_loss=0.6918 val_acc=0.5130


train:   0%|          | 0/63 [00:00<?, ?it/s]

eval:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 02  train_loss=0.6904 train_acc=0.5455  val_loss=0.6895 val_acc=0.5790


train:   0%|          | 0/63 [00:00<?, ?it/s]

eval:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 03  train_loss=0.6871 train_acc=0.6005  val_loss=0.6866 val_acc=0.6030
Saved model -> models/ann_imdb_simple.pt
