In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path
import torch
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
import os
import urllib.request
import time
import sys
import tarfile

def reporthook(count, block_size, total_size):
    global start_time
    if count == 0:
        start_time = time.time()
    else:
        duration = time.time() - start_time
        progress_size = int(count * block_size)
        percent = count * block_size * 100 / total_size

        speed = int(progress_size / (1024 * duration)) if duration else 0
        sys.stdout.write(
            f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB "
            f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed"
        )
        sys.stdout.flush()

def download_and_extract_dataset(dataset_url, target_file, directory):
    if not os.path.exists(target_file):
        if os.path.exists(target_file):
            os.remove(target_file)
        urllib.request.urlretrieve(dataset_url, target_file, reporthook)
        print("\nExtracting dataset ...")
        with tarfile.open(target_file, 'r:gz') as tar:
            tar.extractall()
    else:
        print(f"Directory '{directory}' already exists. Skipping download.")

def load_dataset_to_dataframe(basepath='aclImdb', labels={"pos": 1, "neg":0}):
    data_frames = []
    for subset in ("test", "train"):
        for label in ("pos", "neg"):
            path = os.path.join(basepath, subset, label)
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                    data_frames.append(pd.DataFrame({"text": [infile.read()], "label": [labels[label]]}))
    #concat all df chunks together
    df = pd.concat(data_frames, ignore_index=True)
    df = df.sample(frac=1, random_state=123).reset_index(drop=True) #shuffle df
    return df

def partition_and_save(df, sizes=(35000, 5000, 10000)):
    #shuffle df
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    
    #id where to split data
    train_end = sizes[0]
    val_end = sizes[0] + sizes[1]

    train = df_shuffled.iloc[:train_end]
    val = df_shuffled.iloc[train_end:val_end]
    test = df_shuffled.iloc[val_end:]

    train.to_csv("train_imdb.csv", index=False)
    val.to_csv("validation_imdb.csv", index=False)
    test.to_csv("test_imdb.csv", index=False)


In [3]:
dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
print("Downloading ...")
download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb")
print("Creating dataframes ...")
df = load_dataset_to_dataframe()
print("Partitioning and saving dataframes ...")
partition_and_save(df)

Downloading ...
100% | 80.23 MB | 6268.00 MB/s | 13.11 sec elapsed
Extracting dataset ...
Creating dataframes ...
Partitioning and saving dataframes ...


In [4]:
class IMBDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256, use_attention_mask=False):
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length if max_length is not None else self._longest_encoded_length(tokenizer)
        self.pad_token_id = pad_token_id
        self.use_attention_mask = use_attention_mask

        self.encoded_texts = [
            tokenizer.encode(text, truncation=True, max_length=self.max_length)
            for text in self.data["text"]
        ]
        self.encoded_texts = [
            et + [pad_token_id] * (self.max_length - len(et))
            for et in self.encoded_texts
        ]

        if self.use_attention_mask:
            self.attention_masks = [
                self._create_attention_mask(et)
                for et in self.encoded_texts
            ]
        else:
            self.attention_masks = None

    def _create_attention_mask(self, encoded_text):
        return [1 if token_id != self.pad_token_id else 0 for token_id in encoded_text]
    
    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["label"]

        if self.use_attention_mask:
            attention_mask = self.attention_masks[index]
        else:
            attention_mask = torch.ones(self.max_length, dtype=torch.long)
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )
    
    def __len__(self):
        return len(self.data)
    
    def _longest_encoded_length(self, tokenizer):
        max_length = 0
        for text in self.data["text"]:
            encoded_length = len(tokenizer.encoded(text))
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length
    
def calc_loss_batch(inputs_batch, attention_mask_batch, target_batch, model, device):
    attention_mask_batch = attention_mask_batch.to(device)
    input_batch, target_batch = inputs_batch.to(device), target_batch.to(device)

    logits = model(input_batch, attention_mask=attention_mask_batch).logits
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, attention_mask_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, attention_mask_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

@torch.no_grad()
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, attention_mask_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            attention_mask_batch = attention_mask_batch.to(device)
            input_batch, target_batch = input_batch.to(device), target_batch.to(device)
            #logits = model(input_batch)[:, -1, :]
            logits = model(input_batch, attention_mask=attention_mask_batch).logits
            predicted_labels = torch.argmax(logits, dim=1)
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()
        else:
            break
    return correct_predictions / num_examples

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

def train_classifier_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, max_steps=None):
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()

        for input_batch, attention_mask_batch, target_batch in train_loader:
            optimizer.zero_grad() #Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, attention_mask_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            examples_seen += input_batch.shape[0] # New: track examples instead of tokens
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
                
            if max_steps is not None and global_step > max_steps:
                break

        #Calculate acc after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validattion accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)
    
    return train_losses, val_losses, train_accs, val_accs, examples_seen

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model.classifier = torch.nn.Linear(in_features=768, out_features=2)
for param in model.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = True
for param in model.bert.pooler.dense.parameters():
    param.requires_grad = True
for param in model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
train_dataset = IMBDataset(
    csv_file="train_imdb.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=True
)
val_dataset = IMBDataset(
    csv_file="validation_imdb.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=True
)
test_dataset = IMBDataset(
    csv_file="test_imdb.csv",
    max_length=256,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.pad_token_id,
    use_attention_mask=True
)

In [8]:
num_workers = 0
batch_size = 32

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
    model, train_loader, val_loader, optimizer, device, num_epochs=1, eval_freq=50, eval_iter=20, max_steps=None
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.704, Val loss 0.703
Ep 1 (Step 000050): Train loss 0.431, Val loss 0.405
Ep 1 (Step 000100): Train loss 0.316, Val loss 0.314
Ep 1 (Step 000150): Train loss 0.286, Val loss 0.296
Ep 1 (Step 000200): Train loss 0.332, Val loss 0.304
Ep 1 (Step 000250): Train loss 0.257, Val loss 0.285
Ep 1 (Step 000300): Train loss 0.268, Val loss 0.283
Ep 1 (Step 000350): Train loss 0.267, Val loss 0.268
Ep 1 (Step 000400): Train loss 0.291, Val loss 0.261
Ep 1 (Step 000450): Train loss 0.244, Val loss 0.281
Ep 1 (Step 000500): Train loss 0.238, Val loss 0.257
Ep 1 (Step 000550): Train loss 0.303, Val loss 0.273
Ep 1 (Step 000600): Train loss 0.218, Val loss 0.258
Ep 1 (Step 000650): Train loss 0.264, Val loss 0.291
Ep 1 (Step 000700): Train loss 0.240, Val loss 0.262
Ep 1 (Step 000750): Train loss 0.260, Val loss 0.269
Ep 1 (Step 000800): Train loss 0.227, Val loss 0.255
Ep 1 (Step 000850): Train loss 0.242, Val loss 0.246
Ep 1 (Step 000900): Train loss 0.212, Val loss

In [10]:
print("\nEvaluating on the full datasets ...\n")

train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")


Evaluating on the full datasets ...

Training accuracy: 90.81%
Validation accuracy: 90.62%
Test accuracy: 90.28%


In [11]:
torch.save(model.state_dict(), "imdb_bert_classifier.pth")

In [12]:
model_state_dict = torch.load("imdb_bert_classifier.pth", map_location=device)
model.load_state_dict(model_state_dict)

  model_state_dict = torch.load("imdb_bert_classifier.pth", map_location=device)


<All keys matched successfully>