# Political Bias Classification with BERT Variants

This Jupyter notebook walks through training four BERT-based classifiers on a news-bias dataset. It is GPU-ready and uses PyTorch & Hugging Face Transformers.

## 1. Install and Import Dependencies

In [None]:
!pip install transformers datasets torch scikit-learn tqdm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
import pickle

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## 2. Load and Prepare the Data

In [4]:
# Path to your CSV
csv_path = "/content/political.csv"

df = pd.read_csv(csv_path)[:2500]

In [3]:
print(df.label.value_counts())

label
0    7723
2    5554
1    4008
Name: count, dtype: int64


In [5]:
# Train/test split
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df.label, random_state=42)

## 3. Create a PyTorch Dataset

In [9]:
class NewsBiasDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            'input_ids': encoding.input_ids.squeeze(),
            'attention_mask': encoding.attention_mask.squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## 4. Define Training and Evaluation Loops

In [10]:
class BertANNClassifier(nn.Module):
    def __init__(self, model_name, num_labels, d1=256, d2=128, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # enable gradient checkpointing to reduce memory
        self.bert.gradient_checkpointing_enable()
        hidden_size = self.bert.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, d1)
        self.fc2 = nn.Linear(d1, d2)
        self.out = nn.Linear(d2, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls = outputs.last_hidden_state[:, 0, :]
        x = self.dropout(cls)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        return self.out(x)

In [11]:
def train_epoch(model, loader, optimizer, scheduler, criterion, scaler):
    model.train()
    losses, preds, true = [], [], []
    for batch in tqdm(loader, desc='Training'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        with autocast():
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        losses.append(loss.item())
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true.extend(labels.cpu().numpy())

    return sum(losses)/len(losses), accuracy_score(true, preds), f1_score(true, preds, average='weighted')


def eval_epoch(model, loader, criterion):
    model.eval()
    losses, preds, true = [], [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)

            losses.append(loss.item())
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true.extend(labels.cpu().numpy())
    return sum(losses)/len(losses), accuracy_score(true, preds), f1_score(true, preds, average='weighted')

## 5. Train BERT Variants

In [12]:
model_names = [
    'bert-base-uncased',
    'distilbert-base-uncased',
    'bert-base-cased'
]
num_labels = df.label.nunique()
results = {}

# Helper to create DataLoader
def make_loader(ds, batch_size):
     return DataLoader(
         ds, batch_size=batch_size,
         shuffle=(batch_size>1), num_workers=4, pin_memory=True
     )

for name in model_names:
    print(f"=== Training {name} + ANN ===")
    tokenizer = BertTokenizer.from_pretrained(name)
    train_ds = NewsBiasDataset(train_df.text.values, train_df.label.values, tokenizer)
    val_ds   = NewsBiasDataset(val_df.text.values,   val_df.label.values,   tokenizer)
    train_loader = make_loader(train_ds, batch_size=8)
    val_loader   = make_loader(val_ds,   batch_size=16)

    model = BertANNClassifier(name, num_labels).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * 10  # allow up to 10 epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    scaler = GradScaler()

    best_f1 = 0.0
    patience, patience_counter = 2, 0  # early stopping after 2 epochs without improvement
    max_epochs = 10
    for epoch in range(1, max_epochs + 1):
        t_loss, t_acc, t_f1 = train_epoch(model, train_loader, optimizer, scheduler, criterion, scaler)
        v_loss, v_acc, v_f1 = eval_epoch(model, val_loader, criterion)
        print(f"Epoch {epoch}: Train F1={t_f1:.4f}  Val F1={v_f1:.4f}")

        # Early stopping logic
        if v_f1 > best_f1:
            best_f1 = v_f1
            patience_counter = 0
            # Save best checkpoints
            model.bert.save_pretrained(f"./ckpt/{name}_bert_best")
            torch.save(model.state_dict(), f"./ckpt/{name}_ann_best.pt")
            # Also save full model object as a pickle
            with open(f"./ckpt/{name}_full_model.pkl", "wb") as f:
                pickle.dump(model, f)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Stopping early at epoch {epoch} (no improvement in {patience} epochs)")
                break

    results[name] = best_f1



=== Training bert-base-uncased + ANN ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  scaler = GradScaler()


Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 1: Train F1=0.4904  Val F1=0.6640




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 2: Train F1=0.7138  Val F1=0.6420




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 3: Train F1=0.8234  Val F1=0.7031




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 4: Train F1=0.9068  Val F1=0.7075




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 5: Train F1=0.9478  Val F1=0.7040




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 6: Train F1=0.9724  Val F1=0.7046
Stopping early at epoch 6 (no improvement in 2 epochs)
=== Training distilbert-base-uncased + ANN ===


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.l

Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 1: Train F1=0.2954  Val F1=0.2772




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 2: Train F1=0.2976  Val F1=0.2772




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 3: Train F1=0.2834  Val F1=0.2772
Stopping early at epoch 3 (no improvement in 2 epochs)
=== Training bert-base-cased + ANN ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

  scaler = GradScaler()


Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 1: Train F1=0.5335  Val F1=0.6314




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 2: Train F1=0.7339  Val F1=0.7012




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 3: Train F1=0.8270  Val F1=0.7610




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 4: Train F1=0.8899  Val F1=0.7529




Training:   0%|          | 0/282 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 5: Train F1=0.9458  Val F1=0.7113
Stopping early at epoch 5 (no improvement in 2 epochs)


In [13]:
print("Best validation F1 scores:")

for name, f1 in results.items():
    print(f"{name}: {f1:.4f}")

Best validation F1 scores:
bert-base-uncased: 0.7075
distilbert-base-uncased: 0.2772
bert-base-cased: 0.7610
