In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive; the project directory used below lives on it.
drive.mount('/content/drive')

# Run from the project root and put it on the import path so that the
# project's `src` package can be imported by the following cells.
project_path = "/content/drive/MyDrive/PopBERT"
os.chdir(project_path)
sys.path.append(project_path)

Mounted at /content/drive


In [2]:
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import src
from src.bert import training
from src.bert.dataset import PBertDataset
from src.bert.dataset.strategies import MLMin1PopIdeol

In [3]:
# --- Data / runtime -------------------------------------------------------
# Coders passed to `PBertDataset.from_disk(exclude_coders=...)`; empty = none.
EXCLUDE_CODERS: list[str] = []
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- Model / optimisation -------------------------------------------------
# Hugging Face checkpoint id of the pre-trained German BERT (GBert-large).
MODEL = "deepset/gbert-large"
# Batch size of the training DataLoader.
BATCH_SIZE = 8
# Number of passes over the training set.
N_EPOCHS = 3
# Initial learning rate handed to AdamW.
LR = 9e-6
# AdamW weight-decay coefficient.
WEIGHT_DECAY = 0.01

# --- Decision thresholds --------------------------------------------------
# Maps the label column index to the probability above which that label is
# predicted. The evaluation cells report the columns as
# elite, pplcentr, left, right (in that order).
THRESHOLDS = {
    0: 0.415961,
    1: 0.295400,
    2: 0.429109,
    3: 0.302714,
}

This code defines critical hyperparameters and configurations for model training. It uses a German BERT model with carefully tuned parameters for what appears to be a multi-label classification task.

All four thresholds lie below the default of 0.5, so every category is predicted more readily than it would be with the default cut-off. The six-decimal precision indicates that the values were calibrated numerically rather than chosen by hand, and the variation across categories reflects class-specific characteristics: categories 0 and 2 (≈0.42–0.43) demand more confidence before a positive prediction, while categories 1 and 3 (≈0.30) are classified more leniently. This precision suggests the thresholds were optimized separately for each category's distinct requirements.

In [4]:
# Locations of the zipped, labelled CSV splits inside the project.
train_csv = src.PATH / "data/labeled_data/train.csv.zip"
test_csv = src.PATH / "data/labeled_data/test.csv.zip"

# Training split: each split gets its own MLMin1PopIdeol label-strategy
# instance and the shared coder exclusion list.
train = PBertDataset.from_disk(
    path=train_csv,
    exclude_coders=EXCLUDE_CODERS,
    label_strategy=MLMin1PopIdeol(),
)

# Test split, loaded with the same settings as the training split.
test = PBertDataset.from_disk(
    path=test_csv,
    exclude_coders=EXCLUDE_CODERS,
    label_strategy=MLMin1PopIdeol(),
)

In [5]:
# Tokenizer that belongs to the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Batch-assembly function built by the training dataset from the tokenizer;
# the same function is reused for the test loader.
collate_fn = train.create_collate_fn(tokenizer)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
# Evaluation keeps the dataset order and uses a larger batch of 64.
test_loader = DataLoader(
    test,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

In [6]:
# Load the pre-trained encoder with a classification head that has one output
# per label of the training dataset, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=train.num_labels,
)
model = model.to(DEVICE)

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


This code initializes the classification model using the pre-trained GBert-large model. It automatically configures the final classification layer based on the number of labels in the training set and moves the model to the appropriate computing device.

In [None]:
# AdamW over all model parameters (AMSGrad variant switched off).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
    amsgrad=False,
)

# Cosine-annealed learning rate between LR and LR / 10 with T_max=20
# scheduler steps. When the scheduler is stepped is decided inside
# `training.train_epoch` — NOTE(review): confirm per-batch vs. per-epoch.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    eta_min=LR / 10,
    T_max=20,
)

# One training pass followed by one evaluation pass on the test loader per
# epoch; `clip=5` is forwarded to the project's training helper as-is.
for epoch in range(1, N_EPOCHS + 1):
    train_loss = training.train_epoch(
        model, train_loader, optimizer, lr_scheduler, clip=5
    )
    eval_loss, score, _ = training.eval_epoch(model, test_loader)
    print(f"{epoch=} {train_loss=:.4f} {eval_loss=:.4f} {score=:.4f}")

epoch=1 train_loss=288.6422 eval_loss=7.3731 score=0.6822


epoch=2 train_loss=184.7465 eval_loss=6.7474 score=0.7238


epoch=3 train_loss=128.6823 eval_loss=6.8762 score=0.7074


I modified the original code to incorporate Weights & Biases (wandb) for training monitoring and visualization.

In [8]:
# Standard library: timing helpers used by the monitored training loop.
import time
from datetime import datetime, timedelta

# Third party: notebook progress bars and experiment tracking.
from tqdm.notebook import tqdm
import wandb

# Authenticate with Weights & Biases; the cell displays the call's result.
wandb.login()


True

In [9]:
# Train with Weights & Biases monitoring.
#
# The run is used as a context manager so that it is always closed: on normal
# completion it is finished, and if training raises or is interrupted the run
# is still closed instead of being left open.
#
# NOTE(review): `model` is used as it currently exists in the kernel. If the
# plain training cell above was executed in the same session, this loop
# continues from those already fine-tuned weights; re-run the model
# initialisation cell first when a fresh run is intended.
with wandb.init(
    project="bert-populism-classification",
    config={
        "model": MODEL,
        "batch_size": BATCH_SIZE,
        "learning_rate": LR,
        "epochs": N_EPOCHS,
        "weight_decay": WEIGHT_DECAY
    }
):
    # Total number of batches over all epochs, used for the ETA estimate.
    total_steps = N_EPOCHS * len(train_loader)
    current_step = 0
    start_time = time.time()

    # Outer progress bar: one tick per epoch.
    epoch_pbar = tqdm(range(1, N_EPOCHS + 1), desc='Training Progress', position=0)

    # Fresh AdamW optimizer for this run.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=LR,
        amsgrad=False,
        weight_decay=WEIGHT_DECAY,
    )

    # Cosine annealing between LR and LR / 10 with T_max=20 scheduler steps.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=20,
        eta_min=LR / 10,
    )

    for epoch in epoch_pbar:
        # Wall-clock start of this epoch (includes the evaluation pass).
        epoch_start = time.time()

        # Training phase: accumulate the per-batch losses into an epoch total.
        train_loss = 0.0
        model.train()

        # Inner progress bar: one tick per batch, removed when the epoch ends.
        batch_pbar = tqdm(train_loader, desc=f'Epoch {epoch}/{N_EPOCHS}',
                         position=1, leave=False)

        for batch in batch_pbar:
            # Reuse the project's epoch helper on a one-batch "loader" so the
            # progress bar can be updated after every optimisation step.
            train_batch_loss = training.train_epoch(model, [batch], optimizer, lr_scheduler, clip=5)
            train_loss += train_batch_loss

            # Estimate remaining time from the average time per step so far.
            # Written as remaining * elapsed / done so that a zero elapsed
            # time cannot cause a division by zero (current_step is >= 1).
            current_step += 1
            elapsed_time = time.time() - start_time
            remaining_steps = total_steps - current_step
            eta_seconds = remaining_steps * elapsed_time / current_step
            eta = str(timedelta(seconds=int(eta_seconds)))

            # Show the latest batch loss, learning rate and ETA on the bar.
            batch_pbar.set_postfix({
                'loss': f'{train_batch_loss:.4f}',
                'lr': f'{lr_scheduler.get_last_lr()[0]:.2e}',
                'ETA': eta
            })

        # Evaluation phase on the test loader.
        eval_loss, score, _ = training.eval_epoch(model, test_loader)

        # Time spent on training plus evaluation for this epoch.
        epoch_time = time.time() - epoch_start

        # Summarise the epoch on the outer progress bar.
        epoch_pbar.set_postfix({
            'train_loss': f'{train_loss:.4f}',
            'eval_loss': f'{eval_loss:.4f}',
            'f1_score': f'{score:.4f}',
            'epoch_time': f'{epoch_time:.1f}s'
        })

        # Log one record per epoch to wandb.
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "eval_loss": eval_loss,
            "f1_score": score,
            "learning_rate": lr_scheduler.get_last_lr()[0],
            "epoch_time": epoch_time
        })

Training Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1/3:   0%|          | 0/880 [00:00<?, ?it/s]

Epoch 2/3:   0%|          | 0/880 [00:00<?, ?it/s]

Epoch 3/3:   0%|          | 0/880 [00:00<?, ?it/s]

0,1
epoch,▁▅█
epoch_time,█▁▁
eval_loss,█▁▇
f1_score,▁▆█
learning_rate,▁▁▁
train_loss,█▄▁

0,1
epoch,3.0
epoch_time,98.8881
eval_loss,6.97463
f1_score,0.73002
learning_rate,1e-05
train_loss,125.98259


Training loss decreased significantly over the epochs, demonstrating that the model successfully learned from the data.

The F1 score improved across epochs, reaching 0.73 at the end, indicating good classification performance.
However, the evaluation loss slightly increased in the last epoch, which might suggest minor overfitting.



In [12]:
# Push model and tokenizer to the Hugging Face Hub
import os
from getpass import getpass

from huggingface_hub import login

# Never hard-code the access token in the notebook: take it from the HF_TOKEN
# environment variable, or ask for it interactively (input is not echoed and
# therefore does not end up in the saved cell output).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
login(token=hf_token)

# Target repository on the Hub; model weights and tokenizer files go to the
# same repository.
hf_repo_name = "BarryzZ/bert-populism-classification"

model.push_to_hub(hf_repo_name)
tokenizer.push_to_hub(hf_repo_name)

print("Model successfully uploaded to Hugging Face Hub")


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model successfully uploaded to Hugging Face Hub


In [10]:
def apply_thresh(y_proba, thresholds: dict):
    """Binarise selected columns of a probability matrix.

    Args:
        y_proba: 2-D NumPy array indexed as ``[row, column]``. The input is
            not modified; a copy is returned.
        thresholds: Mapping from column index to cut-off value.

    Returns:
        A copy of ``y_proba`` in which every column listed in ``thresholds``
        holds 1 where the value is strictly greater than that column's
        cut-off and 0 otherwise. Columns not listed are left unchanged.
    """
    binarised = y_proba.copy()
    for column, cutoff in thresholds.items():
        above = binarised[:, column] > cutoff
        binarised[:, column] = np.where(above, 1, 0)
    return binarised

This function, apply_thresh, adjusts probabilities in a multi-dimensional array to binary values (0 or 1) based on specified thresholds for each dimension.

In [14]:
# Put the model into evaluation mode explicitly so that dropout is disabled
# no matter which cell ran last (the training helpers may or may not have
# left it in eval mode).
model.eval()

# inference_mode() disables gradient tracking: nothing is recorded for
# backpropagation, which saves memory and speeds up the forward passes.
with torch.inference_mode():
    y_true = []
    y_pred = []
    for batch in test_loader:
        encodings = batch["encodings"].to(DEVICE)
        # **encodings unpacks the tokenizer output into keyword arguments.
        out = model(**encodings)
        # Independent sigmoid per logit: one probability per label.
        preds = torch.sigmoid(out.logits)
        # The labels are only collected for scoring, so they stay on the CPU.
        y_true.extend(batch["labels"].numpy())
        y_pred.extend(preds.cpu().numpy())

# Stack the per-sample rows once and derive both binarisations from them.
y_true = np.array(y_true)
y_proba = np.array(y_pred)
# Uniform 0.5 cut-off for every label.
y_pred_05 = np.where(y_proba > 0.5, 1, 0)
# Label-specific cut-offs taken from THRESHOLDS.
y_pred_thresh = apply_thresh(y_proba, THRESHOLDS)

This code performs predictions using the trained model on the test set, collects the true and predicted values, and then applies thresholds (0.5 or custom) to convert the predicted probabilities into binary class labels.

In [None]:
# Per-class precision / recall / F1 for predictions cut at the default 0.5.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred_05,
        zero_division=0,
        target_names=["elite", "pplcentr", "left", "right"],
    )
)

              precision    recall  f1-score   support

       elite       0.85      0.84      0.85       648
    pplcentr       0.71      0.62      0.66       322
        left       0.71      0.71      0.71       279
       right       0.82      0.48      0.61       155

   micro avg       0.79      0.73      0.76      1404
   macro avg       0.77      0.66      0.71      1404
weighted avg       0.79      0.73      0.75      1404
 samples avg       0.38      0.37      0.36      1404



In [None]:
# Per-class precision / recall / F1 for predictions cut at the label-specific
# thresholds.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred_thresh,
        zero_division=0,
        target_names=["elite", "pplcentr", "left", "right"],
    )
)

              precision    recall  f1-score   support

       elite       0.85      0.86      0.85       648
    pplcentr       0.65      0.71      0.68       322
        left       0.70      0.73      0.71       279
       right       0.76      0.58      0.66       155

   micro avg       0.76      0.77      0.76      1404
   macro avg       0.74      0.72      0.73      1404
weighted avg       0.76      0.77      0.76      1404
 samples avg       0.39      0.39      0.38      1404



Comparison of the default 0.5 threshold with the custom per-class thresholds:
- elite: Slight improvement in recall (0.84 → 0.86) with unchanged precision and F1 (0.85).
- pplcentr: Precision drops (0.71 → 0.65), but recall increases (0.62 → 0.71), improving F1 (0.66 → 0.68).
- left: Recall improves (0.71 → 0.73) while precision dips slightly (0.71 → 0.70); F1 is unchanged (0.71).
- right: Precision drops (0.82 → 0.76), but recall improves (0.48 → 0.58), leading to a higher F1 (0.61 → 0.66).

The custom thresholds trade some precision for recall: recall rises for every class, and F1 improves for pplcentr and right while staying unchanged for elite and left. The default 0.5 threshold in y_pred_05 misses some true positives that the lower custom thresholds in y_pred_thresh recover, which lifts the macro-average F1 from 0.71 to 0.73.



The result above is the original one, and the one below is the result I reproduced.

In [15]:
# Reproduced run: report for predictions cut at the default 0.5.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred_05,
        zero_division=0,
        target_names=["elite", "pplcentr", "left", "right"],
    )
)

              precision    recall  f1-score   support

       elite       0.81      0.89      0.85       648
    pplcentr       0.70      0.69      0.70       322
        left       0.69      0.77      0.73       279
       right       0.79      0.55      0.65       155

   micro avg       0.76      0.78      0.77      1404
   macro avg       0.75      0.72      0.73      1404
weighted avg       0.76      0.78      0.77      1404
 samples avg       0.40      0.39      0.39      1404



In [16]:
# Reproduced run: report for predictions cut at the label-specific thresholds.
print(
    classification_report(
        y_true=y_true,
        y_pred=y_pred_thresh,
        zero_division=0,
        target_names=["elite", "pplcentr", "left", "right"],
    )
)

              precision    recall  f1-score   support

       elite       0.80      0.89      0.84       648
    pplcentr       0.66      0.75      0.70       322
        left       0.68      0.79      0.73       279
       right       0.68      0.68      0.68       155

   micro avg       0.73      0.82      0.77      1404
   macro avg       0.71      0.78      0.74      1404
weighted avg       0.73      0.82      0.77      1404
 samples avg       0.40      0.41      0.40      1404



In [None]:
# Persist the fine-tuned model together with its tokenizer, so that the
# directory can later be reloaded on its own with `from_pretrained` for both
# the model and the tokenizer.
output_dir = src.PATH / "results/popbert_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)