In [1]:
import os
import sys

from google.colab import drive

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount('/content/drive')

# Make the project folder the current working directory.
project_path = "/content/drive/MyDrive/PopBERT"
os.chdir(project_path)

# Put the project folder on the import path (the notebook imports `src` later).
# The membership check keeps the cell idempotent: re-running it does not add
# duplicate entries to sys.path.
if project_path not in sys.path:
    sys.path.append(project_path)

Mounted at /content/drive


In [2]:
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import src
from src.bert import training
from src.bert.dataset import PBertDataset
from src.bert.dataset.strategies import MLMin1PopIdeol

In [3]:
# Coders to exclude when loading the labeled data (none listed here).
EXCLUDE_CODERS: list[str] = []

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# --- Model and optimisation hyperparameters ---
# NOTE(review): no random seed is set in this cell; runs may not be exactly
# reproducible — confirm whether seeding happens elsewhere.
MODEL = "deepset/gbert-large"  # name of the pre-trained German BERT checkpoint
BATCH_SIZE = 8                 # batch size of the training DataLoader
N_EPOCHS = 3                   # number of training epochs
LR = 9e-6                      # learning rate
WEIGHT_DECAY = 0.01            # weight-decay coefficient

# Decision thresholds, one per category (keys 0-3).
THRESHOLDS = {
    0: 0.415961,
    1: 0.295400,
    2: 0.429109,
    3: 0.302714,
}

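This cell defines the hyperparameters and configuration used for model training: the pre-trained German BERT checkpoint (`deepset/gbert-large`), the training batch size, the number of epochs, the learning rate, the weight decay, and one decision threshold per category. The separate per-category thresholds suggest a multi-label classification task, in which each label is decided independently.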

All four thresholds are below the default of 0.5 and differ from one category to the next, which suggests they were tuned separately for each class rather than chosen by hand; the six-decimal precision is consistent with values produced by an automated threshold search. Categories 0 and 2 use higher thresholds (about 0.42 and 0.43), so the model must be more confident before assigning those labels, while categories 1 and 3 use lower thresholds (about 0.30), so those labels are assigned more readily. The thresholds therefore reflect class-specific characteristics rather than a single global cut-off.

In [4]:
def _load_split(csv_path):
    """Load one labeled split from disk as a ``PBertDataset``.

    Each call builds a fresh ``MLMin1PopIdeol`` label strategy and passes the
    notebook-wide ``EXCLUDE_CODERS`` list through to ``from_disk``.
    """
    return PBertDataset.from_disk(
        path=csv_path,
        label_strategy=MLMin1PopIdeol(),
        exclude_coders=EXCLUDE_CODERS,
    )


# Training and test splits, loaded with identical settings.
train = _load_split(src.PATH / "data/labeled_data/train.csv.zip")
test = _load_split(src.PATH / "data/labeled_data/test.csv.zip")

In [5]:
# Tokenizer that belongs to the pre-trained checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Batch-collation function built by the training dataset from that tokenizer;
# the same function is reused for the test loader below.
collate_fn = train.create_collate_fn(tokenizer)

# Training batches: BATCH_SIZE samples each, reshuffled by the loader.
train_loader = DataLoader(
    dataset=train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Test batches: fixed order, larger batch of 64.
test_loader = DataLoader(
    dataset=test,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

Tokenizer loaded successfully


In [6]:
# Load the pre-trained checkpoint with a sequence-classification head whose
# output size is taken from the training dataset's `num_labels`.
# NOTE(review): `problem_type` is not set here — confirm that the training code
# in `src.bert.training` applies the intended loss for this task.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=train.num_labels,
)

# Move the model's parameters to the selected device.
model = model.to(DEVICE)

Model initialized with classifier layers. Training required for optimal performance.


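This cell initializes the classification model from the pre-trained `deepset/gbert-large` checkpoint. The size of the final classification layer is set from the number of labels in the training dataset (`train.num_labels`), and the model is then moved to `DEVICE` (the GPU if CUDA is available, otherwise the CPU). The classification layer is newly initialized, so the model must be fine-tuned before its predictions are meaningful.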