To do:
- Have a look at learning rate and gradient norm clipping which I need to read up on.
    - Setting learning rate to 1e-4 from the "Embedding sweep" section of the mmBERT paper
    - Keeping gradient norm clipping to the default which caps it at 1.0

- Hyperparameter tuning (Alpha, learning rate, batch size so on - not sure how to figure this out)
    - There is precedence for no hyperparameter tuning from the author of the OG NLI model that DEBATE is based on = Due to computational restrains and the points from this paper, no hyperparameter tuning was performed in this case. The model tuning in itself is also not the primary focus in this paper, but simply serves as a tool for the actual inquiry into blame in the Danish Parliament



In [None]:
%pip install -r "requirements_bert.txt"

In [1]:
import torch
import transformers
import bitsandbytes
import accelerate
import datasets
import numpy as np
import pandas as pd
import keras
import json
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from sklearn.model_selection import train_test_split
from datasets import Dataset
from keras.losses import binary_crossentropy
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score, precision_score

  from .autonotebook import tqdm as notebook_tqdm
2025-10-28 11:50:49.876407: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-28 11:50:49.928281: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-28 11:50:50.915157: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
model_name = "jhu-clsp/mmBERT-base"

quantization_config = BitsAndBytesConfig(
                                        load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",  # Fine-tuning all linear (classification, attention... layers)
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,416,096 || all params: 310,947,874 || trainable%: 1.0986


In [None]:
training_args = TrainingArguments(
    report_to='wandb',
    output_dir='./test_tune_results',
    optim="paged_adamw_8bit",
    learning_rate=1e-4, # Learning rate copied from mmBERT paper on embedding sweep of LR (1e-4) as they found this to perform best
    num_train_epochs=3,
    per_device_train_batch_size=128, # Batching at 256 to balance generalization and efficient training
    gradient_accumulation_steps=1,  # Gradient of 1 as full batch fits in memory, accumulation then only slows
    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    bf16=True,  # Enable mixed precision
    fp16=False,
    dataloader_pin_memory=True,
    dataloader_num_workers=8,
    remove_unused_columns=True, # Avoiding manual handling of residual text columns
    max_grad_norm=1.0,
    disable_tqdm=False,
)

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], 
    padding="max_length",
    truncation=True,
    max_length=512, # Padding to 512 to massively cut down on computation compared to base 8,192 tokens. 
    )

In [6]:
def weighted_bincrossentropy(true, pred, weight_zero = 99.0, weight_one = 1):
    """
    Calculates weighted binary cross entropy. The weights are fixed to represent class imbalance in the dataset.
        
    For example if there are 10x as many positive classes as negative classes,
        if you adjust weight_zero = 1.0, weight_one = 0.1, then false positives 
        will be penalized 10 times as much as false negatives.

    """
  
    # calculate the binary cross entropy
    bin_crossentropy = binary_crossentropy(true, pred)
    
    # apply the weights
    weights = true * weight_one + (1. - true) * weight_zero
    #weights /= (weight_one + weight_zero) # Normalizing to be more consistent with regular BCE for comparison 
    weighted_bin_crossentropy = weights * bin_crossentropy 

    return np.mean(weighted_bin_crossentropy)

In [None]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #From logits to probabilities
    probs_2d = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    probs = probs_2d[:, 1]  # positive class extraction
    
    weigthted_bce = weighted_bincrossentropy(labels, probs)
    keras_bce = binary_crossentropy(labels, probs)
    keras_bce = float(np.mean(keras_bce.numpy()))  # Converting from keras eagertensor to float value
    
    # Wrapping all metrics to floats for json serialization during model eval
    return {
        'keras_BCE': keras_bce,
        'weighted BCE': weigthted_bce,
        'recall': float(recall_score(labels, probs.round())),
        'precision': float(average_precision_score(labels, probs)),
        'accuracy': float(accuracy_score(labels, probs.round())), # Need rounding for these two computations (integer required)
        'f1': float(f1_score(labels, probs.round(), average='macro')) # macro f1 is better for imbalanced dataset
    }

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Sigmoid for binary classification (BCE setup)
    if predictions.shape[1] == 1:
        probs = 1 / (1 + np.exp(-predictions)).flatten()
        print("This is the if statement")
    else:
        probs = np.exp(predictions[:, 1]) / np.exp(predictions).sum(axis=1)
        print("This is the else statement")
    
    weighted_bce = weighted_bincrossentropy(labels, probs)
    keras_bce = binary_crossentropy(labels, probs)
    keras_bce = float(np.mean(keras_bce.numpy()))
    
    preds = probs.round()
    return {
        'keras_BCE': keras_bce,
        'weighted_BCE': weighted_bce,
        'recall': float(recall_score(labels, preds)),
        'precision': float(precision_score(labels, preds)),
        'accuracy': float(accuracy_score(labels, preds)),
        'f1': float(f1_score(labels, preds, average='macro'))
    }


In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # For CrossEntropyLoss with 2 classes, apply softmax
    probs = np.exp(predictions) / np.exp(predictions).sum(axis=-1, keepdims=True)
    # Get probability of positive class
    positive_probs = probs[:, 1]
    
    # Use threshold for binary predictions
    preds = (positive_probs > 0.5).astype(int)
    
    return {
        'recall': float(recall_score(labels, preds)),
        'precision': float(precision_score(labels, preds)),
        'accuracy': float(accuracy_score(labels, preds)),
        'f1': float(f1_score(labels, preds)),
        'f1_weighted': float(f1_score(labels, preds, average='weighted'))
    }

In [15]:
# Claudes shot at trainer
train_labels = test_dataframe['label'].tolist()
class_counts = Counter(train_labels)
total = sum(class_counts.values())
weights = [total/(class_counts[0]*2), total/(class_counts[1]*2)]  # Normalized properly
class_weights = torch.tensor(weights, dtype=torch.float32)

#define custom trainer that uses weigted loss
import torch.nn as nn

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Define weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [16]:
# Claudes training args
training_args = TrainingArguments(
    report_to='wandb',
    output_dir='./test_tune_results',
    optim="paged_adamw_8bit",
    learning_rate=3e-5,  # Lower for LoRA
    num_train_epochs=5,  # More epochs with early stopping
    per_device_train_batch_size=32,  # Smaller batches
    gradient_accumulation_steps=4,  # Effective batch of 128
    eval_strategy="steps",
    eval_steps=50,  # Evaluate more frequently
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,  # CRITICAL: Load best checkpoint
    metric_for_best_model="f1",
    greater_is_better=True,
    bf16=True,  # Enable mixed precision
    fp16=False,
    dataloader_pin_memory=True,
    dataloader_num_workers=8,
    remove_unused_columns=True, # Avoiding manual handling of residual text columns
    max_grad_norm=1.0,
    disable_tqdm=False,
)


    

In [10]:
# Custom trainer class (weigthed)
from collections import Counter

labels = test_dataframe['label'].tolist()
class_counts = Counter(labels)
total = sum(class_counts.values())

# Higher weight = more emphasis
weights = [total/class_counts[0], total/class_counts[1]]
class_weights = torch.tensor(weights, dtype=torch.float)

#define custom trainer that uses weigted loss
import torch.nn as nn

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Define weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [8]:
val_dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/Model_data/validation_set.json")

val_dataframe = val_dataframe[['text', 'label']]

val_dataset = Dataset.from_pandas(val_dataframe)

tokenized_val = val_dataset.map(tokenize_function, batched=True, num_proc=16)

Map (num_proc=16): 100%|██████████| 258/258 [00:00<00:00, 496.55 examples/s]


In [9]:
test_dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/cleaned_training_data.json")

test_dataframe = test_dataframe[['text', 'label']]

test_dataframe = test_dataframe[0:5000]

test_dataframe['labels'] = test_dataframe['label']

test_dataframe = test_dataframe.sample(frac=1).reset_index(drop=True)

test_dataset = Dataset.from_pandas(test_dataframe)

tokenized_test = test_dataset.map(tokenize_function, batched=True, num_proc=16)

Map (num_proc=16): 100%|██████████| 5000/5000 [00:00<00:00, 8518.89 examples/s]


In [None]:
dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/preproc_data_for_tuning_final.json")

dataframe = dataframe[['text', 'label']]

dataset = Dataset.from_pandas(dataframe)

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=16)

In [17]:
'''
Look into learning rates, model is currently overfitting quite drastically ("small" test-set)
Normalizing weigthed BCE or no?
Look into regularization, dropout and early stopping to avoid overfitting
'''

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_test,#dataset,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)


trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Recall,Precision,Accuracy,F1,F1 Weighted
50,No log,0.473964,0.852273,0.735294,0.844961,0.789474,0.847344
100,No log,0.572771,0.897727,0.705357,0.837209,0.79,0.840795
150,No log,0.615699,0.897727,0.724771,0.848837,0.80203,0.851919
200,No log,0.613369,0.875,0.712963,0.837209,0.785714,0.840428


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=200, training_loss=0.3469268417358398, metrics={'train_runtime': 399.474, 'train_samples_per_second': 62.582, 'train_steps_per_second': 0.501, 'total_flos': 8781302323200000.0, 'train_loss': 0.3469268417358398, 'epoch': 5.0})

In [None]:
# Load the model from the specific checkpoint
best_model = AutoModelForSequenceClassification.from_pretrained("./full_tune_results/checkpoint-3032")



# Save permanently to a new folder
best_model.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/best_model_epoch2")
tokenizer.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/best_model_epoch2")

In [None]:
from peft import PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 1️⃣ Load the base model
base_model_name = "jhu-clsp/mmBERT-base"  # or whatever your base model is
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)

# 2️⃣ Load the LoRA adapter from the 2nd-epoch checkpoint
adapter_checkpoint = "./full_tune_results/checkpoint-3032"
lora_model = PeftModel.from_pretrained(base_model, adapter_checkpoint)

# 3️⃣ Merge LoRA weights into the base model
merged_model = lora_model.merge_and_unload()

# 4️⃣ Save the merged model as a standalone Hugging Face model
save_dir = "/work/RuneEgeskovTrust#9638/Bachelor/merged_model_epoch2"
merged_model.save_pretrained(save_dir)

'''
# 5️⃣ Save tokenizer too
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
'''


In [None]:
trainable_params = [p for p in merged_model.parameters() if p.requires_grad]
print(f"Trainable parameters after merge: {len(trainable_params)}")


In [None]:
tokenizer.save_pretrained(save_dir)


In [None]:
FINE_TUNED_MODEL_NAME = "mmBlameBERT-pol-DA"

#merged_model = model.merge_and_unload()    # PEFT: incorporates LoRA into base weights
merged_dir = f"/work/RuneEgeskovTrust#9638/Bachelor/{FINE_TUNED_MODEL_NAME}-final-tune"
merged_model.save_pretrained(merged_dir)
#tokenizer.save_pretrained(merged_dir)
print("✓ Merged model saved to:", merged_dir)

In [None]:
trainer.save_model("/work/RuneEgeskovTrust#9638/Bachelor/LORA-adapters-for-final-tune")

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

In [None]:

with open("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/Final_tune_eval_result.txt", "w") as f:
    f.write(str(eval_results))

In [None]:
def preprocess_json(input_path, output_path=None):
    """
    Preprocesses a JSON file by filtering out entries based on the 'text' key.
    
    Criteria for deletion:
      - 'text' is missing or empty
      - 'text' length is <= 3
      - 'text' contains '(' or ')'
    
    Parameters:
        input_path (str): Path to the input JSON file.
        output_path (str, optional): If provided, saves the filtered JSON here.
    
    Returns:
        list: The filtered list of JSON entries.
    """
    # Load JSON file
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter entries
    filtered_data = [
        entry for entry in data
        if 'text' in entry
        and entry['text']
        and len(entry['text']) > 3
        and '(' not in entry['text']
        and ')' not in entry['text']
    ]

    # Optionally save to a new file
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, ensure_ascii=False, indent=4)

    return filtered_data

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
preprocess_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/cleaned_training_data_3_4_5_temps.json",
"/work/RuneEgeskovTrust#9638/Bachelor/training_data/preproc_data_for_tuning_final.json")