To do:
- Have a look at learning rate and gradient norm clipping which I need to read up on.
    - Setting learning rate to 1e-4 from the "Embedding sweep" section of the mmBERT paper
    - Keeping gradient norm clipping to the default which caps it at 1.0

- Hyperparameter tuning (Alpha, learning rate, batch size so on - not sure how to figure this out)
    - There is precedence for no hyperparameter tuning from the author of the OG NLI model that DEBATE is based on = Due to computational restrains and the points from this paper, no hyperparameter tuning was performed in this case. The model tuning in itself is also not the primary focus in this paper, but simply serves as a tool for the actual inquiry into blame in the Danish Parliament



In [None]:
%pip install -r "requirements_bert.txt"

In [1]:
import torch
import transformers
import bitsandbytes
import accelerate
import datasets
import numpy as np
import pandas as pd
import keras
import json
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from sklearn.model_selection import train_test_split
from datasets import Dataset
from keras.losses import binary_crossentropy
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm
2025-10-29 11:04:05.965235: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-29 11:04:06.020558: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-29 11:04:07.202576: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
model_name = "jhu-clsp/mmBERT-base"

quantization_config = BitsAndBytesConfig(
                                        load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",  # Fine-tuning all linear (classification, attention... layers)
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 3,416,096 || all params: 310,947,874 || trainable%: 1.0986


In [4]:
training_args = TrainingArguments(
    report_to='wandb',
    output_dir='./test_tune_results',
    optim="paged_adamw_8bit",
    learning_rate=1e-4, # Learning rate copied from mmBERT paper on embedding sweep of LR (1e-4) as they found this to perform best
    num_train_epochs=3,
    per_device_train_batch_size=16, # Batching at 256 to balance generalization and efficient training
    gradient_accumulation_steps=16,  # Gradient of 1 as full batch fits in memory, accumulation then only slows
    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    bf16=True,  # Enable mixed precision
    fp16=False,
    dataloader_pin_memory=True,
    dataloader_num_workers=8,
    remove_unused_columns=True, # Avoiding manual handling of residual text columns
    max_grad_norm=1.0,
    disable_tqdm=False,
)

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], 
    padding="max_length", 
    truncation=True,
    max_length=512, # Padding to 512 to massively cut down on computation compared to base 8,192 tokens. 
    )

In [6]:
def weighted_bincrossentropy(true, pred, weight_zero = 1, weight_one = 1):
    """
    Calculates weighted binary cross entropy. The weights are fixed to represent class imbalance in the dataset.
        
    For example if there are 10x as many positive classes as negative classes,
        if you adjust weight_zero = 1.0, weight_one = 0.1, then false positives 
        will be penalized 10 times as much as false negatives.

    """
  
    # calculate the binary cross entropy
    bin_crossentropy = binary_crossentropy(true, pred)
    
    # apply the weights
    weights = true * weight_one + (1. - true) * weight_zero
    #weights /= (weight_one + weight_zero) # Normalizing to be more consistent with regular BCE for comparison 
    weighted_bin_crossentropy = weights * bin_crossentropy 

    return np.mean(weighted_bin_crossentropy)

In [17]:

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #From logits to probabilities
    probs_2d = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    #print(f'Probability 2 dimensions (0,1): {probs_2d}')
    probs = probs_2d[:, 1]  # positive class extraction
    #print(f'Probs 1 dimension (pos): {probs}')
    
    weigthted_bce = weighted_bincrossentropy(labels, probs)
    keras_bce = binary_crossentropy(labels, probs)
    keras_bce = float(np.mean(keras_bce.numpy()))  # Converting from keras eagertensor to float value
    
    # Wrapping all metrics to floats for json serialization during model eval
    return {
        'keras_BCE': keras_bce,
        'weighted BCE': weigthted_bce,
        'recall': float(recall_score(labels, probs.round())),
        'precision': float(average_precision_score(labels, probs)),
        'accuracy': float(accuracy_score(labels, probs.round())), # Need rounding for these two computations (integer required)
        'f1': float(f1_score(labels, probs.round(), average='macro')) # macro f1 is better for imbalanced dataset
    }

In [1]:
import pandas as pd

In [None]:
dataframe_5 = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/prepoc_data_for_tuning_final.json")

test_dataframe = dataframe_5[['text', 'label']]

'''
test_dataframe = test_dataframe[0:5000]

test_dataframe = test_dataframe.sample(frac=1).reset_index(drop=True)

test_dataset = Dataset.from_pandas(test_dataframe)

tokenized_test = test_dataset.map(tokenize_function, batched=True, num_proc=16)
'''

FileNotFoundError: File /work/RuneEgeskovTrust#9638/Bachelor/training_data/prepoc_data_for_tuning_final.json does not exist

In [6]:
dataframe_5 = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/preproc_data_for_tuning_final.json")

#test_dataframe = dataframe_5[['text', 'label']]

In [7]:
len(dataframe_5)

388027

In [11]:
dataframe = pd.read_csv("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/Corp_Folketing_V2.csv")

In [12]:
len(dataframe)

772180

In [17]:
dataframe[772160:]

Unnamed: 0.1,Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
772160,772160,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,315,Pia Kjærsgaard,,,True,22,Der er ikke stillet ændringsforslag. Er der n...,DK-Folketing,DNK
772161,772161,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,316,Pia Kjærsgaard,,,True,64,"Der stemmes om forslagets endelige vedtagelse,...",DK-Folketing,DNK
772162,772162,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,317,Pia Kjærsgaard,,,True,22,Der er ikke stillet ændringsforslag. Er der n...,DK-Folketing,DNK
772163,772163,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,318,Pia Kjærsgaard,,,True,41,"Der stemmes om forslagets endelige vedtagelse,...",DK-Folketing,DNK
772164,772164,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,319,Pia Kjærsgaard,,,True,6,"Hr. Holger K. Nielsen, værsgo. (Ordfører)",DK-Folketing,DNK
772165,772165,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,320,Holger K. Nielsen,SF,329.0,False,268,"Om lidt vil der være et flertal, som stemmer d...",DK-Folketing,DNK
772166,772166,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,321,Pia Kjærsgaard,,,True,16,"Tak. Er der flere, der ønsker ordet? Det er d...",DK-Folketing,DNK
772167,772167,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,322,Pia Kjærsgaard,,,True,37,Der stemmes om forslagets endelige vedtagelse....,DK-Folketing,DNK
772168,772168,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,323,Pia Kjærsgaard,,,True,11,Der er ikke stillet ændringsforslag. Værsgo t...,DK-Folketing,DNK
772169,772169,2018-12-20,2. (sidste) behandling af beslutningsforslag ...,324,Andreas Steenberg,RV,1507.0,False,325,"Siden 2016 er 62 børn blev udvist, selv om en ...",DK-Folketing,DNK


In [9]:
val_dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/Model_data/validation_set.json")

val_dataframe = val_dataframe[['text', 'label']]

val_dataset = Dataset.from_pandas(val_dataframe)

tokenized_val = val_dataset.map(tokenize_function, batched=True, num_proc=16)

Map (num_proc=16): 100%|██████████| 258/258 [00:00<00:00, 448.32 examples/s]


In [10]:
# Custom trainer class (weigthed)
from collections import Counter

labels = test_dataframe['label'].tolist()
class_counts = Counter(labels)
total = sum(class_counts.values())

# Higher weight = more emphasis
weights = [total/class_counts[0], total/class_counts[1]]
class_weights = torch.tensor(weights, dtype=torch.float)

#define custom trainer that uses weigted loss
import torch.nn as nn

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Define weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [None]:
dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/preprocessed_data_for_training.json")

dataframe = dataframe[['text', 'label']]

dataset = Dataset.from_pandas(dataframe)

tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=16)

In [16]:
'''
Look into learning rates, model is currently overfitting quite drastically ("small" test-set)
Normalizing weigthed BCE or no?
Look into regularization, dropout and early stopping to avoid overfitting
'''

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_test,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)


trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Keras Bce,Weighted bce,Recall,Precision,Accuracy,F1
1,3.9057,0.418282,0.417131,0.417131,0.863636,0.803588,0.813953,0.804051
2,3.6461,0.451754,0.449625,0.449625,0.897727,0.816758,0.813953,0.806076
3,2.3056,0.443235,0.441623,0.441623,0.886364,0.807174,0.821705,0.81289


Probability 2 dimensions (0,1): [[1.79530576e-01 8.20469439e-01]
 [5.80827653e-01 4.19172347e-01]
 [1.89221263e-01 8.10778737e-01]
 [3.78459096e-01 6.21540904e-01]
 [1.05948299e-01 8.94051671e-01]
 [7.66643286e-01 2.33356774e-01]
 [2.50912815e-01 7.49087214e-01]
 [2.99924761e-01 7.00075209e-01]
 [4.63782027e-02 9.53621805e-01]
 [8.19893301e-01 1.80106655e-01]
 [1.05210543e-01 8.94789517e-01]
 [1.38636634e-01 8.61363411e-01]
 [9.13058341e-01 8.69416445e-02]
 [1.44873232e-01 8.55126739e-01]
 [9.21816289e-01 7.81836733e-02]
 [9.84254107e-02 9.01574552e-01]
 [4.09990288e-02 9.59001005e-01]
 [7.10736662e-02 9.28926289e-01]
 [2.18335882e-01 7.81664133e-01]
 [1.09515160e-01 8.90484869e-01]
 [8.24814975e-01 1.75185025e-01]
 [1.98064163e-01 8.01935852e-01]
 [2.16011539e-01 7.83988476e-01]
 [2.30573058e-01 7.69427001e-01]
 [6.09988928e-01 3.90011042e-01]
 [4.54714298e-01 5.45285702e-01]
 [9.29376304e-01 7.06236586e-02]
 [1.15081510e-02 9.88491833e-01]
 [1.86243027e-01 8.13757002e-01]
 [1.3477593

  return fn(*args, **kwargs)


Probability 2 dimensions (0,1): [[7.75176585e-02 9.22482312e-01]
 [4.57136661e-01 5.42863309e-01]
 [9.26877782e-02 9.07312214e-01]
 [1.83301061e-01 8.16698968e-01]
 [4.46808673e-02 9.55319107e-01]
 [7.97556758e-01 2.02443182e-01]
 [1.85356632e-01 8.14643383e-01]
 [2.03707546e-01 7.96292484e-01]
 [2.21165903e-02 9.77883458e-01]
 [8.68826747e-01 1.31173223e-01]
 [4.30026464e-02 9.56997335e-01]
 [6.23310357e-02 9.37668979e-01]
 [9.43347573e-01 5.66524304e-02]
 [7.49052763e-02 9.25094783e-01]
 [8.99541318e-01 1.00458667e-01]
 [3.34629342e-02 9.66537058e-01]
 [2.08044890e-02 9.79195476e-01]
 [5.62363416e-02 9.43763673e-01]
 [1.14368513e-01 8.85631502e-01]
 [4.76026572e-02 9.52397346e-01]
 [7.68602967e-01 2.31397077e-01]
 [1.17977999e-01 8.82022023e-01]
 [9.43434164e-02 9.05656576e-01]
 [1.60003364e-01 8.39996636e-01]
 [5.66977859e-01 4.33022201e-01]
 [3.68405670e-01 6.31594300e-01]
 [9.49716032e-01 5.02839833e-02]
 [5.46927471e-03 9.94530797e-01]
 [1.59217417e-01 8.40782583e-01]
 [5.7071398

  return fn(*args, **kwargs)


Probability 2 dimensions (0,1): [[9.00929868e-02 9.09906983e-01]
 [4.81453836e-01 5.18546164e-01]
 [1.08188957e-01 8.91811073e-01]
 [2.09469691e-01 7.90530324e-01]
 [3.78924236e-02 9.62107599e-01]
 [8.73862088e-01 1.26137897e-01]
 [2.25416631e-01 7.74583399e-01]
 [2.29535222e-01 7.70464778e-01]
 [2.06459332e-02 9.79354084e-01]
 [9.43341076e-01 5.66589572e-02]
 [3.82500626e-02 9.61749971e-01]
 [7.72387832e-02 9.22761202e-01]
 [9.58537757e-01 4.14622724e-02]
 [6.50087371e-02 9.34991241e-01]
 [9.49200571e-01 5.07994629e-02]
 [4.06167768e-02 9.59383249e-01]
 [1.74424816e-02 9.82557476e-01]
 [6.63268715e-02 9.33673084e-01]
 [9.80793163e-02 9.01920617e-01]
 [5.67568950e-02 9.43243086e-01]
 [7.54191518e-01 2.45808482e-01]
 [1.61320239e-01 8.38679731e-01]
 [9.53494608e-02 9.04650509e-01]
 [1.81553245e-01 8.18446696e-01]
 [5.37284195e-01 4.62715834e-01]
 [4.82063383e-01 5.17936647e-01]
 [9.64153051e-01 3.58469225e-02]
 [4.07013809e-03 9.95929897e-01]
 [2.07214713e-01 7.92785347e-01]
 [4.6378202

TrainOutput(global_step=42, training_loss=3.8520758606138683, metrics={'train_runtime': 149.606, 'train_samples_per_second': 71.187, 'train_steps_per_second': 0.281, 'total_flos': 3740834789683200.0, 'train_loss': 3.8520758606138683, 'epoch': 3.0})

In [None]:
trainer.save_model("/work/RuneEgeskovTrust#9638/Bachelor/test_lora_layers")

In [18]:
state_dictionary = model.state_dict()

In [19]:
state_dictionary

OrderedDict([('base_model.model.model.embeddings.tok_embeddings.weight',
              tensor([[ 0.0043, -0.0012,  0.0032,  ...,  0.0077,  0.0092,  0.0135],
                      [-0.0140,  0.0028, -0.0312,  ..., -0.0204, -0.0220, -0.0178],
                      [-0.0082,  0.0160,  0.0003,  ...,  0.0471, -0.0361,  0.0025],
                      ...,
                      [-0.0059, -0.0200, -0.0503,  ...,  0.0052,  0.0515,  0.0308],
                      [-0.0009, -0.0121, -0.0082,  ...,  0.0562,  0.0211,  0.0184],
                      [ 0.0043, -0.0012,  0.0032,  ...,  0.0077,  0.0092,  0.0135]],
                     device='cuda:0')),
             ('base_model.model.model.embeddings.norm.weight',
              tensor([ 1.5547e+00,  4.7266e-01,  5.0391e-01,  5.0781e-01,  5.0000e-01,
                       5.1172e-01,  4.5508e-01,  1.3906e+00,  6.0547e-01,  4.9219e-01,
                       6.4844e-01,  4.1602e-01,  9.8047e-01,  1.4160e-01,  1.5000e+00,
                       5.6641e-

In [16]:
print(merged_model)

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(256000, 768, padding_idx=0)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      )
 

In [15]:
print(model)

PeftModel(
  (base_model): LoraModel(
    (model): ModernBertForSequenceClassification(
      (model): ModernBertModel(
        (embeddings): ModernBertEmbeddings(
          (tok_embeddings): Embedding(256000, 768, padding_idx=0)
          (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (layers): ModuleList(
          (0): ModernBertEncoderLayer(
            (attn_norm): Identity()
            (attn): ModernBertAttention(
              (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
              (rotary_emb): ModernBertRotaryEmbedding()
              (Wo): Linear(in_features=768, out_features=768, bias=False)
              (out_drop): Identity()
            )
            (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): ModernBertMLP(
              (Wi): Linear(in_features=768, out_features=2304, bias=False)
              (act): GELUActivation()
           

In [17]:
trainer_model = trainer.model

In [18]:
print(trainer_model)

PeftModel(
  (base_model): LoraModel(
    (model): ModernBertForSequenceClassification(
      (model): ModernBertModel(
        (embeddings): ModernBertEmbeddings(
          (tok_embeddings): Embedding(256000, 768, padding_idx=0)
          (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (layers): ModuleList(
          (0): ModernBertEncoderLayer(
            (attn_norm): Identity()
            (attn): ModernBertAttention(
              (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
              (rotary_emb): ModernBertRotaryEmbedding()
              (Wo): Linear(in_features=768, out_features=768, bias=False)
              (out_drop): Identity()
            )
            (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): ModernBertMLP(
              (Wi): Linear(in_features=768, out_features=2304, bias=False)
              (act): GELUActivation()
           

In [14]:
model.eval()

model_float16 = model.to(dtype=torch.bfloat16)  # or torch.float16 / torch.float32

merged_model = model_float16.merge_and_unload()  # merges LoRA deltas into base weights
merged_model.dequantize()  # <- Before or after merging - what actually works with no error?
# after training

merged_model.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/test_merged")
tokenizer.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/test_merged")


The model is going to be dequantized in torch.bfloat16 - if you want to upcast it to another dtype, make sure to pass the desired dtype when quantizing the model through `bnb_4bit_quant_type` argument of `BitsAndBytesConfig`


('/work/RuneEgeskovTrust#9638/Bachelor/test_merged/tokenizer_config.json',
 '/work/RuneEgeskovTrust#9638/Bachelor/test_merged/special_tokens_map.json',
 '/work/RuneEgeskovTrust#9638/Bachelor/test_merged/tokenizer.json')

In [None]:
print(merged_model)

In [None]:
# after training
model.eval()
merged_model = model.merge_and_unload()  # merges LoRA deltas into base weights
merged_model.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/test_merged")
tokenizer.save_pretrained("/work/RuneEgeskovTrust#9638/Bachelor/test_merged")


In [None]:
print(model)

In [None]:
print(merged_model)

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

In [None]:

with open("/work/RuneEgeskovTrust#9638/Bachelor/Bachelor_project/EvalResultFullData.txt", "w") as f:
    f.write(str(eval_results))

In [None]:
def preprocess_json(input_path, output_path=None):
    """
    Preprocesses a JSON file by filtering out entries based on the 'text' key.
    
    Criteria for deletion:
      - 'text' is missing or empty
      - 'text' length is <= 3
      - 'text' contains '(' or ')'
    
    Parameters:
        input_path (str): Path to the input JSON file.
        output_path (str, optional): If provided, saves the filtered JSON here.
    
    Returns:
        list: The filtered list of JSON entries.
    """
    # Load JSON file
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Filter entries
    filtered_data = [
        entry for entry in data
        if 'text' in entry
        and entry['text']
        and len(entry['text']) > 3
        and '(' not in entry['text']
        and ')' not in entry['text']
    ]

    # Optionally save to a new file
    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, ensure_ascii=False, indent=4)

    return filtered_data

In [None]:
state_in_mem = model.state_dict()


# After reloading
base = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    dtype=torch.bfloat16,
)
reloaded = PeftModel.from_pretrained(base, "/work/RuneEgeskovTrust#9638/Bachelor/test_lora_layers")
state_reloaded = reloaded.state_dict()



In [None]:
# Compare numerically - putput here confirms a mismatch, most likely the newly initialized quantization
# at load is caused the issue as this means adapters don't attach correctly to the base model
for k in state_in_mem.keys():
    if torch.any(state_in_mem[k] != state_reloaded[k]):
        print("Mismatch:", k)
        break

In [None]:
del model
torch.cuda.empty_cache()

In [None]:
preprocess_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/subset_1_2_3_4_5_cleaned_training_data.json",
 "/work/RuneEgeskovTrust#9638/Bachelor/training_data/preproc_subset_1_2_3_4_5_cleaned_training_data.json")