In [2]:
%pip install -r "requirements_bert.txt"

Collecting peft (from -r requirements_bert.txt (line 3))
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes (from -r requirements_bert.txt (line 4))
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets (from -r requirements_bert.txt (line 6))
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting scikit-learn (from -r requirements_bert.txt (line 7))
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting pyarrow>=21.0.0 (from datasets->-r requirements_bert.txt (line 6))
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets->-r requirements_bert.txt (line 6))
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting httpx<1.0.0 (from datasets->-r requirements_bert.txt (line 6))
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1

In [1]:
import torch
import transformers
from peft import LoraConfig, get_peft_model, TaskType
import bitsandbytes
import accelerate
import datasets
#import scikit-learn
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import Conv1D, AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, AutoModelForCausalLM
from sklearn.model_selection import train_test_split
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "jhu-clsp/mmBERT-base"

quantization_config = BitsAndBytesConfig(
                                        load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16,
                                         bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True,
                                         )

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
'''
Chunk for looking into trainable layers of the model itself.
'''

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []

    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])

    return layer_names

list(set(get_specific_layer_names(model)))


In [3]:
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["Wqkv"],  # Fine-tuning the attention layer specifically
)

lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()

trainable params: 540,672 || all params: 308,072,450 || trainable%: 0.1755


In [4]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=1,  # Start small, increase gradually
    gradient_accumulation_steps=12,  # Simulate larger batch size

    logging_steps=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # Enable mixed precision
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    max_grad_norm=1.0,

    disable_tqdm=False,
)

In [6]:
'''
Dataset generation chunk
We need to pass it through the BERT tokenizer here, make a train / test / val split and pass that to the model

Below is the structure which worked for the Pol_NLI dataset, we should strive to do the same
'''
dataframe = pd.read_json("/work/RuneEgeskovTrust#9638/Bachelor/training_data/training_data.json")

#tokenized_dataset = tokenized_dataset.rename_column("entailment", "labels") # Rename entailment column to labels (which is standard lookup for evaluation in the transmformers trainer)

In [30]:
dataframe["label"][1900]

np.int64(1)

In [None]:
dataset = dataframe[0:5000]

random_dataset = dataset.sample(n=2000, axis=0, random_state=40)

X = dataset["text"]
y = dataset["label"]

X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

np.int64(0)

In [45]:
random_dataset.head()

Unnamed: 0,paragraph,sentence_nr,text,speaker,party,preceding_sentence,succeeding_sent,current_speaker_in_government,parties_in_government,date,label
4088,94,5,Så er fordelen ved retoriske spørgsmål selvføl...,Søren Søndergaard,EL,"Man vil både have i pose og sæk, og man vil i...","Når en eller anden narkoman begår et indbrud,...",False,"[S, RV]",1997-10-09,0
2080,2,106,"Vi har vist, at et flittigt folk er at finde ...",Poul Nyrup Rasmussen,S,"Danmark er i fremgang, og håbet og troen på f...",Regeringen har gennem sin økonomiske politik -...,True,"[S, RV]",1997-10-07,0
41,183,97,"De folk, der står bagest, er taberne.",Poul Nyrup Rasmussen,S,"Taberne er de folk, der er dårligst uddannede...","Jeg siger det for god ordens skyld, for det er...",True,"[S, RV]",1997-10-09,1
796,13630,3,"Men det skal også siges, at regeringens liv h...",Holger K. Nielsen,SF,"Folketingsvalget var udtryk for en kamp om, hv...","Og det bør regeringen have i baghovedet, når d...",False,"[S, RV]",1998-06-25,1
354,6487,25,"Jeg mener, at det er de i ånden svage dansker...",Arne Melchior,CD,I øvrigt interesserer jeg mig slet ikke så meg...,"Der har heldigvis været en række ordførere, d...",False,"[S, RV]",1998-01-22,1


In [46]:
# From dataframe to dataset for mapping tokenizer function 
dataset = Dataset.from_pandas(random_dataset)

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function)

Map: 100%|██████████| 2000/2000 [00:04<00:00, 415.12 examples/s]


In [19]:
dataset["text"]

Column(['Regeringen vil også fortsætte sin offensive  miljøpolitik.', 'Sådan som debatten var foregået, sagde hr. Torben Lund,  var det lavt og nedrigt og pinligt.', 'Jeg vil tillade mig at  sige, at det er lavt og nedrigt og pinligt af den siddende  regering, at den overhovedet ikke har taget debatten op før  nu, når der er et folketingsvalg og et kommunevalg i sigte.', 'Det er oprigtig talt dybt beskæmmende, at Socialdemokratiets  ordfører kan prøve på at bortforklare de internationale  sammenligninger, der har været af skoleelevers kundskaber med  hensyn til læsning, regning, fysik og kemi.', 'Først vil jeg sige til hr. Helge Adam Møller, der jo her  havde en kort bemærkning, der svarede til, tror jeg, næsten  ordret de korte bemærkninger, hr. Helge Adam Møller har haft  til åbningsdebatter og afslutningsdebatter i hvert fald i de  år, jeg har været politisk ordfører, og de går på, at De  Konservative vil hårdere straffe og mere fængsel, og at vi  andre bare er sådan nogle slatne no

In [None]:

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1_score(labels, predictions, average='macro')
    }

In [None]:
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)


trainer.train()

In [None]:
# This is where we should very much remember to save the finetuned model locally as this contains the new weights for use in analyzing new text
lora_model.save_pretrained(f"output/mmBERT/{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}")