# Fine Tune reward model from scratch

# TODOs:

#TODO: double-check that labels are not somehow misaligned...

#TODO: check if you need to plot 

1. LoRA learns the position of the low rank adaptation matrix that is needed to finetune a model of a much higher rank

#TODO: double check model performance, generate output, maybe adjust training metrics

## 1. Imports, setup, and global variables

In [1]:
import torch
import pandas as pd
import os
import sys
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..')))

from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from collections import Counter

from datasets import Dataset, DatasetDict, load_from_disk

from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig

from utils import parse_ratings, tokenize_fn_with_best_window, tokenize_fn_basic_batched, CustomRewardTrainer, find_best_window, convert_label_to_int


# from nltk.tokenize import sent_tokenize

# load the relevant devices available on the server
os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("AVAILABLE_DEVICES")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# load cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
# load training variables
FEEDBACK_TO_TRAIN_ON = os.getenv("FEEDBACK_TO_TRAIN_ON")
FEEDBACK_TO_REMOVE = os.getenv("FEEDBACK_TO_REMOVE")
MODEL = os.getenv("REWARD_MODEL")
DATASET = os.getenv("REWARD_DATASET")
TOKENIZE_FN = os.getenv("TOKENIZE_FN")
MAX_LENGTH = os.getenv("MAX_LENGTH")
STRIDE = os.getenv("STRIDE")
LORA_CHECKPOINTS_FOLDER = os.getenv("LORA_CHECKPOINTS_FOLDER")

#TODO: change this to not store model since contains /!!!
FINAL_LORA_ADAPTERS = os.getenv("FINAL_LORA_ADAPTERS_FOLDER") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}"
TOKENIZED_DATA_TRAIN = os.getenv("TOKENIZED_DATA") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}_train"
TOKENIZED_DATA_EVAL = os.getenv("TOKENIZED_DATA") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}_eval"
TOKENIZED_DATA_TEST = os.getenv("TOKENIZED_DATA") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}_test"
DATASET_STRUCTURE = os.getenv("DATASET_STRUCTURE")

REWARD_DATA_PATH = os.getenv("REWARD_DATA_PATH")

if DATASET_STRUCTURE == "determined":
    REWARD_MODEL_TRAIN_DATA_HUMAN = REWARD_DATA_PATH + "/train_human_determined.csv"
    REWARD_MODEL_EVAL_DATA_HUMAN = REWARD_DATA_PATH + "/validation_human_determined.csv"
    REWARD_MODEL_TEST_DATA_HUMAN = REWARD_DATA_PATH + "/test_human_determined.csv"

    REWARD_MODEL_TRAIN_DATA_SYNTH = REWARD_DATA_PATH + "/train_synth_determined.csv"
    REWARD_MODEL_EVAL_DATA_SYNTH = REWARD_DATA_PATH + "/validation_synth_determined.csv"
    REWARD_MODEL_TEST_DATA_SYNTH = REWARD_DATA_PATH + "/test_synth_determined.csv"

elif DATASET_STRUCTURE == "random":
    REWARD_MODEL_TRAIN_DATA_HUMAN = REWARD_DATA_PATH + "/train_human_random.csv"
    REWARD_MODEL_EVAL_DATA_HUMAN = REWARD_DATA_PATH + "/validation_human_random.csv"
    REWARD_MODEL_TEST_DATA_HUMAN = REWARD_DATA_PATH + "/test_human_random.csv"

    REWARD_MODEL_TRAIN_DATA_SYNTH = REWARD_DATA_PATH + "/train_synth_random.csv"
    REWARD_MODEL_EVAL_DATA_SYNTH = REWARD_DATA_PATH + "/validation_synth_random.csv"
    REWARD_MODEL_TEST_DATA_SYNTH = REWARD_DATA_PATH + "/test_synth_random.csv"

## 2. Dataset loading and preprocessing

#### Re-structure df synthetic to fit in training loop

In [3]:
if DATASET == "human":
    df_train = pd.read_csv(REWARD_MODEL_TRAIN_DATA_HUMAN, sep=";")
    df_eval = pd.read_csv(REWARD_MODEL_EVAL_DATA_HUMAN, sep=";")
    df_test = pd.read_csv(REWARD_MODEL_TEST_DATA_HUMAN, sep=";")
elif DATASET == "synthetic":
    df_train = pd.read_csv(REWARD_MODEL_TRAIN_DATA_SYNTH, sep=";")
    df_eval = pd.read_csv(REWARD_MODEL_EVAL_DATA_SYNTH, sep=";")
    df_test = pd.read_csv(REWARD_MODEL_TEST_DATA_SYNTH, sep=";")
    
    
df_train.shape
df_train.columns

Index(['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id',
       'precondition_text', 'precondition_position', 'response_text',
       'prompt_config_examples', 'prompt_config_chain_of_thought',
       'feedback_extraction', 'feedback_detection', 'additional_feedback',
       'synthetic_feedback'],
      dtype='object')

### 2. a) Parse ratings to numeric values for MSE Loss

In [4]:
df_train[FEEDBACK_TO_TRAIN_ON] = [parse_ratings(feedback) for feedback in df_train[FEEDBACK_TO_TRAIN_ON]]
df_eval[FEEDBACK_TO_TRAIN_ON] = [parse_ratings(feedback) for feedback in df_eval[FEEDBACK_TO_TRAIN_ON]]
df_test[FEEDBACK_TO_TRAIN_ON] = [parse_ratings(feedback) for feedback in df_test[FEEDBACK_TO_TRAIN_ON]]
print("Parsed feedback for extraction:", df_train[FEEDBACK_TO_TRAIN_ON][:5])

Parsed feedback for extraction: 0    0
1    1
2    1
3    1
4    0
Name: feedback_extraction, dtype: object


### 2. b) look at biases in feedback to train on for weights in RL loop --> feedback_detection is very biased through way it was collected, so gets less weight overall...

In [5]:
df_train[FEEDBACK_TO_TRAIN_ON].value_counts()

feedback_extraction
0    265
2     63
1     35
3     31
Name: count, dtype: int64

### 2. c) keep only relevant feedback column

In [6]:
dataset_train = Dataset.from_pandas(df_train)
dataset_eval = Dataset.from_pandas(df_eval)
dataset_test = Dataset.from_pandas(df_test)

print(dataset_train)
print(FEEDBACK_TO_TRAIN_ON) 

datasets = [dataset_train, dataset_eval, dataset_test]

Dataset({
    features: ['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id', 'precondition_text', 'precondition_position', 'response_text', 'prompt_config_examples', 'prompt_config_chain_of_thought', 'feedback_extraction', 'feedback_detection', 'additional_feedback', 'synthetic_feedback'],
    num_rows: 394
})
feedback_extraction


In [7]:
datasets= [dataset.remove_columns([FEEDBACK_TO_REMOVE]) for dataset in datasets]
datasets = [dataset.rename_column(FEEDBACK_TO_TRAIN_ON, "label") for dataset in datasets]

print(datasets[0]["label"])

['0', '1', '1', '1', '0', '0', '0', '0', '3', '0', '0', '1', '0', '1', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '3', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '2', '0', '0', '2', '0', '0', '2', '2', '0', '0', '0', '0', '3', '0', '2', '2', '0', '0', '0', '0', '1', '1', '0', '0', '0', '1', '0', '0', '0', '3', '0', '0', '2', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '0', '0', '0', '0', '2', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '2', '2', '0', '0', '3', '0', '3', '2', '2', '0', '0', '0', '0', '0', '0', '3', '2', '0', '0', '1', '0', '0', '2', '0', '2', '3', '0', '3', '0', '2', '0', '0', '2', '0', '0', '0', '0', '1', '2', '0', '2', '0', '2', '0', '0', '2', '2', '0', '0', '0', '1', '1', '0', '0', '0', '2', '1', '0', '2', '0', '1', '2', '0', '1', '0', '0', '3', '0', '0', '2', '2', '0', '0', '0', '3', '0', '3', '0', '0', '0', '3', '0', '0', '3', '0', '0',

## 3. Load model with LoRA layer

In [8]:
# Load the model and the tokenizer
model_id = MODEL 
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1) # num_labels = 1 since we want to prodict a single scalar (the rating)

# Comment: Automodel for sequence classification with num_labels=1 already has a regression head
print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
print(tokenizer.model_max_length)
print(model.config.max_position_embeddings)

512
512


In [10]:
# Define LoRA config


if MODEL == "answerdotai/ModernBERT-base":

    lora_config = LoraConfig(
    r=8,           # Rank of the LoRA matrices (smaller = less memory)
    lora_alpha=16, # Scaling factor (higher = stronger adaptation)
    target_modules=["Wqkv", "Wo"], # Apply LoRA to attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"  # classify each answer 
    )
else:
    lora_config = LoraConfig(
    r=8,           # Rank of the LoRA matrices (smaller = less memory)
    lora_alpha=16, # Scaling factor (higher = stronger adaptation)
    target_modules=["query", "key", "value"], # Apply LoRA to attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"  # classify each answer 
    )
    

# Freeze base model
for param in model.base_model.parameters():
    param.requires_grad = False



# Convert the model to a PEFT (LoRA) model
peft_model = get_peft_model(model, lora_config)
# model.gradient_checkpointing_enable()
peft_model.print_trainable_parameters()  # Check trainable params (~0.1% of full model)


trainable params: 443,137 || all params: 109,926,146 || trainable%: 0.4031


In [11]:
# Test tokenizer
sample_data = ["What is the capital of France?", "What is the largest capital in the world?"]
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 1067, 223, 207, 580, 210, 1335, 124, 102, 0, 0, 0], [101, 1067, 223, 207, 5601, 190, 580, 213, 207, 1727, 124, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## 5. Encode dataset

In [12]:
print(datasets[0].column_names)
# mao string labels to integers
datasets = [dataset.map(convert_label_to_int) for dataset in datasets]

print(datasets[0]["label"][:5])  # Check labels
print(datasets[0]["response_text"][:5])  # Check labels

['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id', 'precondition_text', 'precondition_position', 'response_text', 'prompt_config_examples', 'prompt_config_chain_of_thought', 'label', 'additional_feedback', 'synthetic_feedback']


Map: 100%|██████████| 394/394 [00:00<00:00, 11158.53 examples/s]
Map: 100%|██████████| 85/85 [00:00<00:00, 7656.30 examples/s]
Map: 100%|██████████| 85/85 [00:00<00:00, 9308.99 examples/s]

[0, 1, 1, 1, 0]
['1. Het college stelt het recht op bijstand ambtshalve vast indien:\n                   - het in aanmerking te nemen inkomen lager is dan de bijstandsnorm; en\n                   - er geen in aanmerking te nemen vermogen is.\n                   Positie: Artikel 19, sectie 1, IN Hoofdstuk 3, Algemene bijstand, § 3.1, Wet op de Bijstand aan de arbeidsongeschikte, de arbeidsongeschikte zieke en de arbeidsongeschikte verzorgende.\n\n                2. De belanghebbende heeft zich gemeld als zijn naam, adres en woonplaats bij het Uitvoeringsinstituut werknemersverzekeringen zijn geregistreerd, en:\n                   - indien artikel 41, vierde lid, van toepassing is: hij door het Uitvoeringsinstituut werknemersverzekeringen op de hoogte is gesteld van de verplichting, bedoeld in artikel 9, eerste lid, onderdeel a, en de inhoud van artikel 41;\n                   - indien artikel 41, vierde lid, niet van toepassing is: hij in staat is gesteld zijn aanvraag in te dienen bij 




## Comment

1. Needed for feedback extraction: precondition_text, response_text, label(rating feedback extraction)
2. Needed for feedback detection: precondition_text, precondition_position, response_text, label (rating feedback detection)
3. For the precondition position to be found well, it is a crucial for the model to find the precondition text (at least to a recognizable degree) as well, otherwise the precondition is not found at all...

In [13]:
# Code to test bestw indow function

test_text = """
        Titel: De Weg Door Het Leven

Het leven is een reis vol onverwachte wendingen, een pad dat zich zelden rechtlijnig ontvouwt. Vanaf het moment dat we onze eerste ademhaling nemen, worden we ondergedompeld in een wereld die we nog moeten leren begrijpen. Als kind lijkt alles eenvoudig: lachen, spelen, ontdekken. Maar naarmate we ouder worden, beginnen de lagen van complexiteit zich op te stapelen. We leren dat mensen niet altijd zeggen wat ze bedoelen, dat keuzes consequenties hebben, en dat geluk soms vluchtiger is dan we zouden willen.

In de vroege ochtenden, wanneer de zon net boven de horizon verschijnt en de wereld nog stil is, denken velen na over hun plaats in het grotere geheel. Sommigen vragen zich af of ze de juiste keuzes hebben gemaakt, of ze trouw zijn gebleven aan zichzelf. Anderen proberen simpelweg de dag door te komen, met hoop op iets beters. In die momenten van stilte komt vaak het besef dat, hoewel we allemaal verschillende paden bewandelen, we één waarheid delen: dat het leven, ondanks al onze inspanningen en verlangens, nooit gemakkelijk is. Of, zoals mijn grootmoeder het ooit zei terwijl ze haar handen vouwde na een lange dag werken op het land: “Je moet weten, kind, het leven is nooit gemakkelijk, maar het is wel de moeite waard.”

We worden gevormd door onze ervaringen, door de mensen die we ontmoeten en de obstakels die we overwinnen. Elke fout, elk succes, elke traan en elke glimlach draagt bij aan wie we zijn. En toch, ondanks al die ervaringen, blijven we zoeken. Naar betekenis. Naar verbinding. Naar rust.

Soms lijkt het alsof de wereld te snel draait. Technologie verandert ons leven in een razend tempo, verwachtingen worden hoger, en de druk om te presteren neemt toe. In die chaos vergeten we soms stil te staan. Te ademen. Te voelen. Maar juist in die momenten van rust vinden we vaak de antwoorden die we zo hard nodig hebben.

De liefde, bijvoorbeeld, is een van de krachtigste krachten die ons voortdrijft. Liefde voor een partner, een kind, een vriend, of zelfs voor een passie. Het is die liefde die ons helpt vol te houden wanneer alles tegenzit. Die ons eraan herinnert waarom we begonnen zijn, waarom we blijven proberen.

En dan is er verlies. Een onvermijdelijk onderdeel van het leven. We verliezen mensen, kansen, dromen. Maar in dat verlies schuilt ook groei. We leren loslaten, opnieuw beginnen, sterker worden. Het is pijnlijk, ja, maar ook noodzakelijk.

Wanneer we terugkijken op ons leven, zijn het zelden de materiële zaken die we herinneren. Het zijn de momenten. De gesprekken bij kaarslicht. De wandelingen in de regen. De onverwachte lachbuien. De stilte van een gedeeld verdriet. Die momenten vormen de essentie van ons bestaan.

Dus ja, het leven is vol uitdagingen. Het is rommelig, verwarrend, soms oneerlijk. Maar het is ook prachtig, rijk aan betekenis, en gevuld met kansen om te groeien, te leren en lief te hebben. En misschien is dat wel de grootste les van allemaal: dat we, ondanks alles, blijven kiezen voor hoop. Voor verbinding. Voor het leven zelf.
        """


test_ground_truth = "Het leven is nooit gemakkelijk."

print(find_best_window(test_text, test_ground_truth, device, tokenizer))

# Works as expectd, I am impressed.

Token indices sequence length is longer than the specified maximum sequence length for this model (1127 > 512). Running this sequence through the model will result in indexing errors


de juiste keuzes hebben gemaakt, of ze trouw zijn gebleven aan zichzelf. anderen proberen simpelweg de dag door te komen, met hoop op iets beters. in die momenten van stilte komt vaak het besef dat, hoewel we allemaal verschillende paden bewandelen, we een waarheid delen : dat het leven, ondanks al onze inspanningen en verlangens, nooit gemakkelijk is. of, zoals mijn grootmoeder het ooit zei terwijl ze haar handen vouwde na een lange dag werken op het land : [UNK] je moet weten, kind, het leven is nooit gemakkelijk, maar het is wel de moeite waard. [UNK] we worden gevormd door onze ervaringen, door de mensen die we ontmoeten en de obstakels die we overwinnen. elke fout, elk succes, elke traan en elke glimlach draagt bij aan wie we zijn. en toch, ondanks al die ervaringen, blijven we zoeken. naar betekenis. naar verbinding. naar rust. soms lijkt het alsof de wereld te snel draait. technologie verandert ons leven in een razend tempo, verwachtingen worden hoger, en de druk om te presteren

In [14]:
if not os.path.exists(TOKENIZED_DATA_TRAIN) or True:
    if TOKENIZE_FN == "best_window":
        datasets = [dataset.map(tokenize_fn_with_best_window, 
                            fn_kwargs={"feedback_train": FEEDBACK_TO_TRAIN_ON, 
                                        "tokenizer": tokenizer, 
                                        "max_length": int(MAX_LENGTH), 
                                        "stride": int(STRIDE),
                                        "device": device
                                        },
                            batched=False) for dataset in datasets]
    else:
        datasets = [dataset.map(tokenize_fn_basic_batched, 
                            fn_kwargs={"feedback_train": FEEDBACK_TO_TRAIN_ON, 
                                        "tokenizer": tokenizer 
                                        },
                            batched=True) for dataset in datasets]


    datasets[0].save_to_disk(TOKENIZED_DATA_TRAIN)
    datasets[1].save_to_disk(TOKENIZED_DATA_EVAL)
    datasets[2].save_to_disk(TOKENIZED_DATA_TEST)
else:
    datasets[0] = load_from_disk(TOKENIZED_DATA_TRAIN)
    datasets[1] = load_from_disk(TOKENIZED_DATA_TEST)
    datasets[2] = load_from_disk(TOKENIZED_DATA_EVAL)

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

Map: 100%|██████████| 394/394 [05:43<00:00,  1.15 examples/s]
Map: 100%|██████████| 85/85 [01:09<00:00,  1.21 examples/s]
Map: 100%|██████████| 85/85 [01:18<00:00,  1.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 394/394 [00:00<00:00, 55647.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 19809.74 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 85/85 [00:00<00:00, 22312.92 examples/s]


In [15]:
print(Counter(datasets[0]['file']))

Counter({'Participatiewet_most_recent_public.json': 140, 'Interpretatie_Vw_over_besluiten_op_aanvragen_voor_een_verblijfsvergunning_regulier_bepaalde_tijd.json': 134, 'rijksbegrotingscyclus.json': 120})


## 6. Train reward model

In [16]:
# Training arguments

from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)


#TODO: switch to cross entropy loss...
training_args = TrainingArguments(
    output_dir=LORA_CHECKPOINTS_FOLDER,
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=40,
    eval_steps=40,
    save_total_limit=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-4,
    num_train_epochs=10,
    logging_steps=10,
    label_names=["labels"],
    # report_to="none",
    logging_dir="./logs",
    # fp16=True,  # Use mixed precision training
    metric_for_best_model="eval_loss", # or "eval_loss"
    greater_is_better=False, # False if using loss
    # gradient_accumulation_steps=4, # 
    # torch_compile=False
    # weight_decay=0.01
    warmup_steps=82, 
)

# Initialize custom trainer
trainer = CustomRewardTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=datasets[0],
    eval_dataset=datasets[1],
    # compute_metrics=trainer.compute_metrics,  # Use the custom metrics function
    processing_class=tokenizer,
    loss_type="huber",  # "mse" or "huber"
    weight_strategy="linear",  # "linear", "inverse", or None
    data_collator=data_collator,
)

print(trainer.args.device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


cuda:0


In [17]:
# if not os.path.exists(FINAL_LORA_ADAPTERS):
# train model
trainer.train()
# # store final model parameters
peft_model.save_pretrained(FINAL_LORA_ADAPTERS)

# #TODO: not storing this properly I suppose, need to change

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mjacques-furst123[0m ([33mjacques-furst123-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable

Step,Training Loss,Validation Loss
40,0.6832,0.988718
80,1.1068,0.682266
120,0.6629,0.3246
160,0.4304,0.350311
200,0.3057,0.28296
240,0.3664,0.231883
280,0.2864,0.148038
320,0.2182,0.16049
360,0.3234,0.139623
400,0.196,0.120666


  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise
  loss = F.huber_loss(logits, labels, reduction="none",

# Reload saved LoRA adapter for inference 

In [18]:
base_model_test = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)


# config = PeftConfig.from_pretrained(FINAL_LORA_ADAPTERS)
# base_model_test = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=1)

new_model = PeftModel.from_pretrained(base_model_test, FINAL_LORA_ADAPTERS)
# new_model = new_model.merge_and_unload()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
print(FINAL_LORA_ADAPTERS)

/home/jacques.furst/development/RAG/flintfiller-precondition-rl/reward_training_files/final_lora_adapters_feedback_extraction_best_window_synthetic


In [20]:
# Initialize trainer with new model
trainer = CustomRewardTrainer(
    model=new_model,
    args=training_args,
    train_dataset=datasets[0],
    eval_dataset=datasets[1],
    # compute_metrics=trainer.compute_metrics,  # Use the custom metrics function
    processing_class=tokenizer,
    loss_type="huber",  # "mse" or "huber"
    weight_strategy="linear",  # "linear", "inverse", or None
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=50)] # use early stopping since we are sing high amount of epochs
    # data_collator=RewardDataCollator()
    # torch_compile=False
)

In [21]:
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=datasets[2])
print("Test Results:", test_results)

  loss = F.huber_loss(logits, labels, reduction="none", delta=1.0) #--> balances between MSE and MAE for data that has outliers/ noise


Test Results: {'eval_loss': 0.352273166179657, 'eval_model_preparation_time': 0.0064, 'eval_runtime': 0.4502, 'eval_samples_per_second': 188.808, 'eval_steps_per_second': 48.868}


In [22]:
# evaluate model manually on some test cases
new_model.to(device)
new_model.eval()

#TODO: change tokenization function here!

with torch.no_grad():
    for i in range(20):
        sample = datasets[2][i]
        inputs = tokenizer(sample['precondition_text'] + " " + sample['response_text'], return_tensors='pt', truncation=True, padding="max_length").to(device)
        outputs = new_model(**inputs)
        prediction = outputs.logits.item()
        print(f"Sample {i+1}: Predicted Rating: {prediction}, True Rating: {sample['label']}")


Sample 1: Predicted Rating: -0.06262409687042236, True Rating: 0
Sample 2: Predicted Rating: 0.002692139707505703, True Rating: 0
Sample 3: Predicted Rating: 0.17508560419082642, True Rating: 0
Sample 4: Predicted Rating: 0.06614108383655548, True Rating: 0
Sample 5: Predicted Rating: 0.44440513849258423, True Rating: 0
Sample 6: Predicted Rating: -0.013227694667875767, True Rating: 0
Sample 7: Predicted Rating: 0.2783017158508301, True Rating: 2
Sample 8: Predicted Rating: -0.04929590970277786, True Rating: 0
Sample 9: Predicted Rating: -0.12832462787628174, True Rating: 0
Sample 10: Predicted Rating: -0.06848514825105667, True Rating: 0
Sample 11: Predicted Rating: -0.049879059195518494, True Rating: 0
Sample 12: Predicted Rating: 0.06139906495809555, True Rating: 0
Sample 13: Predicted Rating: -0.24118205904960632, True Rating: 0
Sample 14: Predicted Rating: -0.08462418615818024, True Rating: 0
Sample 15: Predicted Rating: 2.085700750350952, True Rating: 2
Sample 16: Predicted Ratin

## test reward model on prompt structure

In [23]:
response_text = """
                Inhoud: <inhoud>
                 <details>
                 <summary>parsering</summary>
                 <pre>
                 <code>
        subfact
                 </code>
                 </pre>
                 </details>
 
                 Resultaat:


                Subfact: vreemdeling 
 
                Positie: Artikel 8 IN Verordening vreemdelingenattributen
 
                Inhoud: de vreemdeling heeft in Nederland uitsluitend rechtmatig verblijf:
                <details>
                <summary>parsering</summary>
                <pre>
                <code>
        de vreemdeling
                </code>
                </pre>
                </details>
 
                Subfact: vreemdeling 
 
                Positie: Artikel 8 IN Verordening vreemdelingenattributen
 
                Inhoud: het verblijf van een vreemdeling in Nederland op grond van deze wet anders dan op de 
                gronden bedoeld in de artikelen 29 en 34
                <details>
                <summary>parsering</summary>
                <pre>
                <code>
        het verblijf van een vreemdeling
                </code>
                </pre>
                </details>
 
                Subfact: vreemdeling 
 
                Positie: Artikel 8, onder a IN Verordening vreemdelingenattributen
 
                Inhoud: op grond van een verblijfsvergunning voor bepaalde tijd als bedoeld in artikel 14;
                <details>
                <summary>parsering</summary>
                <pre>
                <code>
        op grond van een verblijfsvergunning voor bepaalde tijd
                </code>
                </pre>
                </details>
                """

precon_text = "NOT ieder die op grond van een wettelijke bepaling als Nederlander moet worden behandeld"

with torch.no_grad():
    inputs = tokenizer(precon_text + " " + response_text, return_tensors='pt', truncation=True, padding="max_length").to(device)
    outputs = new_model(**inputs)
    prediction = outputs.logits.item()
    print(f"Sample {1}: Predicted Rating: {prediction}")

Sample 1: Predicted Rating: 1.946418046951294
