# Fine Tune reward model from scratch

# TODOs:

#TODO: double-check that labels are not somehow misaligned...

#TODO: check if you need to plot 

1. LoRA learns the position of the low rank adaptation matrix that is needed to finetune a model of a much higher rank

#TODO: double check model performance, generate output, maybe adjust training metrics

## 1. Imports, setup, and global variables

In [15]:
import torch
import pandas as pd
import os
import sys
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..')))

from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelForSequenceClassification




from datasets import Dataset, DatasetDict, load_from_disk

from peft import LoraConfig, get_peft_model, PeftModel

from utils import parse_ratings, tokenize_fn_with_best_window, tokenize_fn_basic_batched, CustomRewardTrainer


# from nltk.tokenize import sent_tokenize

# load the relevant devices available on the server
os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("AVAILABLE_DEVICES")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# load cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [16]:
# load training variables
FEEDBACK_TO_TRAIN_ON = os.getenv("FEEDBACK_TO_TRAIN_ON")
FEEDBACK_TO_REMOVE = os.getenv("FEEDBACK_TO_REMOVE")
MODEL = os.getenv("REWARD_MODEL")
DATASET = os.getenv("REWARD_DATASET")
TOKENIZE_FN = os.getenv("TOKENIZE_FN")
MAX_LENGTH = os.getenv("MAX_LENGTH")
STRIDE = os.getenv("STRIDE")
LORA_CHECKPOINTS_FOLDER = os.getenv("LORA_CHECKPOINTS_FOLDER")
FINAL_LORA_ADAPTERS = os.getenv("FINAL_LORA_ADAPTERS_FOLDER") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}"
TOKENIZED_DATA = os.getenv("TOKENIZED_DATA") + f"_{FEEDBACK_TO_TRAIN_ON}_{TOKENIZE_FN}_{DATASET}"

# load training data
FILE_1 = os.getenv("FILE_1")
FILE_5 = os.getenv("FILE_5")
FILE_7 = os.getenv("FILE_7")
FILE_9 = os.getenv("FILE_9")
FILE_10_1 = os.getenv("FILE_10_1")
FILE_10_2 = os.getenv("FILE_10_2")
FILE_SYNTH = os.getenv("FILE_SYNTH")

## 2. Dataset loading and preprocessing

In [17]:
# load dataframes
df_1 = pd.read_csv(FILE_1, sep=";")
df_5 = pd.read_csv(FILE_5, sep=";")
df_7 = pd.read_csv(FILE_7, sep=";")
df_9 = pd.read_csv(FILE_9, sep=";")
df_10_1 = pd.read_csv(FILE_10_1, sep=";")
df_10_2 = pd.read_csv(FILE_10_2, sep=";")
df_synth = pd.read_csv(FILE_SYNTH, sep=";")

df_human = pd.concat([df_1, df_5, df_7, df_9, df_10_1, df_10_2], ignore_index=True)

In [18]:
if DATASET == "human":
    df_train = df_human
elif DATASET == "synthetic":
    df_train = df_synth
    
df_train.shape

(929, 13)

### 2. a) Parse ratings to numeric values for MSE Loss

In [19]:
df_train[FEEDBACK_TO_TRAIN_ON] = [parse_ratings(feedback) for feedback in df_train[FEEDBACK_TO_TRAIN_ON]]
print("Parsed feedback for extraction:", df_train[FEEDBACK_TO_TRAIN_ON][:5])

Parsed feedback for extraction: 0    2
1    2
2    2
3    2
4    3
Name: feedback_extraction, dtype: object


### 2. b) look at biases in feedback to train on for weights in RL loop --> feedback_detection is very biased through way it was collected, so gets less weight overall...

In [20]:
df_train[FEEDBACK_TO_TRAIN_ON].value_counts()

feedback_extraction
0    499
3    231
2    104
1     95
Name: count, dtype: int64

### 2. c) keep only relevant feedback column

In [21]:
dataset = Dataset.from_pandas(df_train)

print(dataset)
print(FEEDBACK_TO_TRAIN_ON) 

Dataset({
    features: ['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id', 'precondition_text', 'precondition_position', 'response_text', 'prompt_config_examples', 'prompt_config_chain_of_thought', 'feedback_extraction', 'feedback_detection', 'additional_feedback'],
    num_rows: 929
})
feedback_extraction


In [22]:
dataset = dataset.remove_columns([FEEDBACK_TO_REMOVE])
dataset = dataset.rename_column(FEEDBACK_TO_TRAIN_ON, "label")

## 3. Load model with LoRA layer

In [23]:
# Load the model and the tokenizer
model_id = MODEL 
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1) # num_labels = 1 since we want to prodict a single scalar (the rating)

# Comment: Automodel for sequence classification with num_labels=1 already has a regression head
print(model)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [24]:
print(tokenizer.model_max_length)
print(model.config.max_position_embeddings)

512
512


In [25]:
# Define LoRA config
lora_config = LoraConfig(
    r=8,           # Rank of the LoRA matrices (smaller = less memory)
    lora_alpha=16, # Scaling factor (higher = stronger adaptation)
    target_modules=["query", "key", "value"], # Apply LoRA to attention layers
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"  # classify each answer 
)

# Convert the model to a PEFT (LoRA) model
model = get_peft_model(model, lora_config)
# model.gradient_checkpointing_enable()
model.print_trainable_parameters()  # Check trainable params (~0.1% of full model)


trainable params: 443,137 || all params: 109,926,146 || trainable%: 0.4031


In [26]:
# Test tokenizer
sample_data = ["What is the capital of France?", "What is the largest capital in the world?"]
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 1067, 223, 207, 580, 210, 1335, 124, 102, 0, 0, 0], [101, 1067, 223, 207, 5601, 190, 580, 213, 207, 1727, 124, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## 5. Encode dataset

In [27]:
# if labels are not integers, convert them to integers
def convert_label_to_int(data):
    data["label"] = int(data["label"])
    return data


print(dataset.column_names)
# mao string labels to integers
dataset = dataset.map(convert_label_to_int)  # Assuming 'text' is the column with the text data

print(dataset["label"][:5])  # Check labels
print(dataset["response_text"][:5])  # Check labels

['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id', 'precondition_text', 'precondition_position', 'response_text', 'prompt_config_examples', 'prompt_config_chain_of_thought', 'label', 'additional_feedback']


Map: 100%|██████████| 929/929 [00:00<00:00, 16730.97 examples/s]

[2, 2, 2, 2, 3]
['1. Subfact: Onze Minister\n                2. Positie: Artikel 1, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                3. Subfact: Onze Minister\n                4. Positie: Artikel 8, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                5. Subfact: Onze Minister\n                6. Positie: Artikel 14, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                7. Subfact: Onze Minister\n                8. Positie: Artikel 16, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                9. Subfact: Onze Minister\n                10. Positie: Artikel 17, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                11. Subfact: Onze Minister\n                12. Positie: Artikel 17a, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                13. Subfact: Onze Minister\n                14. Positie: Artikel 26, sectie 1 IN Vreemdelingenwet geldig vanaf 2024\n                15. Subfact: Onze Minister\n                16. Positie: Arti




## Comment

1. Needed for feedback extraction: precondition_text, response_text, label(rating feedback extraction)
2. Needed for feedback detection: precondition_text, precondition_position, response_text, label (rating feedback detection)
3. For the precondition position to be found well, it is a crucial for the model to find the precondition text (at least to a recognizable degree) as well, otherwise the precondition is not found at all...

In [28]:
# sample = dataset.select(range(5))
# tokenized_sample = tokenize_fn(sample)
# print([len(ids) for ids in tokenized_sample["input_ids"]])

if not os.path.exists(TOKENIZED_DATA):
    if TOKENIZE_FN == "best_window":
        dataset = dataset.map(tokenize_fn_with_best_window, 
                              fn_kwargs={"feedback_train": FEEDBACK_TO_TRAIN_ON, 
                                         "tokenizer": tokenizer, 
                                         "max_length": int(MAX_LENGTH), 
                                         "stride": int(STRIDE),
                                         "device": device
                                         },
                              batched=False)
    else:
        dataset = dataset.map(tokenize_fn_basic_batched, 
                              fn_kwargs={"feedback_train": FEEDBACK_TO_TRAIN_ON, 
                                         "tokenizer": tokenizer 
                                         },
                              batched=True)
    
    dataset.save_to_disk(TOKENIZED_DATA)
else:
    dataset = load_from_disk(TOKENIZED_DATA)

Map: 100%|██████████| 929/929 [00:00<00:00, 3708.29 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 929/929 [00:00<00:00, 168381.16 examples/s]


# Split dataset into train, test, eval

In [29]:
# split into train, test and eval sets
train_test_split = dataset.train_test_split(test_size=0.3, seed=42)
eval_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)


final_splits = DatasetDict({
    'train': train_test_split['train'],
    'validation': eval_test_split['train'],
    'test': eval_test_split['test']
})

## 6. Train reward model

In [30]:
# Training arguments
training_args = TrainingArguments(
    output_dir=LORA_CHECKPOINTS_FOLDER,
    eval_strategy='steps',
    save_strategy='steps',
    save_steps=100,
    eval_steps=100,
    save_total_limit=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    num_train_epochs=20,
    logging_steps=10,
    label_names=["labels"],
    # report_to="none",
    logging_dir="./logs",
    fp16=True,  # Use mixed precision training
    metric_for_best_model="eval_loss", # or "eval_loss"
    greater_is_better=False, # False if using loss
    # gradient_accumulation_steps=4 # 
)

# Initialize custom trainer
trainer = CustomRewardTrainer(
    model=model,
    args=training_args,
    train_dataset=final_splits['train'],
    eval_dataset=final_splits['validation'],
    # compute_metrics=trainer.compute_metrics,  # Use the custom metrics function
    processing_class=tokenizer,
    loss_type="huber",  # "mse" or "huber"
    weight_strategy="linear",  # "linear", "inverse", or None
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] # use early stopping since we are sing high amount of epochs
    # data_collator=RewardDataCollator()
)

# # add distributioncallback to trainer TODO: only integrate if relevant
# trainer.add_callback(DistributionCallback())

print(trainer.args.device)

cuda:0


In [31]:
#train model
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss
100,1.0474,1.194752
200,0.6404,0.79419
300,0.6016,0.784183
400,0.4256,0.6042
500,0.4177,0.636476
600,0.3784,0.612518
700,0.4021,0.559864
800,0.3853,0.616722


TrainOutput(global_step=820, training_loss=0.6142073491724526, metrics={'train_runtime': 59.3654, 'train_samples_per_second': 218.983, 'train_steps_per_second': 13.813, 'total_flos': 3438110128128000.0, 'train_loss': 0.6142073491724526, 'epoch': 20.0})

In [32]:
# store final model parameters
model.save_pretrained(FINAL_LORA_ADAPTERS)

# Reload saved LoRA adapter for inference 

In [33]:
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)
model = PeftModel.from_pretrained(base_model, FINAL_LORA_ADAPTERS)
model = model.merge_and_unload()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=final_splits['test'])
print("Test Results:", test_results)

Test Results: {'eval_loss': 0.5376802086830139, 'eval_runtime': 0.2384, 'eval_samples_per_second': 587.255, 'eval_steps_per_second': 37.752, 'epoch': 20.0}


In [35]:
# evaluate model manually on some test cases
model.to(device)
model.eval()

with torch.no_grad():
    for i in range(20):
        sample = final_splits['test'][i]
        inputs = tokenizer(sample['precondition_text'] + " " + sample['response_text'], return_tensors='pt', truncation=True, padding="max_length").to(device)
        outputs = model(**inputs)
        prediction = outputs.logits.item()
        print(f"Sample {i+1}: Predicted Rating: {prediction}, True Rating: {sample['label']}")


Sample 1: Predicted Rating: 2.8296525478363037, True Rating: 1
Sample 2: Predicted Rating: 0.0598440058529377, True Rating: 0
Sample 3: Predicted Rating: 2.578739881515503, True Rating: 3
Sample 4: Predicted Rating: 0.08147154748439789, True Rating: 0
Sample 5: Predicted Rating: 0.6681082844734192, True Rating: 3
Sample 6: Predicted Rating: 0.173975870013237, True Rating: 0
Sample 7: Predicted Rating: 2.470729112625122, True Rating: 1
Sample 8: Predicted Rating: 0.2174949198961258, True Rating: 0
Sample 9: Predicted Rating: -0.02062557265162468, True Rating: 0
Sample 10: Predicted Rating: 0.08285029232501984, True Rating: 0
Sample 11: Predicted Rating: 0.0437869168817997, True Rating: 0
Sample 12: Predicted Rating: 1.4138603210449219, True Rating: 0
Sample 13: Predicted Rating: 0.12305611371994019, True Rating: 0
Sample 14: Predicted Rating: 3.0094668865203857, True Rating: 1
Sample 15: Predicted Rating: 2.069288492202759, True Rating: 0
Sample 16: Predicted Rating: 0.02712217345833778