# RL Training

In [1]:
import torch
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from trl import PPOConfig, create_reference_model, AutoModelForCausalLMWithValueHead
from trl import GRPOTrainer, GRPOConfig
from datasets import DatasetDict

from peft import LoraConfig, get_peft_model, PeftModel, prepare_model_for_kbit_training

import os
from dotenv import load_dotenv
from utils import CustomRewardFunction, LabelPreservingCollator, CustomRewardFunctionPPOTrainer
from ppo_trainer_custom import CustomPPOTrainer
import pandas as pd
from accelerate import Accelerator

from types import MethodType
# import sys
# import wandb

# # Add the parent directory to the Python path
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Load environment variables from .env file
load_dotenv()

# Restrict this process to the GPUs listed in AVAILABLE_DEVICES (e.g. "1,2").
# This is done before the first CUDA query below so that the restriction is
# in place when CUDA is first used.
# If AVAILABLE_DEVICES is not defined, keep whatever CUDA_VISIBLE_DEVICES the
# launcher already exported instead of crashing on `os.environ[...] = None`.
available_devices = os.getenv("AVAILABLE_DEVICES")
if available_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = available_devices
else:
    print("AVAILABLE_DEVICES is not set; leaving CUDA_VISIBLE_DEVICES unchanged.")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# `.get` avoids a KeyError when no device restriction is configured at all.
print(os.environ.get("CUDA_VISIBLE_DEVICES", "<CUDA_VISIBLE_DEVICES not set>"))

# Report the torch / CUDA / cuDNN build and whether a GPU is usable.
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print(torch.cuda.is_available())

# Pick the device used for the reward models further down.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count():d} GPU(s) available.")
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Accelerator supplies `process_index`, which later cells use as device_map target.
accelerator = Accelerator()

1,2
2.7.1+cu126
12.6
90501
True
There are 2 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not defined. This replaces the opaque
            `int(None)` / `None + "_"` TypeErrors (or a silent `None`) that a
            missing variable would otherwise cause further down.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in the environment or in the .env file."
        )
    return value


# Model identifiers and adapter locations.
MODEL = _require_env("GENERATION_MODEL_NAME")
ALGORITHM = _require_env("RL_ALGORITHM")
REWARD_MODEL = _require_env("REWARD_MODEL_NAME")
REWARD_MODEL_EXTRACTION_LORA = _require_env("REWARD_MODEL_EXTRACTION_LORA")
REWARD_MODEL_DETECTION_LORA = _require_env("REWARD_MODEL_DETECTION_LORA")

# Only these two values are handled by the trainer-setup cells below; any other
# value would skip both branches and leave `trainer` undefined.
if ALGORITHM not in ("GRPO", "PPO"):
    raise ValueError(f"RL_ALGORITHM must be 'GRPO' or 'PPO', got {ALGORITHM!r}")

# Reward-function settings.
RL_TOKENIZATION = "best_window"
MAX_LENGTH = int(_require_env("RL_MAX_LENGTH"))
STRIDE = int(_require_env("RL_STRIDE"))
PROMPT_DATASET = _require_env("PROMPT_DATASET_CSV")
DETECTION_DIFFERENCE = int(_require_env("DETECTION_DIFFERENCE"))
WEIGHT_EXTRACTION = float(_require_env("WEIGHT_EXTRACTION"))
WEIGHT_DETECTION = float(_require_env("WEIGHT_DETECTION"))

# Output directory is suffixed with the algorithm name.
RL_TRAINING_FILES = _require_env("RL_TRAINING_FILES") + "_" + ALGORITHM

## Load prompt dataset

In [3]:
# Read the semicolon-delimited prompt table and wrap it in a Hugging Face Dataset.
prompt_df = pd.read_csv(PROMPT_DATASET, sep=";")
dataset = Dataset.from_pandas(prompt_df)

#TODO: test whether everything is well-separated
# First cut: 70 % train / 30 % held out.
train_test_split = dataset.train_test_split(seed=42, test_size=0.3)
# Second cut: halve the held-out part into validation and test.
eval_test_split = train_test_split["test"].train_test_split(seed=42, test_size=0.5)

final_splits = DatasetDict(
    {
        "train": train_test_split["train"],
        "validation": eval_test_split["train"],
        "test": eval_test_split["test"],
    }
)

In [4]:
# Sanity check: DataFrame columns, row count, and the Dataset's column names.
for summary in (prompt_df.columns, len(prompt_df), dataset.column_names):
    print(summary)

Index(['prompt', 'precondition_texts', 'precondition_positions'], dtype='object')
26
['prompt', 'precondition_texts', 'precondition_positions']


In [5]:
#TODO: do train test eval split

## Load the model

### Dataset columns

1. prompt
2. precondition_texts --> dict with key: id, value: text
3. precondition_positions --> dict with key: id, value: position

Consider whether any other columns are needed.
Open question: either iterate through all preconditions to compute the reward for a response, or return several rewards per response and fine-tune the model on each one. Iterating should be good enough; the reward function just needs to be defined accordingly.

In [6]:
# Load the base model in 4-bit NF4 with bfloat16 compute, placed on this
# process's device index.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map={"": accelerator.process_index},
    torch_dtype=torch.bfloat16,
    quantization_config={
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
    },
)

tokenizer = AutoTokenizer.from_pretrained(MODEL, truncation=False, padding=False)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

qlora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # depends on the model architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

base_model.gradient_checkpointing_enable()

# Prepare for QLoRA fine-tuning
base_model = prepare_model_for_kbit_training(base_model)

# Apply QLoRA: from here on `policy_model` is a PeftModel.
policy_model = get_peft_model(base_model, qlora_config)

print(f"tokenizer ma length: {tokenizer.model_max_length}")

# --- Reward models -----------------------------------------------------------
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL)

# Each LoRA adapter gets its OWN copy of the base classifier.
# PeftModel.from_pretrained injects the adapter layers into the base model it
# is given, so wrapping one shared base object twice would make the extraction
# and detection models share (and overwrite) the same adapter modules.
# NOTE(review): the load log reports the classifier head as newly initialised;
# confirm that both adapter checkpoints store the trained head, otherwise the
# scores come from a random linear layer.
reward_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL, num_labels=1)
extraction_model = PeftModel.from_pretrained(reward_model, REWARD_MODEL_EXTRACTION_LORA).to(device)

detection_base_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL, num_labels=1)
detection_model = PeftModel.from_pretrained(detection_base_model, REWARD_MODEL_DETECTION_LORA).to(device)

# Create the custom reward function
reward_function = CustomRewardFunction(
    extraction_model,
    detection_model,
    reward_tokenizer,
    MAX_LENGTH,
    STRIDE,
    RL_TOKENIZATION,
    device,
    weight_extraction=WEIGHT_EXTRACTION,
    weight_detection=WEIGHT_DETECTION,
    detection_difference=DETECTION_DIFFERENCE,
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:11<00:00,  3.86s/it]


tokenizer ma length: 1000000000000000019884624838656


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## LoRA config and wandb init

In [7]:
# LoRA settings handed to the trainers via `peft_config`: rank 8, scaling 16,
# 10 % dropout, no bias training. No target modules are listed here.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Experiment tracking is currently disabled:
# wandb.init(project="RL-preconditions", name="grpo-run-test")

# GRPO Training Setup

In [8]:
if ALGORITHM == "GRPO":

    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the lowest eval loss at the end of training.
    training_args = GRPOConfig(
        output_dir=RL_TRAINING_FILES,
        logging_steps=1,
        logging_dir="logs",
        report_to="wandb",
        gradient_checkpointing=True,
        learning_rate=1e-5,
        num_train_epochs=10,
        weight_decay=0.01,
        warmup_steps=5,  # TODO: check if this makes any sense at all
        gradient_accumulation_steps=3,  # TODO: think about whether this is truly necessary
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # Initialize GRPO trainer.
    # `policy_model` is already a PeftModel (it was wrapped with
    # get_peft_model when the model was loaded), so no `peft_config` is passed:
    # supplying one as well would ask the trainer to apply PEFT a second time,
    # with a different LoRA configuration, on top of the existing adapter.
    trainer = GRPOTrainer(
        model=policy_model,
        reward_funcs=reward_function,
        train_dataset=final_splits['train'],
        eval_dataset=final_splits['validation'],
        args=training_args,
    )

#TODO: maybe get a learning rate scheduler for this...



## Code from Huggingface TRL

## Notes:

1. Do not use SLURM: this is a single-node, multi-GPU setup, and SLURM is meant for scheduling training on a multi-node cluster --> use `accelerate` instead.

2. Install `transformers`, `accelerate`, `deepspeed`, and `trl`.

3. See the DAPO paper for some hyperparameter settings; the DeepSeekMath paper is a good reference for hyperparameter settings as well.

# PPO Training Setup

In [9]:
# TODO: initialize proper weights here
# Show the torch device the reward models were moved to.
print(f"{device}")

cuda


In [10]:
#TODO: Prepare dataset here
# Need to tokenize to use for PPO

def tokenize_and_keep_original(example):
    """Tokenize the "prompt" column of a (batched) example.

    Prompts are neither truncated nor padded. The former `max_length=2000`
    argument was dropped: with truncation and padding both disabled it had no
    effect and only suggested a length cap that was never applied.

    Despite the name, the original text is not copied into the result; it is
    carried along separately through the "additional_entries" column.

    Args:
        example: mapping with a "prompt" entry, as passed by `Dataset.map`.

    Returns:
        The tokenizer output for the prompt(s).
    """
    return tokenizer(example["prompt"], truncation=False, padding=False)

# Tokenize all prompts, handing the rows to the tokenizer in batches.
tokenized_dataset = dataset.map(function=tokenize_and_keep_original, batched=True)

# Create label column for this to be handled properly in PPO Trainer
def create_label(example):
    """Bundle the prompt and both precondition fields into one tuple.

    Returns a single-key dict so that `Dataset.map` adds the bundle as the
    "additional_entries" column.
    """
    bundled_fields = tuple(
        example[column]
        for column in ("prompt", "precondition_texts", "precondition_positions")
    )
    return {"additional_entries": bundled_fields}

# Add the bundled label column, then drop the raw columns it was built from.
tokenized_dataset = tokenized_dataset.map(create_label).remove_columns(
    ["prompt", "precondition_texts", "precondition_positions"]
)
print(tokenized_dataset[0].keys())

#TODO: test whether everything is well-separated
# Same 70 / 15 / 15 split and seed as for the untokenized dataset.
train_test_split_PPO = tokenized_dataset.train_test_split(seed=42, test_size=0.3)
eval_test_split_PPO = train_test_split_PPO["test"].train_test_split(seed=42, test_size=0.5)

final_splits_PPO = DatasetDict(
    {
        "train": train_test_split_PPO["train"],
        "validation": eval_test_split_PPO["train"],
        "test": eval_test_split_PPO["test"],
    }
)

# use own data collator that does not pad label column
data_collator = LabelPreservingCollator(tokenizer)

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map: 100%|██████████| 26/26 [00:00<00:00, 381.25 examples/s]
Map: 100%|██████████| 26/26 [00:00<00:00, 4473.74 examples/s]

dict_keys(['input_ids', 'attention_mask', 'additional_entries'])





### Value model and ref_model

In [11]:
if ALGORITHM == "PPO":
    # Build the PPO-specific models: a reference copy of the policy, the policy
    # with a value head attached, and the PPO variant of the reward function.

    # The reference model must be created BEFORE `policy_model` is rebound to
    # the value-head wrapper below, so that it is derived from the plain policy.
    ref_model = create_reference_model(policy_model)
    # ref_model.to(model.device)
    # load the value model with same peft setup as the policy model
    
    # Wrap the policy with a value head; the same object is later passed to the
    # trainer as both `model` and `value_model`.
    policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(policy_model, 
                                                                    peft_config=qlora_config, 
                                                                    device_map={"": accelerator.process_index},  # For GPU/TPU acceleration
                                                                    )
    # NOTE(review): presumably the trainer looks up the wrapped language model
    # through `base_model_prefix` -- confirm against CustomPPOTrainer.
    policy_model.base_model_prefix = "pretrained_model"

    def score(self, hidden_states):
        """Apply the value head to `hidden_states` and drop the last dimension."""
        return self.v_head(hidden_states).squeeze(-1)

    # Bind `score` as a method on this one instance.
    # NOTE(review): presumably the trainer calls `.score(...)` on the value
    # model -- confirm against CustomPPOTrainer.
    policy_model.score = MethodType(score, policy_model)

    #TODO: use accelerator.process_index here maybe

    reward_function_PPO = CustomRewardFunctionPPOTrainer(extraction_model, 
                                                         detection_model, 
                                                         reward_tokenizer, 
                                                         MAX_LENGTH, 
                                                         STRIDE, 
                                                         RL_TOKENIZATION, 
                                                         device, 
                                                         weight_extraction=WEIGHT_EXTRACTION, 
                                                         weight_detection=WEIGHT_DETECTION, 
                                                         detection_difference=DETECTION_DIFFERENCE)


In [None]:
if ALGORITHM == "PPO":

    # Evaluate every 100 steps, checkpoint every 1000 steps (keeping the two
    # most recent), and reload the checkpoint with the lowest eval loss at the end.
    # Untested alternatives kept for reference:
    #   local_rollout_forward_batch_size=2000, per_device_train_batch_size=128,
    #   local_mini_batch_size=2000, response_length=1000
    training_args_PPO = PPOConfig(
        output_dir=RL_TRAINING_FILES,
        logging_dir="logs",
        logging_steps=10,
        report_to="wandb",
        learning_rate=1e-5,
        weight_decay=0.01,
        warmup_steps=100,
        num_train_epochs=3,
        gradient_checkpointing=True,
        response_length=3200,
        eval_strategy="steps",
        eval_steps=100,
        save_steps=1000,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # Initialize the PPO trainer. The value-head policy serves as both the
    # policy (`model`) and the `value_model`.
    # (unused option kept for reference: collator_max_length=2000)
    trainer = CustomPPOTrainer(
        args=training_args_PPO,
        processing_class=tokenizer,
        model=policy_model,
        ref_model=ref_model,
        value_model=policy_model,
        reward_func=reward_function_PPO,
        train_dataset=final_splits_PPO['train'],
        eval_dataset=final_splits_PPO['validation'],
        data_collator=data_collator,
        peft_config=lora_config,
    )

Ideas for replacing the reward model with an actual reward function:

1. Pass a stub reward model, but make sure it is never actually used.
2. Override the `get_reward` function.

# Training and Evaluation

In [13]:
#TODO: add metrics to evaluate training, e.g. reward, KL divergence (how much the fine-tuned model differs from the original one), entropy of the policy (exploration versus exploitation), and sampled outputs

In [14]:
# Train
# `trainer` is created by whichever ALGORITHM branch ran above (GRPO or PPO);
# if ALGORITHM matched neither branch, the name is undefined at this point.
trainer.train()

===training policy===


[34m[1mwandb[0m: Currently logged in as: [33mjacques-furst123[0m ([33mjacques-furst123-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Query as passed into reward model: ['\n\n\n                --- Definitie ---\n\n                Preconditie: Een preconditie beschrijft de omstandigheden waaronder de handeling wettelijk kan worden uitgevoerd.\n                Act: Een act kan worden uitgevoerd door een agent binnen het normatieve systeem dat wordt gedefinieerd door het juridische document.\n                Fact: Fact frames beschrijven zaken waarvan de aanwezigheid of waarheidswaarde de toestand van het normatieve systeem kenmerkt. \n\n                \n\n                \n\n                                --- Gedachteketen ---\n\n                                1. Zoek alle vermeldingen van de fact in de tekst.\n                                2. Zoek in de artikelen waarin de fact wordt genoemd naar specifieke subfacts voor de fact. \n                                3. Zoek ook naar specifieke verwijzingen naar andere artikelen waarin mogelijk andere subfacts voor de fact worden genoemd.\n                           

IndentationError: unexpected indent (<unknown>, line 4)

In [None]:
#TODO: training and validation loss are near zero, need to debug