# RL Training

In [None]:
import torch
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from trl import PPOTrainer, PPOConfig, create_reference_model
from trl import GRPOTrainer, GRPOConfig
from datasets import DatasetDict

from trl import RewardTrainer, RewardConfig

from peft import LoraConfig, get_peft_model, PeftModel

from tqdm import tqdm

import os
from dotenv import load_dotenv
from utils import CustomRewardFunction
import pandas as pd
# import sys
import wandb

# # Add the parent directory to the Python path
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Load environment variables from .env file so that the os.getenv() lookups
# used for device selection and run configuration below can find their values
load_dotenv()

# Load the relevant devices available on the server.
# os.environ only accepts strings, so assigning the None that os.getenv() returns
# for an unset variable would raise a TypeError. When AVAILABLE_DEVICES is not
# configured, the current CUDA_VISIBLE_DEVICES setting is left untouched.
available_devices = os.getenv("AVAILABLE_DEVICES")
if available_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = available_devices

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")

if not cuda_ready:
    print("CUDA is not available. Using CPU.")
else:
    # Report how many GPUs are visible and the name of the first one
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
CUDA is available. Using GPU: NVIDIA L40S


In [2]:
def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set, so that a missing .env entry
            is reported by name instead of surfacing later as an opaque
            TypeError from int(None), float(None) or None + "_".
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Required environment variable '{name}' is not set; add it to the .env file.")
    return value


# Generation (policy) model and the RL algorithm to run
MODEL = _require_env("GENERATION_MODEL_NAME")
ALGORITHM = _require_env("RL_ALGORITHM")
# Only "GRPO" and "PPO" have a trainer set up in this notebook; any other value
# would leave `trainer` undefined by the time training is started.
if ALGORITHM not in ("GRPO", "PPO"):
    raise ValueError(f"RL_ALGORITHM must be 'GRPO' or 'PPO', got '{ALGORITHM}'.")

# Reward model checkpoint and its two LoRA adapters
REWARD_MODEL = _require_env("REWARD_MODEL_NAME")
REWARD_MODEL_EXTRACTION_LORA = _require_env("REWARD_MODEL_EXTRACTION_LORA")
REWARD_MODEL_DETECTION_LORA = _require_env("REWARD_MODEL_DETECTION_LORA")

# Tokenization settings handed to the custom reward function
RL_TOKENIZATION = "best_window"
MAX_LENGTH = int(_require_env("RL_MAX_LENGTH"))
STRIDE = int(_require_env("RL_STRIDE"))

# Path of the semicolon-separated prompt CSV
PROMPT_DATASET = _require_env("PROMPT_DATASET_CSV")

# Reward shaping parameters handed to the custom reward function
DETECTION_DIFFERENCE = int(_require_env("DETECTION_DIFFERENCE"))
WEIGHT_EXTRACTION = float(_require_env("WEIGHT_EXTRACTION"))
WEIGHT_DETECTION = float(_require_env("WEIGHT_DETECTION"))

# Trainer output directory, suffixed with the algorithm name
RL_TRAINING_FILES = _require_env("RL_TRAINING_FILES") + "_" + ALGORITHM

## Load prompt dataset

In [3]:
# Read the semicolon-separated prompt CSV and wrap it in a Hugging Face Dataset
prompt_df = pd.read_csv(PROMPT_DATASET, sep=";")
dataset = Dataset.from_pandas(prompt_df)

#TODO: test whether everything is well-separated
# Two-stage split with a fixed seed: first hold out 30% of the prompts, then
# cut the held-out part in half to obtain the validation and test sets.
train_test_split = dataset.train_test_split(test_size=0.3, seed=42)
held_out = train_test_split["test"]
eval_test_split = held_out.train_test_split(test_size=0.5, seed=42)

# Collect the three partitions under their conventional names
split_parts = {
    'train': train_test_split['train'],
    'validation': eval_test_split['train'],
    'test': eval_test_split['test'],
}
final_splits = DatasetDict(split_parts)

In [4]:
# Show the column index and the number of prompts, one per line
print(prompt_df.columns, len(prompt_df), sep="\n")

Index(['prompt', 'precondition_texts', 'precondition_positions'], dtype='object')
26


In [5]:
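#TODO: do train test eval split (a first version now exists in the "Load prompt dataset" cell above; remove this note once that split has been verified)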

## Load the model

### Dataset columns

1. `prompt`
2. `precondition_texts` --> dict with key: id, value: text
3. `precondition_positions` --> dict with key: id, value: position

Open question: are any other columns needed?
To compute the reward we can either iterate through all preconditions of a response and combine them into one reward, or return several rewards per response and fine-tune the model on each one. Iterating should be good enough; the reward function just needs to be defined accordingly.

In [6]:
# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",   # For GPU/TPU acceleration
    torch_dtype="auto",  # Optimize precision
)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the reward models (feedback extraction and detection).
# PeftModel.from_pretrained injects the adapter layers into the model it is given
# in place, so each adapter gets its own copy of the base model. Loading both
# adapters onto one shared instance would make the second load overwrite the
# first, leaving the two reward models with the same adapter weights.
base_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL, num_labels=1)
detection_base_model = AutoModelForSequenceClassification.from_pretrained(REWARD_MODEL, num_labels=1)
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL)

extraction_model = PeftModel.from_pretrained(base_model, REWARD_MODEL_EXTRACTION_LORA).to(device)
# extraction_model = extraction_model.merge_and_unload()

detection_model = PeftModel.from_pretrained(detection_base_model, REWARD_MODEL_DETECTION_LORA).to(device)
# detection_model = detection_model.merge_and_unload()


# Create the custom reward function that combines both reward models
reward_function = CustomRewardFunction(
    extraction_model,
    detection_model,
    reward_tokenizer,
    MAX_LENGTH,
    STRIDE,
    RL_TOKENIZATION,
    device,
    weight_extraction=WEIGHT_EXTRACTION,
    weight_detection=WEIGHT_DETECTION,
    detection_difference=DETECTION_DIFFERENCE,
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.38it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## LoRA config

In [7]:
# LoRA adapter settings for the causal-LM policy, gathered in one mapping
lora_settings = {
    "r": 8,                    # rank of the low-rank update matrices
    "lora_alpha": 16,          # scaling factor applied to the LoRA update
    "lora_dropout": 0.1,       # dropout on the LoRA layers
    "bias": "none",            # leave bias parameters untouched
    "task_type": 'CAUSAL_LM',
}
lora_config = LoraConfig(**lora_settings)

# GRPO Training Setup

In [None]:
if ALGORITHM == "GRPO":

    # Hyperparameters for the GRPO run, gathered in one mapping before the
    # config object is built from them
    grpo_settings = dict(
        output_dir=RL_TRAINING_FILES,
        logging_dir="logs",
        logging_steps=1,
        report_to="wandb",
        # optimisation
        learning_rate=1e-5,
        weight_decay=0.01,
        warmup_steps=5,  # TODO:check if this makes any sense at all
        num_train_epochs=10,
        gradient_checkpointing=True,
        gradient_accumulation_steps=3,  #TODO: think about whether this is truly necessary
        # evaluate and checkpoint once per epoch, then restore the best checkpoint
        # (currently unused options: save_steps=1, save_total_limit=2, eval_steps=1, batch_size=2)
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )
    training_args = GRPOConfig(**grpo_settings)

    # Build the GRPO trainer: the policy is wrapped with LoRA via peft_config
    # and scored by the custom reward function
    trainer = GRPOTrainer(
        args=training_args,
        model=model,
        peft_config=lora_config,
        reward_funcs=reward_function,
        train_dataset=final_splits['train'],
        eval_dataset=final_splits['validation'],
    )

#TODO: maybe get a learning rate scheduler for this...



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Code from Huggingface TRL

## Notes:

1. Do not use SLURM: this is a single-node, multi-GPU setting, and SLURM is meant for scheduled training on a multi-node cluster --> use `accelerate` instead.

2. Install `transformers`, `accelerate`, `deepspeed` and `trl`.

3. See the DAPO paper for some hyperparameter settings; the DeepSeekMath paper is a good source for hyperparameter settings as well.

# PPO Training Setup

In [9]:
if ALGORITHM == "PPO":

    # Separate copy of the starting policy, used by PPO as the reference model
    ref_model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        device_map="auto",   # For GPU/TPU acceleration
        torch_dtype="auto",  # Optimize precision
    )

    # Freeze all parameters of the reference model
    for param in ref_model.parameters():
        param.requires_grad = False

    # Load the value model as its own network. get_peft_model() injects the LoRA
    # layers into the model it receives in place, so building the value model from
    # `model` would wrap the policy itself (which PPOTrainer wraps again through
    # peft_config) and make policy and value model one and the same network.
    # TRL's PPOTrainer reads a `.score` head from the value model, hence a
    # sequence-classification model with a single output.
    # NOTE(review): this value model is trained without LoRA; if memory is tight it
    # may need its own adapter -- confirm against the installed TRL version.
    value_model = AutoModelForSequenceClassification.from_pretrained(
        MODEL,
        num_labels=1,
        device_map="auto",
        torch_dtype="auto",
    )

    training_args = PPOConfig(
        output_dir=RL_TRAINING_FILES,
        logging_steps=10,
        gradient_checkpointing=True,
        learning_rate=1e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_steps=100,
        logging_dir="logs",
        save_steps=1000,
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        )

    # Initialize PPO trainer
    # NOTE(review): PPOTrainer presumably expects `reward_model` to be a torch module
    # with a scoring head; CustomRewardFunction comes from utils and is not visible
    # here -- confirm it satisfies that interface before running this branch.
    trainer = PPOTrainer(
        model=model,
        processing_class=tokenizer,  # required by PPOTrainer to handle the policy's text
        reward_model=reward_function,
        train_dataset=final_splits['train'],
        eval_dataset=final_splits['validation'],
        args=training_args,
        ref_model=ref_model,
        value_model=value_model,
        peft_config=lora_config
    )

# Training and Evaluation

In [None]:
#TODO: add metrics to evaluate training: reward, KL divergence (how much the fine-tuned model differs from the original one), entropy of the policy (exploration versus exploitation), and sampled outputs

In [10]:
# Run the RL fine-tuning loop and show the returned training summary as the cell result
train_output = trainer.train()
train_output

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,0.0,0.007026
2,-0.0566,0.012379
3,-0.1226,-0.069821
4,0.0005,-0.024028
5,0.0,0.003849
6,-0.0288,6.5e-05
7,0.0018,-0.016823
8,-0.0078,0.002413
9,-0.042,-0.013771
10,-0.0948,-0.000824


Token indices sequence length is longer than the specified maximum sequence length for this model (628 > 512). Running this sequence through the model will result in indexing errors


TrainOutput(global_step=60, training_loss=-0.04164725281298161, metrics={'train_runtime': 2227.4841, 'train_samples_per_second': 0.081, 'train_steps_per_second': 0.027, 'total_flos': 0.0, 'train_loss': -0.04164725281298161})

In [11]:
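#TODO: training and validation loss are near zero, need to debug. NOTE(review): a near-zero loss may be normal for GRPO, because advantages are normalised within each group of completions; check the logged reward and KL before treating this as a bug (verify against the TRL GRPO documentation).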