# TODOS

1. Find a good configuration for GRPO.
2. Add a LoRA layer to the model.
3. Add a PPO trainer.

# Reward model training

In [None]:
import torch
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, AutoModelForCausalLM
from trl import PPOTrainer, PPOConfig, create_reference_model
from trl import GRPOTrainer, GRPOConfig

from trl import RewardTrainer, RewardConfig

from peft import LoraConfig, get_peft_model, PeftModel

from tqdm import tqdm

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Restrict this process to the GPUs listed in AVAILABLE_DEVICES.
# os.environ only accepts strings, so assigning the None that os.getenv
# returns for a missing variable would raise TypeError; in that case the
# inherited CUDA_VISIBLE_DEVICES is left untouched.
available_devices = os.getenv("AVAILABLE_DEVICES")
if available_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = available_devices
else:
    print("AVAILABLE_DEVICES is not set; leaving CUDA_VISIBLE_DEVICES unchanged.")

# Enable expandable CUDA segments
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Select the torch device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count():d} GPU(s) available.")
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

In [None]:
# Model and LoRA adapter settings read from the environment; each name is
# None when its variable is not set.
(
    MODEL,
    REWARD_MODEL,
    REWARD_MODEL_EXTRACTION_LORA,
    REWARD_MODEL_DETECTION_LORA,
) = map(
    os.getenv,
    (
        "GENERATION_MODEL",
        "REWARD_MODEL_NAME",
        "REWARD_MODEL_EXTRACTION_LORA",
        "REWARD_MODEL_DETECTION_LORA",
    ),
)

# RL Training loop

## Load the model

In [None]:
# reward function definition

# TODO: per prompt and response, iterate through the preconditions/subfacts and
# their positions and add up the rewards for all of them.


# Example: Get reward for a response
def reward_extraction(prompt, response):
    """Score one (prompt, response) pair with the extraction reward model.

    Args:
        prompt: Prompt text, passed to the tokenizer as the first sequence.
        response: Response text, passed to the tokenizer as the second sequence.

    Returns:
        The model's single output logit as a Python float.
    """
    # NOTE(review): TRL's GRPOTrainer calls reward functions with keyword
    # arguments `prompts=` / `completions=` (batches) and expects a list of
    # floats back. This single-pair helper needs a batched wrapper before it
    # can be passed in `reward_funcs` - confirm against the TRL version in use.
    inputs = tokenizer(prompt, response, return_tensors="pt", truncation=True)
    # Put the inputs on the same device as the model weights.
    inputs = inputs.to(extraction_model.device)
    # Scoring is inference only: skip building the autograd graph.
    with torch.no_grad():
        # `reward_model` was never assigned in this notebook; the extraction
        # reward model is loaded as `extraction_model`.
        return extraction_model(**inputs).logits.item()

In [None]:
# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",  # For GPU/TPU acceleration
    torch_dtype="auto",  # Optimize precision
)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the reward models (feedback extraction and feedback detection).
#
# Each LoRA adapter is merged into its OWN copy of the base classifier.
# PeftModel.from_pretrained injects the adapter into the model it is given and
# merge_and_unload() folds the adapter weights into that same model in place,
# so reusing one base for both adapters would leave `detection_model` with both
# adapters merged and `extraction_model` pointing at the very same object.
#
# NOTE(review): REWARD_MODEL is read from the environment but the reward base
# is loaded from MODEL - confirm the adapters were trained on MODEL.
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

extraction_model = PeftModel.from_pretrained(base_model, REWARD_MODEL_EXTRACTION_LORA)
extraction_model = extraction_model.merge_and_unload()

# Fresh, unmodified base weights for the second adapter.
detection_base_model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=1)

detection_model = PeftModel.from_pretrained(detection_base_model, REWARD_MODEL_DETECTION_LORA)
detection_model = detection_model.merge_and_unload()

In [None]:
# Define GRPO config
# Hyperparameters intended for GRPO. GRPOTrainer does not accept these as
# keyword arguments, so the entries that have a GRPOConfig counterpart are
# forwarded through GRPOConfig below.
grpo_config = {
    "batch_size": 32,
    "learning_rate": 1e-5,
    "gamma": 0.99,  # Discount factor; no GRPOConfig counterpart, kept for reference only
    "clip_range": 0.2,  # Similar to PPO; forwarded as GRPOConfig.epsilon
}

training_args = GRPOConfig(
    output_dir="Mistral-7B-Instruct-v0.3-GRPO",
    logging_steps=10,
    gradient_checkpointing=True,
    learning_rate=grpo_config["learning_rate"],
    # assumes "batch_size" is meant per device - TODO confirm
    per_device_train_batch_size=grpo_config["batch_size"],
    epsilon=grpo_config["clip_range"],
    # One weight per entry in `reward_funcs`; this is a GRPOConfig field,
    # not a GRPOTrainer argument.
    reward_weights=[1.0, 1.0],
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir="logs",
    save_steps=1000,
    save_total_limit=2,
    # `evaluation_strategy` is the deprecated spelling of this argument.
    # NOTE(review): step-wise evaluation and load_best_model_at_end need an
    # `eval_dataset` on the trainer; none is defined in the visible cells.
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Initialize GRPO trainer
# NOTE(review): `reward__detection` and `prompts_dataset` are not defined in
# the visible cells - define them (or correct the names) before running.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,  # GRPOTrainer takes the tokenizer as `processing_class`
    reward_funcs=[reward_extraction, reward__detection],
    args=training_args,
    train_dataset=prompts_dataset,
)

# Train
# The dataset is supplied to the trainer above; the first positional argument
# of train() is `resume_from_checkpoint`, not a dataset.
trainer.train()

## Code from Huggingface TRL

## Notes:

1. Do not use SLURM: this is a single-node, multi-GPU setup, whereas SLURM is meant for scheduling training jobs on a multi-node cluster. Use `accelerate` instead.

2. Install `transformers`, `accelerate`, `deepspeed`, and `trl`.

3. See the DAPO paper for some hyperparameter settings; the DeepSeekMath paper is also a good source for hyperparameter settings.

# PPO Training