In [None]:
!pip install -qq transformers
!pip install -qq datasets
!pip install -qq accelerate
!pip install -qq bitsandbytes
!pip install -qq peft
!pip install -qq trl
!pip install -qq evaluate
!pip install -qq rouge_score
!pip install -qq jiwer
!pip install -qq wandb
!pip install -qq tensorboard
!pip install -qq gradio


In [None]:
# !unzip /content/rlhf_dataset.zip

In [None]:
import os


os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
import random
import numpy as np

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [None]:
from huggingface_hub.hf_api import HfFolder
import os
# from google.colab import userdata

# os.environ["HF_TOKEN"] = userdata.get("HUGGINGFACE_TOKEN")
os.environ["HF_TOKEN"] = "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"
HfFolder.save_token(os.environ["HF_TOKEN"])


In [None]:
import os
import wandb


# os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")
os.environ["WANDB_API_KEY"] = "2be7c86a28a2bcbeccdfa66844abfdd19b9bdabf"
wandb.login(key=os.environ["WANDB_API_KEY"])

wandb_project = "RLHF_TinyLlama"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project


In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
import gc

def clear_gpu():
    torch.clear_autocast_cache()
    torch.cuda.ipc_collect()
    torch.cuda.empty_cache()
    gc.collect()

if device == "gpu":
    clear_gpu()

In [None]:

import multiprocessing

def optimal_workers():
    num_cpus = multiprocessing.cpu_count()
    num_gpus = torch.cuda.device_count()
    optimal_value = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
    return optimal_value

num_cpu_workers = optimal_workers()
num_cpu_workers

In [None]:
import torch

def set_seeds(seed: int=2):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seeds(2)

## Step 1 - train policy model for human evaluation

In [None]:
import pandas as pd


#df = pd.read_parquet("/content/rlhf_dataset/train_policy.parquet")
df = pd.read_parquet("/kaggle/input/rlhf-dataset/train_policy.parquet")
df.head()

In [None]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(df, split='train')
train_dataset

In [None]:
import torch
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class CustomDataCollator:
    def __init__(self, tokenizer, max_length, prompt_col, label_col):
        self.__tokenizer = tokenizer
        self.__max_length = max_length
        self.__prompt_col = prompt_col
        self.__label_col = label_col

    def __call__(self, samples: List[dict]):
        prompt_text = [str(s[self.__prompt_col]) for s in samples]
        label_text = [str(s[self.__label_col]) for s in samples]

        prompt_tokens = self.__tokenizer(prompt_text,
                                         return_tensors='pt',
                                         truncation=True,
                                         max_length=self.__max_length,
                                         padding="max_length")

        label_tokens = self.__tokenizer(label_text,
                                        return_tensors='pt',
                                        truncation=True,
                                        max_length=self.__max_length,
                                        padding="max_length")

        output_dict = dict()
        output_dict["input_ids"] = prompt_tokens['input_ids']
        output_dict["attention_mask"] = prompt_tokens['attention_mask']
        output_dict["labels"] = label_tokens['input_ids']
        return output_dict

In [None]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
with_quantization_config = False if device == "cpu" else True
max_length = 512

In [None]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
custom_data_collator = CustomDataCollator(tokenizer=tokenizer,
                                          max_length=max_length,
                                          prompt_col="prompt",
                                          label_col="label")

In [None]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config= bnb_config if with_quantization_config else None,
                                             device_map="auto",
                                             trust_remote_code=True,
                                             # torch_dtype=torch.float16
                                             )
model.config.pad_token_id = model.config.eos_token_id

In [None]:
import bitsandbytes as bnb
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if "lm_head" in lora_module_names: # Needed for 16bit
            lora_module_names.remove("lm_head")

    return list(lora_module_names)

def lora_peft_model(model, TaskType):
    target_modules = find_all_linear_names(model)

    peft_config = LoraConfig(r=128,  # dimension of the updated matrices
                             lora_alpha=32,  # parameter for scaling
                             target_modules=target_modules,  # this chooses on which layers QLoRA is applied
                             lora_dropout=0.05,  # dropout probability for layers
                             bias="none",
                             task_type=TaskType)


    ## model.enable_input_require_grads()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model

In [None]:
model = lora_peft_model(model, TaskType.CAUSAL_LM)

In [None]:
from transformers import TrainingArguments


output_dir = "tiny_llama_rlhf_policy_model"

training_arguments = TrainingArguments(output_dir=output_dir,
                                       learning_rate=1e-5,
                                       per_device_train_batch_size=16,
                                       fp16=False,
                                       gradient_accumulation_steps=1,
                                       save_strategy="steps",
                                       warmup_steps=50,
                                       logging_steps=20,
                                       max_steps=20, ###########################
                                       report_to=["tensorboard"],
                                       remove_unused_columns=False,
                                       save_total_limit=1,
                                       )

In [None]:
from transformers import Trainer

trainer = Trainer(model=model,
                  args=training_arguments,
                  train_dataset=train_dataset,
                  data_collator=custom_data_collator)

In [None]:
trainer.train()

In [None]:
trainer.save_model("tiny_llama_rlhf_policy_model")
tokenizer.save_pretrained("tiny_llama_rlhf_policy_model")

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

fine_tunned_path = "tiny_llama_rlhf_policy_model"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto",
                                             torch_dtype=torch.float16)

merged_model= PeftModel.from_pretrained(model, fine_tunned_path)
merged_model = merged_model.merge_and_unload()

merged_model.save_pretrained("policy_merged_model", safe_serialiaztion=True)
tokenizer.save_pretrained("policy_merged_model", safe_serialiaztion=True)

### Restart The Kernel to save GPU Memory

## Step 2: train reward function

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
MODEL_PATH = "policy_merged_model"
with_quantization_config = False if device == "cpu" else True
max_length = 512

In [None]:
import pandas as pd

# df = pd.read_parquet("/content/rlhf_dataset/train.parquet")
df = pd.read_parquet("/kaggle/input/rlhf-dataset/train.parquet")
df.head()

In [None]:
from datasets import Dataset

raw_dataset = Dataset.from_pandas(df, split='train')
raw_dataset

In [None]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,
                                                          quantization_config= bnb_config if with_quantization_config else None,
                                                          device_map="auto",
                                                          trust_remote_code=True,
                                                          # torch_dtype=torch.float16
                                                          )
model.config.pad_token_id = model.config.eos_token_id

In [None]:
import bitsandbytes as bnb
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if "lm_head" in lora_module_names: # Needed for 16bit
            lora_module_names.remove("lm_head")

    return list(lora_module_names)

def lora_peft_model(model, task_type):
    target_modules = find_all_linear_names(model)

    peft_config = LoraConfig(r=128,  # dimension of the updated matrices
                             lora_alpha=32,  # parameter for scaling
                             target_modules=target_modules,  # this chooses on which layers QLoRA is applied
                             lora_dropout=0.05,  # dropout probability for layers
                             bias="none",
                             task_type=task_type)


    ## model.enable_input_require_grads()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model

In [None]:
model = lora_peft_model(model, TaskType.SEQ_CLS)

In [None]:
import torch
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class CustomDataCollator:
    def __init__(self, tokenizer, max_length, prompt_col, chosen_col, rejected_col):
        self.__tokenizer = tokenizer
        self.__max_length = max_length
        self.__prompt_col = prompt_col
        self.__chosen_col = chosen_col
        self.__rejected_col = rejected_col

    def __call__(self, samples: List[dict]):
        chosen_text = [str(s[self.__prompt_col] + '\n' + s[self.__chosen_col]) for s in samples]
        rejected_text = [str(s[self.__prompt_col] + '\n' + s[self.__rejected_col]) for s in samples]

        chosen_tokens = self.__tokenizer.encode_plus(
            chosen_text,
            return_tensors='pt',
            padding="max_length",
            max_length=self.__max_length,
            truncation=True)

        rejected_tokens = self.__tokenizer.encode_plus(
            rejected_text,
            return_tensors='pt',
            padding="max_length",
            max_length=self.__max_length,
            truncation=True)

        output_dict = dict()
        output_dict["input_ids_chosen"] = chosen_tokens['input_ids']
        output_dict["attention_mask_chosen"] = chosen_tokens['attention_mask']

        output_dict["input_ids_rejected"] = rejected_tokens['input_ids']
        output_dict["attention_mask_rejected"] = rejected_tokens['attention_mask']

        return output_dict

In [None]:
custom_data_collator = CustomDataCollator(tokenizer=tokenizer,
                                          max_length=max_length,
                                          prompt_col="prompt",
                                          chosen_col="chosen",
                                          rejected_col="rejected")

In [None]:
from transformers import TrainingArguments
from trl import RewardConfig

output_dir = "tiny_llama_rlhf_reward_model"

training_args = RewardConfig(output_dir=output_dir,
                            learning_rate=1e-5,
                            per_device_train_batch_size=2,
                            per_device_eval_batch_size=1,
                            gradient_accumulation_steps=1,
                            fp16=False,
                            save_strategy="steps",
                            logging_steps=20,
                            max_steps=20, ######################################
                            report_to=["tensorboard"],
                            remove_unused_columns=False,
                            save_total_limit=1)

In [None]:
from trl import RewardTrainer

trainer = RewardTrainer(model=model,
                        args=training_args,
                        tokenizer=tokenizer,
                        train_dataset=raw_dataset,
                        data_collator=custom_data_collator)

In [None]:
trainer.train()

In [None]:
trainer.save_model("tiny_llama_rlhf_reward_model")
tokenizer.save_pretrained("tiny_llama_rlhf_reward_model")

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
fine_tunned_path = "tiny_llama_rlhf_reward_model"

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="auto",
                                             torch_dtype=torch.float16)

merged_model= PeftModel.from_pretrained(model, fine_tunned_path)
merged_model = merged_model.merge_and_unload()

merged_model.save_pretrained("reward_merged_model", safe_serialiaztion=True)
tokenizer.save_pretrained("reward_merged_model", safe_serialiaztion=True)

In [None]:
def get_score(model, tokenizer, prompt, response):
    instructions = tokenizer.encode_plus(prompt,
                                         response,
                                         padding="max_length",
                                         max_length=max_length,
                                         return_tensors="pt",
                                         truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**instructions)
    logits = outputs[0]

    return logits

In [None]:
prompt = raw_dataset[2]["prompt"]
example_chosen_response = raw_dataset[2]["chosen"]
example_rejected_response = raw_dataset[2]["rejected"]

In [None]:
prompt

In [None]:
example_chosen_response

In [None]:
example_rejected_response

In [None]:
loss1 = get_score(model=model,
                  tokenizer=tokenizer,
                  prompt=prompt,
                  response=example_chosen_response)

loss2 = get_score(model=model,
                  tokenizer=tokenizer,
                  prompt=prompt,
                  response=example_rejected_response)

In [None]:
from torch import nn

loss = -nn.functional.logsigmoid(loss1 - loss2).mean()
loss

In [None]:
tokenizer.decode(torch.max(loss1, axis=-1).indices[0])

In [None]:
tokenizer.decode(torch.max(loss2, axis=-1).indices[0])

### Restart The Kernel to save GPU Memory

## Step 3: RL PPO model

In [1]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
MODEL_PATH = "reward_merged_model"
max_length = 512

In [3]:
import pandas as pd
from datasets import Dataset

# df = pd.read_parquet("/content/rlhf_dataset/train.parquet")
df = pd.read_parquet("/kaggle/input/rlhf-dataset/train.parquet")
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 92534
})

In [4]:
dataset = dataset.select(range(100))

In [5]:
from trl import PPOConfig


config = PPOConfig(model_name=MODEL_PATH,
                   steps=100,
                   learning_rate=1e-5,
                   mini_batch_size=4,
                   batch_size=16,
                   remove_unused_columns=False)



In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["prompt"],
                                           return_tensors="pt",
                                           truncation=True,
                                           padding="max_length",
                                           max_length=max_length)[0]
    return sample

dataset = dataset.map(tokenize, batched=False)
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
dataset.set_format("pytorch")


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [9]:
from trl import AutoModelForCausalLMWithValueHead
from transformers import BitsAndBytesConfig

with_quantization_config = False if device == "cpu" else True

bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_quant_type="nf4",
                                bnb_4bit_compute_dtype=torch.float16)


ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH,
                                                             quantization_config= bnb_config if with_quantization_config else None,
                                                             device_map="auto",
                                                             trust_remote_code=True,
                                                              # torch_dtype=torch.float16
                                                             )
ppo_model.config.pad_token_id = ppo_model.config.eos_token_id

In [10]:
from trl import create_reference_model

ref_model = create_reference_model(ppo_model)
print(ppo_model)

AutoModelForCausalLMWithValueHead(
  (pretrained_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 2048)
      (layers): ModuleList(
        (0-21): 22 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
            (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
            (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
            (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
            (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRM

In [11]:
import bitsandbytes as bnb

optimizer = bnb.optim.Adam8bit(ppo_model.parameters(), lr=config.learning_rate)

In [12]:
from trl import PPOTrainer


ppo_trainer = PPOTrainer(config=config,
                           model=ppo_model,
                           ref_model=ref_model,
                           tokenizer=tokenizer,
                           dataset=dataset,
                           data_collator=collator,
                           optimizer=optimizer)



In [13]:
generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

In [14]:
reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # want the raw logits without softmax.
    "batch_size": 8,
}

In [15]:
from trl.core import LengthSampler


output_min_length = 128
output_max_length = max_length
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [16]:
from transformers import pipeline


pipe = pipeline("text-classification",
                model=MODEL_PATH, 
                device=device,
               )

pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
pipe.model.config.pad_token_id = pipe.model.config.eos_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at reward_merged_model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from trl.core import LengthSampler


output_length_sampler = LengthSampler(124, max_length)

The operation is running if you see the following metrics appearing:

* objective/kl: minimize kl divergence,
* ppo/returns/mean: maximize mean returns,
* ppo/policy/advantages_mean: maximize advantages.

In [None]:
from tqdm import tqdm

max_ppo_steps = 10
label_idx = 0


for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break   

    prompt_tensors = batch["input_ids"]

    response_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()        
            
        generation_kwargs["max_new_tokens"] = max_new_tokens
        response = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        
        response_tensors.append(response.squeeze()[-max_new_tokens:])
        
    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
    rewards = pipe(query_response_pairs, **reward_kwargs)

    reward_tensors = [torch.tensor(reward[label_idx]["score"]) for reward in rewards]    

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, response_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))

0it [00:00, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
1it [03:45, 225.53s/it]

objective/kl: 0.0
ppo/returns/mean: 0.1757490634918213
ppo/policy/advantages_mean: 0.001481133047491312
---------------------------------------------------------------------------------------------------


2it [06:33, 191.66s/it]

objective/kl: -12.618038177490234
ppo/returns/mean: 0.5381687879562378
ppo/policy/advantages_mean: -0.0008931616321206093
---------------------------------------------------------------------------------------------------


In [None]:
batch_size = 16
compare_results = {}

df_batch = dataset[0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

response_tensors_ref = []
response_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len
    
    response = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    response_tensors_ref.append(response)

    response = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device), 
        **generation_kwargs
    ).squeeze()[-gen_len:]
    response_tensors.append(response)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(response_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(response_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[label_idx]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[label_idx]["score"] for reward in rewards_after]

In [None]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

In [None]:
ppo_model.save_pretrained("RLHF Model")
tokenizer.save_pretrained("RLHF Model")