## Imports

In [3]:
import torch
from tqdm import tqdm
import gc

tqdm.pandas()

from transformers import AutoTokenizer,LlamaTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, set_seed,create_reference_model


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/st-gorbatovski/.conda/envs/gorbatovski_env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/st-gorbatovski/.conda/envs/gorbatovski_env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


## Configs

In [4]:
# PPO run settings handed to the TRL trainer.
# NOTE(review): `model_name` is an absolute local path — presumably machine-specific; confirm before running elsewhere.
config = PPOConfig(
    model_name="/raid/models/llama-7b-hf",
    steps=20000,
    learning_rate=1.41e-5,
    # Tracker logging is switched off below.
    # NOTE(review): a later cell calls accelerator.get_tracker("wandb") — confirm that works with log_with disabled.
    # log_with="wandb",
    batch_size=64,
    optimize_cuda_cache=True,
    gradient_accumulation_steps=1,
    tracker_project_name='SO_LLAMA',
    max_grad_norm=1,
    ppo_epochs=4,  # default
    seed=42,  # applied later through set_seed(config.seed)
    remove_unused_columns=False,
    # The run name is also forwarded to save_checkpoint in the training loop.
    tracker_kwargs = {'name': 'ppo-train-bs_64-mbs_8'},
    forward_batch_size = 8,
)

# Dataset settings; passed to build_dataset in the "Build Dataset" section.
data_config = {
    'data_file_path': '/root/CQA_RLHF/data/1.0-data-div-ans-sep-api-usage.json',
    'max_length_promt': 256,
    'truncate_promt': True,
    'padding': False,
}

# Reward model identifier and the batch size used when scoring texts.
reward_config = dict(
    reward_model_name="Myashka/125M_GPTneo_reward_gen",
    batch_size=16,
)

# Checkpoint cadence (in dataloader batches) and output directory.
save_config = dict(
    save_interval=32,
    checkpoint_dir='/ckpts_ppo',
)

# Initial sampling settings; note that a later cell reassigns `generation_kwargs`.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    max_new_tokens=256,
)

# Attach the auxiliary settings dicts to the PPO config object so they are
# available wherever `config` is passed around.
config.data_config = data_config
config.save_config = save_config
# Must reference the sampling settings dict; assigning `config` here would make
# the attribute point back at the config object itself.
config.generation_kwargs = generation_kwargs
config.reward_config = reward_config



## Load pre-trained LLaMA LLM

In [3]:
from peft import LoraConfig, get_peft_model_state_dict

# LoRA adapter settings; handed to the model loader below via `peft_config`.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [4]:
# Load the policy from `config.model_name` in float16 with the LoRA config applied.
# NOTE(review): the device is pinned to "cuda:1", while a generation output
# further down reports tensors on cuda:0 — confirm which device is intended.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    torch_dtype=torch.float16,
    device_map="cuda:1",
    peft_config=lora_config,
)

Loading checkpoint shards: 100%|██████████| 33/33 [00:12<00:00,  2.74it/s]


In [10]:
# NOTE(review): a later cell calls gradient_checkpointing_enable() on
# `model.pretrained_model` instead of `model` — confirm which target is intended.
model.gradient_checkpointing_enable()

In [5]:
# Tokenizer is loaded from the same checkpoint path as the policy model.
tokenizer = LlamaTokenizer.from_pretrained(config.model_name)
# Pad on the left: padding tokens go before the prompt tokens (see the collated
# batch printed further down).
tokenizer.padding_side = 'left'

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [7]:
# Trainer without a dataset, used for the generation experiments below; no
# explicit reference model is supplied. `ppo_trainer` is rebuilt with a dataset
# in the "Initialize PPOTrainer" section.
ppo_trainer = PPOTrainer(config, model, ref_model=None, tokenizer=tokenizer)



In [8]:
# Sampling settings used for generation from here on; this replaces the
# `generation_kwargs` dict defined in the Configs section.
generation_kwargs = dict(
    temperature=0.9,
    top_k=50,
    top_p=0.9,
    do_sample=True,
    no_repeat_ngram_size=2,
    # use_cache=True,
    max_new_tokens=256,
)

In [14]:
# Inspect the token ids the tokenizer produces for a single prompt.
tokenizer('What is the best woman in the world? The answer is', return_tensors='pt')['input_ids'][0]

tensor([    1,  1724,   338,   278,  1900,  6114,   297,   278,  3186, 29973,
          450,  1234,   338])

In [6]:
# Two sample prompts of different lengths, reused below for tokenization,
# padding and generation experiments.
queries = ['What is the worst man in ', 'What is the best woman in the world? The answer is']
# queries.append(tokenizer('What is the best woman in the world? The answer is', return_tensors='pt'))
# queries.append(tokenizer('What is the worst man in th is', return_tensors='pt'))

In [7]:
# Tokenize both prompts as one batch without padding; the two id lists in the
# output have different lengths.
tokenizer(queries)

{'input_ids': [[1, 1724, 338, 278, 17322, 767, 297, 29871], [1, 1724, 338, 278, 1900, 6114, 297, 278, 3186, 29973, 450, 1234, 338]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [1]:
from torch.utils.data import DataLoader
from transformers import (
    AutoModel,
    AutoTokenizer,
    DataCollatorWithPadding,
    DataCollatorForTokenClassification,
)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
def collate_fn(batch):
    """Pad a list of tokenized examples into one batch of tensors.

    Args:
        batch: list of dicts as returned by the tokenizer, each holding
            "input_ids" and "attention_mask" for one example.

    Returns:
        Dict with "input_ids" and "attention_mask" taken from the output of
        `tokenizer.pad` (requested as PyTorch tensors).
    """
    field_names = ("input_ids", "attention_mask")

    # Regroup the per-example dicts into one list per field.
    features = {name: [example[name] for example in batch] for name in field_names}

    # Pad every example to the longest one, rounded up to a multiple of 8.
    padded = tokenizer.pad(
        features,
        return_tensors="pt",
        padding="longest",
        pad_to_multiple_of=8,
    )

    return {name: padded[name] for name in field_names}

In [21]:
# NOTE(review): `query_list` is assigned in a cell that appears further down in
# the notebook — this only works if that cell has already been run.
dataloader = DataLoader(query_list, batch_size=3, collate_fn=collate_fn)

In [22]:
# Pull the first collated batch to inspect the padded ids and attention mask.
next(iter(dataloader))

{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     1,  1724,
            338,   278, 17322,   767,   297, 29871],
         [    0,     0,     0,     1,  1724,   338,   278,  1900,  6114,   297,
            278,  3186, 29973,   450,  1234,   338]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
def tokenize_to_list(tokenized):
    """Split a batched tokenizer output into one dict per example.

    Args:
        tokenized: mapping with parallel "input_ids" and "attention_mask"
            sequences (one entry per example).

    Returns:
        List of {"input_ids": ..., "attention_mask": ...} dicts, paired up
        positionally; pairing stops at the shorter of the two sequences.
    """
    examples = []
    for ids, mask in zip(tokenized['input_ids'], tokenized['attention_mask']):
        examples.append({'input_ids': ids, 'attention_mask': mask})
    return examples

# Per-example dicts for `queries`, in the item format that collate_fn reads.
query_list = tokenize_to_list(tokenizer(queries))

In [4]:
import torch

In [18]:
# Scratch check: list() over the unsqueezed tensor yields one 1-element tensor per row.
list(torch.tensor([1, 2, 3]).unsqueeze(1))

[tensor([1]), tensor([2]), tensor([3])]

In [16]:
# Generate continuations only (return_prompt=False) with the sampling settings.
# NOTE(review): as defined in the visible cells, `queries` is a list of raw
# strings — presumably a tokenized version lived in kernel state; confirm.
gen = ppo_trainer.generate(query_tensor=queries, return_prompt=False, **generation_kwargs)



In [17]:
# Decode the generated ids back to text, dropping special tokens.
tokenizer.batch_decode(gen, skip_special_tokens=True)

['simple: a woman who can give birth to a child! But, to have the child, you need to know how to become pregnant, what is your most fertile time, and how much sperm is needed to achieve this.\nScientists have proved that a normal man can provide sperms to support the fertilization of an egg only 30-35 times a month. This is a fairly small amount of sputum, which leads to the fact that conception is not so easy. To increase the chance of conceiving, a man needs to make a series of actions. The first of them is to understand when his semen is in a sufficiently high concentration. In the course of sexual intercourse, sophisticated male fluids are diluted with blood and other secretions. If a small number of these fluorescent substances was in saturated semen, then conception could not occur. So, when can you make love to get preggy?\nThe best time to conceive a baby is when the seminal fluid is of the highest quality. It happens when a male body has received a hormonal signal, namely lute

In [30]:
# Same call without sampling kwargs and without return_prompt, to inspect the raw output.
ppo_trainer.generate(query_tensor=queries)



[tensor([    1,  1724,   278,  7483,   310, 19289, 29973,    13,  5618,   338,
           278,  7483,   310, 19289, 29973,    13,  1576,  7483,   310, 19289],
        device='cuda:0'),
 tensor([    1, 15043,  3186, 29973,    13,    13, 20001, 29901,   306, 29915,
         29885,   451,  1854,   825,   366,  2099,   491], device='cuda:0')]

In [45]:
# NOTE(review): `query_tensor` is not defined in any visible cell — this relies
# on leftover kernel state.
query_tensor["input_ids"].unsqueeze(0).shape

torch.Size([1, 1, 7])

In [7]:
# Write the trainer's model to disk.
# NOTE(review): absolute, user-specific path — presumably should come from a config value.
ppo_trainer.model.save_pretrained("/home/st-gorbatovski/sollama/src/mpnet_reward/rlhf/artifacts")

In [12]:
# Enable gradient checkpointing on the wrapped base model.
model.pretrained_model.gradient_checkpointing_enable()
# NOTE(review): `config` is created as a PPOConfig in the Configs cell and no
# "training_arguments" entry is set there; this dict-style lookup presumably
# targets a different config object — confirm before running.
if config["training_arguments"]["gradient_checkpointing"]:
    model.gradient_checkpointing_enable()
    # The cache flag is set to the opposite of the gradient-checkpointing flag.
    model.config.use_cache = not config["training_arguments"]["gradient_checkpointing"]

## Build Dataset

In [None]:
# Keep the first element of build_dataset's result for the requested ['train'] split.
# NOTE(review): `build_dataset` is not defined or imported in the visible cells.
dataset = build_dataset(config, data_config, ['train'])[0]

In [None]:
# Apply the seed stored on the PPO config.
set_seed(config.seed)

## Initialize PPOTrainer

In [None]:
# Build the trainer used for PPO training.
# Everything after `model` is passed by keyword: the third positional parameter
# of PPOTrainer is `ref_model`, so a positional `tokenizer` would be taken as
# the reference model. This matches the earlier scratch trainer
# (`ref_model=None, tokenizer=tokenizer`).
# NOTE(review): `collator` is not defined in the visible cells — confirm it
# exists in the kernel before running.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    num_shared_layers=None,
)

## Reward pipeline

In [None]:
# Start from the accelerator's device; in a single-process run fall back to
# GPU index 0 when CUDA is available, otherwise "cpu".
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

# NOTE(review): `device` is not passed on here — Reward_pipeline receives the
# accelerator instead, and `Reward_pipeline` itself is not defined or imported
# in the visible cells. Confirm both.
reward_pipe = Reward_pipeline(reward_config['reward_model_name'], ppo_trainer.accelerator)

In [70]:
# Tracker handle that is later forwarded to save_checkpoint.
# NOTE(review): `log_with="wandb"` is commented out in the PPOConfig above —
# confirm a "wandb" tracker is actually registered on the accelerator.
wandb_tracker = ppo_trainer.accelerator.get_tracker("wandb", unwrap=True)

## Training part

In [None]:
# Use EOS as the padding token id during generation.
generation_kwargs["pad_token_id"] = tokenizer.eos_token_id
best_reward = float('-inf')
global_epoches = 10


def _save_ppo_checkpoint(tag, global_epo, epoch, mean_reward):
    """Sync all processes, then save the unwrapped policy under `tag` from the main process only."""
    ppo_trainer.accelerator.wait_for_everyone()
    if ppo_trainer.accelerator.is_main_process:
        unwrapped_model = ppo_trainer.accelerator.unwrap_model(ppo_trainer.model)
        save_checkpoint(unwrapped_model, wandb_tracker, global_epo, epoch, mean_reward, save_config['checkpoint_dir'], tag, config.tracker_kwargs['name'])


for global_epo in tqdm(range(global_epoches)):
    # Wrap the dataloader itself (not the enumerate object) so tqdm can read
    # its length and show real progress. `epoch` is the batch index within the
    # current pass over the dataloader.
    for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
        query_tensors = batch["input_ids"]
        response_tensors = []

        for query in query_tensors:
            # return_prompt=False: keep only the newly generated tokens, so the
            # prompt is not handed to the PPO step a second time as part of the
            # response (by default generate() returns prompt + continuation).
            response = ppo_trainer.generate(query, return_prompt=False, **generation_kwargs)
            # Drop only the batch dimension; a bare squeeze() would collapse a
            # one-token response to a 0-d tensor.
            response_tensors.append(response.squeeze(0))
        batch["response"] = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors]

        # The reward texts need the decoded prompt. Keep a "query" column if
        # the dataset/collator already provides one, otherwise decode it here.
        if "query" not in batch:
            batch["query"] = [tokenizer.decode(q, skip_special_tokens=True) for q in query_tensors]

        #### Score prompt + response texts with the reward model
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        rewards = reward_pipe(texts, reward_config['batch_size'])

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

        mean_reward = torch.mean(torch.tensor(rewards))

        # Release per-batch objects before the next generation round.
        del batch
        del rewards
        gc.collect()
        torch.cuda.empty_cache()

        # Periodic checkpoint every `save_interval` batches.
        if (epoch + 1) % save_config['save_interval'] == 0:
            _save_ppo_checkpoint('ppo_checkpoint', global_epo, epoch, mean_reward)

        # Separate checkpoint whenever the batch-mean reward reaches a new maximum.
        if mean_reward > best_reward:
            _save_ppo_checkpoint('max_reward_ppo', global_epo, epoch, mean_reward)
            best_reward = mean_reward

    # End of one full pass over the dataloader: save the latest state, labelled
    # with the last batch index and its mean reward.
    _save_ppo_checkpoint('last_checkpoint', global_epo, epoch, mean_reward)