Set the following environment variables:
> export WANDB_PROJECT=GenAI360

> export TOKENIZERS_PARALLELISM=true

In [None]:
  !pip install -q transformers==4.32.0 accelerate==0.22.0 peft==0.5.0 trl==0.5.0 bitsandbytes==0.41.1 deeplake==3.6.19 wandb==0.15.8 sentencepiece==0.1.99

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m537.3/537.3 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py)

# Load the Deep Lake Dataset

In [None]:
import deeplake

# Open the Alpaca-OrcaChat dataset from the Activeloop hub.
# NOTE(review): only this single dataset is loaded; no separate train/test split is created here.
ds = deeplake.load('hub://genai360/Alpaca-OrcaChat')



Opening dataset in read-only mode as you don't have write permissions.


|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/genai360/Alpaca-OrcaChat



/

hub://genai360/Alpaca-OrcaChat loaded successfully.



 

In [None]:
# Show the dataset summary (path, access mode and tensor names).
print( ds )

Dataset(path='hub://genai360/Alpaca-OrcaChat', read_only=True, tensors=['input', 'instruction', 'output'])


In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b", padding_side='left')

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
# NOTE(review): the checkpoint loaded above is facebook/opt-1.3b (not GPT-2);
# confirm whether this fallback is ever taken for it.
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Map-style dataset that turns dataset rows into tokenized PPO prompts.

    Args:
        ds: Object supporting ``len()`` and exposing an ``input`` sequence whose
            items provide a ``.text()`` method returning the raw question.
        tokenizer: Callable used to encode each prompt. When omitted, the
            notebook-level ``tokenizer`` variable is looked up at access time,
            so ``MyDataset(ds)`` keeps working exactly as before.
        max_length: Length each prompt is truncated/padded to. Defaults to 400,
            the value that used to be hard-coded.
    """

    def __init__(self, ds, tokenizer=None, max_length=400):
        self.ds = ds
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        """Return ``{"query": str, "input_ids": ...}`` for row ``idx``."""
        # Resolve lazily so that the global is only needed when no tokenizer
        # was injected through the constructor.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        query = "Question: " + self.ds.input[idx].text() + "\n\nAnswer: "
        tokenized_question = encode(
            query,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; keep only that single row.
        return {
            "query": query,
            "input_ids": tokenized_question["input_ids"][0],
        }

In [None]:
# Wrap the Deep Lake dataset. NOTE: despite the variable name this is a Dataset, not a DataLoader.
myTrainingLoader = MyDataset(ds)

In [None]:
# Pull the first item to confirm which fields every sample carries.
iterator = iter(myTrainingLoader)
one_sample = next(iterator)
print(list(one_sample))

['query', 'input_ids']


# Initialize the Model

In [None]:
from trl import PPOConfig

# PPO run configuration. Fields read elsewhere in this notebook:
#   - config.seed             -> passed to set_seed(...)
#   - config.model_name       -> checkpoint loaded as the policy model
#   - config.total_ppo_epochs -> stop condition of the training loop (not set here;
#     presumably derived by PPOConfig from steps/batch_size — TODO confirm)
config = PPOConfig(
    task_name="OPT-RL-OrcaChat",
    steps=10_000,
    model_name="facebook/opt-1.3b",
    learning_rate=1.41e-5,
    batch_size=32,
    mini_batch_size=4,
    gradient_accumulation_steps=4,
    optimize_cuda_cache=True,
    early_stopping=False,
    target_kl=0.1,
    ppo_epochs=4,
    seed=0,
    init_kl_coef=0.2,
    adap_kl_ctrl=True,
    tracker_project_name="OPT-RL-OrcaChat",
    log_with="wandb",  # the trainer output below shows a wandb login
)

In [None]:
from trl import set_seed

# Seed with config.seed before the value-head model is created below, so runs are repeatable.
set_seed(config.seed)

In [None]:
from accelerate import Accelerator

# Local process index reported by Accelerate; used below as the device for the
# policy model, the reward pipeline and the reward tensors.
current_device = Accelerator().local_process_index

In [None]:
from peft import LoraConfig

# LoRA adapter settings; handed to the policy model through `peft_config` when it is loaded.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
from trl import AutoModelForCausalLMWithValueHead

# Load the policy (causal LM with a value head) from config.model_name in 8-bit,
# attach the LoRA config, and map the whole model onto current_device.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
    load_in_8bit=True,
    device_map={"": current_device},
    peft_config=lora_config,
)



In [None]:
def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; every sample must provide them.
    """
    field_names = data[0]
    return {name: [sample[name] for sample in data] for name in field_names}

In [None]:
# Two toy samples sharing the same keys, to show how the collator regroups them.
test_data = [
    {"key1": "value11", "key2": "value21", "key3": "value31"},
    {"key1": "value12", "key2": "value22", "key3": "value32"},
]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value11', 'key2': 'value21', 'key3': 'value31'}, {'key1': 'value12', 'key2': 'value22', 'key3': 'value32'}]
Collator output: {'key1': ['value11', 'value12'], 'key2': ['value21', 'value22'], 'key3': ['value31', 'value32']}


In [None]:
from trl import PPOTrainer

# Build the PPOTrainer from the config, the policy model, the tokenizer, the dataset
# and the batch collator.
# NOTE(review): no explicit reference model is passed here — confirm how PPOTrainer
# obtains the reference policy for a PEFT model.
ppo_trainer = PPOTrainer(
    config,
    model,
    tokenizer=tokenizer,
    dataset=myTrainingLoader,
    data_collator=collator
)

[34m[1mwandb[0m: Currently logged in as: [33mala_[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Load Reward Model

In [None]:
from transformers import pipeline
import torch

# Reward model: a text-classification pipeline whose raw scores are used as PPO rewards.
# NOTE(review): the load log reports the classifier head of microsoft/deberta-v3-base as
# newly initialized, i.e. this checkpoint has not been trained as a reward model —
# swap in a fine-tuned reward checkpoint before trusting the rewards.
reward_pipeline = pipeline(
    "sentiment-analysis",
    model="microsoft/deberta-v3-base",
    device_map={"": current_device},
    model_kwargs={"load_in_8bit": True}, # TODO: Maybe add `num_labels=1`?
    return_token_type_ids=False,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# RL Loop

In [None]:
from trl.core import LengthSampler

# Sampler passed to ppo_trainer.generate(...) to pick the length of each generated response.
output_length_sampler = LengthSampler(32, 128) #(OutputMinLength, OutputMaxLength)

In [None]:
# Sampling settings forwarded to ppo_trainer.generate(...) in the training loop.
sft_gen_kwargs = {
    "top_k": 0.0,  # NOTE(review): presumably disables top-k filtering — confirm
    "top_p": 1.0,  # NOTE(review): presumably disables nucleus filtering — confirm
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    # NOTE(review): looks like a deliberately unreachable id so that generation is not
    # cut short by EOS — confirm against the model's vocabulary size.
    "eos_token_id": 100_000,
}

In [None]:
# Call-time options forwarded to reward_pipeline(...) when scoring generated texts.
reward_gen_kwargs = dict(
    top_k=None,
    function_to_apply="none",
    batch_size=16,
    truncation=True,
    max_length=512,
)

In [None]:
# Checkpoint interval in training steps (checked in the training loop; a falsy value disables saving).
save_freq = 50

In [None]:
from tqdm import tqdm
tqdm.pandas()

# Label whose raw score is used as the reward. Must match a label name emitted by
# the reward pipeline (a single-output classifier only emits "LABEL_0").
REWARD_LABEL = "LABEL_0"


def select_reward_score(pipe_output, label=REWARD_LABEL):
    """Return the score of ``label`` from one text-classification pipeline result.

    With ``top_k=None`` the pipeline returns every label sorted by score, so
    position 0 is simply the best-scoring label and may differ between samples.
    Looking the score up by name keeps the reward tied to one fixed output.

    Args:
        pipe_output: Either a single ``{"label", "score"}`` dict or a list of them.
        label: Name of the label whose score should be used as reward.

    Raises:
        KeyError: If ``label`` is not among the returned labels.
    """
    entries = [pipe_output] if isinstance(pipe_output, dict) else list(pipe_output)
    for entry in entries:
        if entry["label"] == label:
            return entry["score"]
    available = [entry["label"] for entry in entries]
    raise KeyError(f"Reward label {label!r} not found in pipeline output; got {available}")


# Wrap the dataloader (not the enumerate object) so tqdm can see its length.
for step, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    if step >= config.total_ppo_epochs:
        break
    question_tensors = batch["input_ids"]

    # Sample a response for every prompt; only the generated continuation is returned.
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **sft_gen_kwargs,
    )

    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # Compute reward score on prompt + response, always from the same label.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_pipeline(texts, **reward_gen_kwargs)
    rewards = [
        torch.tensor(select_reward_score(output), device=current_device)
        for output in pipe_outputs
    ]

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Skip step 0 and save every `save_freq` steps afterwards.
    if save_freq and step and step % save_freq == 0:
        print("Saving checkpoint.")
        ppo_trainer.save_pretrained(f"./OPT-RL-OrcaChat/checkpoint-{step}")



[[{'label': 'LABEL_0', 'score': 0.1302490234375}, {'label': 'LABEL_1', 'score': -0.0238037109375}], [{'label': 'LABEL_1', 'score': -0.1082763671875}, {'label': 'LABEL_0', 'score': -0.1112060546875}], [{'label': 'LABEL_0', 'score': -0.0478515625}, {'label': 'LABEL_1', 'score': -0.1378173828125}], [{'label': 'LABEL_0', 'score': 0.0989990234375}, {'label': 'LABEL_1', 'score': -0.1488037109375}], [{'label': 'LABEL_0', 'score': 0.07806396484375}, {'label': 'LABEL_1', 'score': -0.08599853515625}], [{'label': 'LABEL_1', 'score': -0.143798828125}, {'label': 'LABEL_0', 'score': -0.1868896484375}], [{'label': 'LABEL_0', 'score': 0.11407470703125}, {'label': 'LABEL_1', 'score': -0.017120361328125}], [{'label': 'LABEL_0', 'score': 0.06951904296875}, {'label': 'LABEL_1', 'score': 0.03057861328125}], [{'label': 'LABEL_1', 'score': -0.136474609375}, {'label': 'LABEL_0', 'score': -0.17138671875}], [{'label': 'LABEL_1', 'score': 0.01023101806640625}, {'label': 'LABEL_0', 'score': -0.1123046875}], [{'la

0it [02:59, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-5c408381439e>", line 25, in <cell line: 4>
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
  File "/usr/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_trainer.py", line 746, in step
    train_stats = self.train_minibatch(
  File "/usr/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_trainer.py", line 1004, in train_minibatch
    loss_p, loss_v, train_stats = self.loss(
  File "/usr/local/lib/python3.10/dist-packages/trl/trainer/ppo_trainer.py", line 1150, in loss
    entropy = masked_mean(entropy_from_logits(logits), mask)
  File "/usr/local/lib/python3.1

TypeError: ignored