### 1 - How to get train.parquet 
[Get the dataset from here](https://huggingface.co/datasets/CarperAI/openai_summarize_comparisons)
### 2 - How to get train_policy.parquet
[Get the dataset from here](https://huggingface.co/datasets/CarperAI/openai_summarize_tldr)

In [None]:
import random
import numpy as np
import torch 
import json
import torch
import pandas as pd
import datasets
from torch import nn
from datasets import load_dataset, Dataset
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    pipeline,
    DataCollatorForLanguageModeling,
)
from trl import (
    RewardTrainer, 
    SFTTrainer,
    PPOConfig,
    PPOTrainer,
    AutoModelForCausalLMWithValueHead,
    create_reference_model
)

## Creating policy model for human evaluation

In [None]:
def set_seed(seed_val=42):
    """Seed every random number generator this notebook relies on.

    Applies ``seed_val`` to Python's ``random`` module, NumPy's global RNG,
    torch's default generator and torch's CUDA generators, in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_val)

In [None]:
# Hyperparameters for supervised fine-tuning of the summarization policy.
output_dir = "./supervised-summarize-checkpoint"  # passed to TrainingArguments below
train_batch_size = 16  # used as per_device_train_batch_size
gradient_accumulation_steps = 1
learning_rate = 1e-5
# NOTE(review): the next five values are not referenced by name anywhere else
# in the visible notebook (the TrainingArguments cell hard-codes
# num_train_epochs=2 and the dataset cell hard-codes max_length=256) —
# confirm whether they were meant to be wired in.
eval_batch_size = 4
eval_steps = 500
max_input_length = 512
save_steps = 1000
num_train_epochs = 5
set_seed()  # seeds the Python, NumPy and torch RNGs with the default value 42

In [None]:
# Load the supervised (policy) training data into a DataFrame.
df = pd.read_parquet("./train_policy.parquet")
df.iloc[10]  # last expression of the cell: shows the row at position 10

In [None]:
class TLDRDataset(Dataset):
    """Torch dataset yielding fixed-length causal-LM training examples.

    Every row of the parquet file must provide a ``prompt`` and a ``label``
    column. An example is the prompt immediately followed by its label,
    tokenized as ONE sequence, so that ``labels`` line up token-for-token with
    ``input_ids`` (a causal LM shifts the labels internally to predict the
    next token). Tokenizing prompt and label separately, as this class used
    to do, pairs unrelated positions with each other.

    Args:
        train_path: Path of the parquet file to read.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding="max_length")``; its result must expose
            ``"input_ids"`` and ``"attention_mask"``.
        split: Unused; accepted so existing call sites keep working.
        max_length: Length every example is truncated / padded to.
    """

    def __init__(self, train_path, tokenizer, split, max_length):
        dataset = pd.read_parquet(train_path)
        # Column-wise extraction avoids building one Series per row, which is
        # what DataFrame.iterrows() does.
        self.post_list = dataset["prompt"].tolist()
        self.labels = dataset["label"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Never populated; kept only so code that inspects these attributes
        # does not break.
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        """Return the number of (prompt, label) pairs."""
        return len(self.post_list)

    def __getitem__(self, idx):
        """Return the encoded example at ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``
            tensors, each of length ``max_length``. ``labels`` equals
            ``input_ids`` except at padding positions, which hold -100.
        """
        # NOTE(review): assumes the prompt already ends with whatever
        # separator the label should follow — confirm against the data.
        # NOTE(review): if the prompt alone reaches max_length tokens and the
        # tokenizer truncates on the right, the label is cut off entirely.
        text = self.post_list[idx] + self.labels[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
        )

        input_ids = torch.tensor(encoded["input_ids"])
        attention_mask = torch.tensor(encoded["attention_mask"])

        # -100 is the ignore index of the cross-entropy loss; masking via the
        # attention mask keeps padding out of the loss even when the pad token
        # is the same id as EOS.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

In [None]:
# Base checkpoint for the supervised policy: tokenizer plus causal-LM weights.
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
model = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py")
# Reuse EOS as the padding token; TLDRDataset pads with padding="max_length".
# (Presumably this tokenizer ships without a pad token — confirm.)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build the supervised training set from the policy parquet file.
data_path = "./train_policy.parquet"
train_dataset = TLDRDataset(
    train_path=data_path,
    tokenizer=tokenizer,
    split="train",  # accepted by TLDRDataset.__init__ but not used there
    max_length=256  # every example is truncated / padded to 256 tokens
)

In [None]:
# Print the token ids and labels of the first encoded example, if there is one.
first_sample = next(iter(train_dataset), None)
if first_sample is not None:
    print(first_sample["input_ids"], first_sample["labels"])

In [None]:
# Supervised fine-tuning run for the summarization policy.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    fp16=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # NOTE(review): literal 2 here, while the config cell defines
    # num_train_epochs = 5 — confirm which is intended.
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=20,
    # NOTE(review): per the transformers TrainingArguments docs a positive
    # max_steps overrides num_train_epochs, which would make this a 2-step
    # smoke run that never leaves the 50-step warmup — confirm.
    max_steps=2,
    report_to="none"
)

# No data collator or eval dataset is passed; Trainer falls back to its defaults.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()

In [None]:
# Persist the fine-tuned policy and its tokenizer side by side so the folder
# can be reloaded with from_pretrained("./summarization_policy").
trainer.save_model("./summarization_policy")
tokenizer.save_pretrained("./summarization_policy")

In [None]:
# Reload the saved policy to check that the checkpoint round-trips.
model = AutoModelForCausalLM.from_pretrained("./summarization_policy")
tokenizer = AutoTokenizer.from_pretrained("./summarization_policy")

text = df.iloc[2]["prompt"]
# max_length is only a cap when truncation is switched on explicitly;
# without it the tokenizer warns about the ambiguous request.
tokenized_text = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=256,
)

In [None]:
# Generate a continuation for the tokenized prompt and decode the whole
# sequence (prompt tokens included).
# An explicit max_new_tokens is needed: the prompt may be up to 256 tokens
# long, which can already exceed generate()'s default total-length budget.
# 50 is an arbitrary budget for a short summary — tune as needed.
generated_ids = model.generate(
    **tokenized_text,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
tokenizer.decode(generated_ids[0])

## Training the reward function

In [None]:
# Reward training starts from the supervised policy saved above and reads the
# comparison data from train.parquet.
MODEL_PATH = "./summarization_policy"
DATA_PATH = "./train.parquet"

In [None]:
# Load the comparison data and keep only the first 100 rows.
df = pd.read_parquet(DATA_PATH)
df = df[:100]
raw_dataset = datasets.Dataset.from_pandas(df)
raw_dataset  # displayed as the cell output

In [None]:
# Start the reward model from the supervised policy checkpoint.
# NOTE(review): this is a causal-LM head; TRL's RewardTrainer is documented
# around sequence-classification models with a single scalar output —
# confirm this is the intended model class.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
num_added = tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# A newly added token gets an id past the end of the embedding matrix, so the
# model must grow with the tokenizer or padded inputs index out of range.
# This is a no-op when "[PAD]" was already in the vocabulary.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
num_added  # cell output, as before: how many tokens were added

In [None]:
def formatting_func(examples):
    """Tokenize one preference record into chosen / rejected model inputs.

    ``examples`` must provide ``"prompt"``, ``"chosen"`` and ``"rejected"``
    strings. Each response is appended to the prompt after a newline and
    encoded with the module-level ``tokenizer``, truncated / padded to 256
    tokens.

    Returns:
        dict with ``input_ids_chosen``, ``attention_mask_chosen``,
        ``input_ids_rejected`` and ``attention_mask_rejected``; each value is
        the first row of the corresponding tokenizer output.
    """
    def _encode(response):
        # Prompt and response are separated by a single newline.
        return tokenizer.encode_plus(
            examples["prompt"] + "\n" + response,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        )

    chosen = _encode(examples["chosen"])
    rejected = _encode(examples["rejected"])

    features = {}
    for suffix, encoded in (("chosen", chosen), ("rejected", rejected)):
        # Row 0 drops the batch dimension added by return_tensors="pt".
        features["input_ids_" + suffix] = encoded["input_ids"][0]
        features["attention_mask_" + suffix] = encoded["attention_mask"][0]
    return features

In [None]:
# Tokenize every comparison row, then split into "train" / "test" parts using
# train_test_split's default settings.
formatted_dataset = raw_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()
formatted_dataset  # displayed as the cell output

In [None]:
# Training configuration for the reward model.
training_args = TrainingArguments(
    output_dir="./reward-model-checkpoint",
    num_train_epochs=2,
    gradient_accumulation_steps=1,
    save_strategy="steps",
    evaluation_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    logging_steps=10,
    eval_steps=500,
    save_steps=500,
    warmup_steps=50,
    learning_rate=1e-5,
    save_total_limit=1,
    use_cpu=True,
    # NOTE(review): per the transformers docs a positive max_steps overrides
    # num_train_epochs; with 2 steps neither the 500-step eval/save interval
    # nor the end of the 50-step warmup would be reached — confirm this is
    # only meant as a smoke run.
    max_steps=2,
    report_to="none"
)

# Pairwise preference training on the chosen / rejected columns produced by
# formatting_func.
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
    args=training_args
)

trainer.train()

In [None]:
# Persist the reward model and its tokenizer into the same folder.
trainer.save_model("./reward-model")
tokenizer.save_pretrained("./reward-model")

In [None]:
# Reload the reward model from disk (as a causal LM) for a manual check.
model = AutoModelForCausalLM.from_pretrained("./reward-model")
tokenizer = AutoTokenizer.from_pretrained("./reward-model")

In [None]:
def get_score(model, tokenizer, prompt, response):
    """Run ``model`` on a (prompt, response) pair and return its first output.

    The pair is encoded together with ``tokenizer.encode_plus``, truncated /
    padded to 256 tokens, and fed to the model with gradients disabled.

    Returns:
        ``model(...)[0]`` — for the causal LM loaded in this notebook that is
        presumably the logits tensor (confirm against the model class).
    """
    encode_options = {
        "padding": "max_length",
        "max_length": 256,
        "return_tensors": "pt",
        "truncation": True,
    }
    encoded_pair = tokenizer.encode_plus(prompt, response, **encode_options)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_outputs = model(**encoded_pair)

    return model_outputs[0]

In [None]:
# Use the first comparison row as a worked example.
prompt = df.iloc[0]["prompt"]
example_chosen_response = df.iloc[0]["chosen"]
example_rejected_response = df.iloc[0]["rejected"]

In [None]:
# Model output for prompt + chosen response. Despite the name, this holds
# whatever get_score returns (the model's first output), not a loss.
loss1 = get_score(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_chosen_response
)

# Same for prompt + rejected response.
loss2 = get_score(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_rejected_response
)

In [None]:
# Pairwise preference loss: -log(sigmoid(chosen - rejected)), averaged over
# every element of the two outputs.
loss = -nn.functional.logsigmoid(loss1 - loss2).mean()

In [None]:
# Take the arg-max over the last axis of the chosen-pair output, keep the
# first batch element, and decode those ids back to text.
tokenizer.decode(torch.max(loss1, axis=-1).indices[0])

In [None]:
# Same arg-max decoding for the rejected-pair output.
tokenizer.decode(torch.max(loss2, axis=-1).indices[0])

## Policy model

In [None]:
# The PPO stage loads its model and tokenizer from the reward-model folder and
# reads the same comparison data.
MODEL_PATH = "./reward-model"
DATA_PATH = "./train.parquet"

In [None]:
# Load the full comparison data (no row limit this time) as a datasets.Dataset.
df = pd.read_parquet(DATA_PATH)
dataset = datasets.Dataset.from_pandas(df)
dataset  # displayed as the cell output

In [None]:
# NOTE(review): sentiment_pipe_kwargs is not referenced anywhere else in the
# visible notebook — looks like a leftover; confirm before removing.
sentiment_pipe_kwargs = {
    "top_k": None,
    "function_to_apply": "none"
}

# PPO hyperparameters; config.learning_rate and config.batch_size are read by
# later cells.
config = PPOConfig(
    model_name=MODEL_PATH,
    steps=51200,
    learning_rate=1.41e-5,
    remove_unused_columns=True
)

txt_in_len = 5  # not referenced elsewhere in the visible notebook
txt_out_len = 32  # the PPO loop keeps the last txt_out_len tokens of each response
seed = 1  # not referenced elsewhere in the visible notebook

In [None]:
# Rename "prompt" to "review", keep only rows whose review is longer than 512
# (len() of the value — characters, assuming the column holds strings), then
# cut every review down to its first 1000 items.
dataset = dataset.rename_columns({"prompt": "review"})
dataset = dataset.filter(lambda x: len(x["review"]) > 512, batched=False)
dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)
dataset  # displayed as the cell output

In [None]:
# Tokenizer for building PPO queries: left-padded, with EOS reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Encode each row into exactly 32 token ids.
# NOTE(review): the ids come from the "chosen" column (with a leading space),
# not from the "review" text prepared in the previous cell — confirm this is
# intended.
dataset = dataset.map(
    lambda x: {
        "input_ids": tokenizer.encode(
            " " + x["chosen"],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=32
        )[0]
    },
    batched=False
)

# "query" is the text obtained by decoding those ids again.
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
# Keep at most the first 20000 rows; slicing returns a plain dict of columns,
# hence the rebuild with from_dict.
dataset = dataset[:20000]
dataset = datasets.Dataset.from_dict(dataset)
dataset.set_format("pytorch")
dataset  # displayed as the cell output

In [None]:
def collator(data):
    """Collate a batch of sample dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first sample; values are
    gathered into plain Python lists without stacking or padding.
    """
    reference_sample = data[0]
    return {
        field: [sample[field] for sample in data]
        for field in reference_sample
    }

In [None]:
# NOTE(review): both the trainable policy and the frozen reference are
# initialised from "./reward-model" — the folder written by the reward
# training section — rather than from "./summarization_policy"; confirm this
# is intended.
rf_model_path = "./reward-model"
model = AutoModelForCausalLMWithValueHead.from_pretrained(rf_model_path)
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(rf_model_path)
# Reloading replaces the left-padding tokenizer configured a few cells above.
tokenizer = AutoTokenizer.from_pretrained(rf_model_path)

In [None]:
# Plain SGD over all policy parameters, using the PPO config's learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)
# Wire policy, reference model, data and optimizer into the PPO trainer; its
# dataloader yields batches built by `collator` (dicts of per-key lists).
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=model_ref,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer
)

In [None]:
# Control tags that get prepended to every query, plus their token ids
# (squeezed, and moved to the GPU when one is available).
ctrl_str = ["[negative]", "[positive]"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctrl_tokens = {
    tag: tokenizer.encode(tag, return_tensors="pt").squeeze().to(device)
    for tag in ctrl_str
}

In [None]:
def pos_logit_to_reward(logit, task):
    """Turn positive-sentiment logits into task-conditioned rewards.

    The control tag at each position decides the sign of the logit there:

    * ``"[negative]"``: reward = -logit
    * ``"[positive]"``: reward = logit

    Args:
        logit: Tensor or list of scores, one per sample. It is not modified.
        task: Sequence of control tags; ``task[i]`` applies to ``logit[i]``.

    Returns:
        A new tensor (for tensor input) or a new list (for any other input)
        holding the rewards.

    Raises:
        ValueError: If a tag is neither ``"[negative]"`` nor ``"[positive]"``.
    """
    # Work on a copy so the caller's logits are not rewritten in place.
    rewards = logit.clone() if isinstance(logit, torch.Tensor) else list(logit)
    for i, value in enumerate(logit):
        tag = task[i]
        if tag == "[negative]":
            rewards[i] = -value
        elif tag != "[positive]":
            raise ValueError(
                f"task should be '[negative]' or '[positive]', got {tag!r}"
            )
    return rewards

In [None]:
# Quick check: ctrl_str is ["[negative]", "[positive]"], so only the first of
# the two logits is negated.
pos_logit_to_reward(torch.Tensor([4, 4]), ctrl_str)

In [None]:
# Keyword arguments forwarded to ppo_trainer.generate(...) in the PPO loop.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,  # pad with EOS
    "max_new_tokens": 32,  # same value as txt_out_len
    # NOTE(review): -1 is presumably meant to stop generation from ending
    # early on EOS — confirm.
    "eos_token_id": -1
}

In [None]:
def get_score(model, tokenizer, responses):
    """Score each text with ``model`` and return one scalar per text.

    Every text is encoded on its own (padded to at least 32 tokens, not
    truncated), passed through the model with gradients disabled, and reduced
    to the mean of the model's first output.

    NOTE(review): this redefines the earlier
    ``get_score(model, tokenizer, prompt, response)`` from the reward-model
    section with a different signature; cells re-run out of order will pick
    up whichever definition ran last.

    Args:
        model: Callable model; ``model(**encoded)[0]`` must be a tensor.
        tokenizer: Tokenizer providing ``encode_plus``.
        responses: Iterable of texts to score.

    Returns:
        list of 0-dim tensors, in the order of ``responses``.
    """
    positive_logits = []
    for text in responses:
        encoded = tokenizer.encode_plus(
            text,
            padding="max_length",
            max_length=32,
            # Was misspelled `return_tensor`, so the tokenizer handed back
            # plain Python lists that the model cannot consume.
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = model(**encoded)
        positive_logits.append(outputs[0].mean())
    return positive_logits

In [None]:
# The loop needs these two names, and the notebook's import cell provides
# neither of them.
from random import choices

from tqdm import tqdm

# One pass of PPO over the dataloader.
for epoch in range(1):
    for batch in tqdm(ppo_trainer.dataloader):
        game_data = {}
        print(ctrl_str)

        # Draw one control tag per sample and prepend it to the query, both
        # as text (for logging) and as token ids (for generation).
        task_list = choices(ctrl_str, k=config.batch_size)
        game_data["query"] = [t + q for t, q in zip(task_list, batch["query"])]
        query_tensors = [
            torch.cat((ctrl_tokens[t], input_ids))
            for t, input_ids in zip(task_list, batch["input_ids"])
        ]

        # Generate one response per query and keep only its last
        # txt_out_len tokens.
        response_tensors = []
        for query in query_tensors:
            response = ppo_trainer.generate(query, **generation_kwargs)
            response_tensors.append(response.squeeze()[-txt_out_len:])
        print(response_tensors)
        game_data["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        # Score query + generated response. The previous code paired the
        # query with game_data["query"], so the response never reached the
        # scorer and the reward could not depend on what was generated.
        texts = [q + r for q, r in zip(batch["query"], game_data["response"])]
        logits = get_score(model, tokenizer, texts)
        rewards = pos_logit_to_reward(logits, task_list)

        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

        # Mean reward per control tag, for logging.
        for cs in ctrl_str:
            key = "env/reward_" + cs.strip("[]")
            stats[key] = np.mean([r.cpu().numpy() for r, t in zip(rewards, task_list) if t == cs])

        ppo_trainer.log_stats(stats, game_data, rewards)

In [None]:
# Persist the PPO-tuned model and its tokenizer (note the space in the folder name).
model.save_pretrained("./RLHF Model")
tokenizer.save_pretrained("./RLHF Model")

In [None]:
# Text-generation pipeline backed by the saved RLHF model folder.
# NOTE(review): num_return_sequences=5 normally requires sampling or beam
# search at generation time; no do_sample / num_beams is set here — confirm
# that the saved generation config enables one of them.
MODEL_PATH = "./RLHF Model"
pipe = pipeline("text-generation", model=MODEL_PATH, 
                tokenizer=MODEL_PATH, max_length=100, 
                num_return_sequences=5)