###Prerequisites

In [None]:
! pip install transformers

In [None]:
! pip install trl

In [37]:
import torch
import time
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
tqdm.pandas()

from transformers import AutoTokenizer, GPT2TokenizerFast, pipeline

from trl.core import build_bert_batch_from_txt, listify_batch, respond_to_batch

from trl import PPOTrainer, PPOConfig, DPOTrainer, AutoModelForCausalLMWithValueHead

import tqdm

from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe_device = 0 if torch.cuda.is_available() else -1

####Dataset

In [5]:
ds = load_dataset('imdb', split='train')
ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'})
ds = ds.filter(lambda x: len(x["review"])>200, batched=False)
ds

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 24895
})

###Positive texts generation

In [6]:
sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": 16
}

sentiment_pipe = pipeline("sentiment-analysis","lvwerra/distilbert-imdb", device=pipe_device)

In [7]:
model = AutoModelForCausalLMWithValueHead.from_pretrained("lvwerra/gpt2-imdb")
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained("lvwerra/gpt2-imdb")

tokenizer = AutoTokenizer.from_pretrained("lvwerra/gpt2-imdb")
tokenizer.pad_token = tokenizer.eos_token

In [8]:
model.to(device);
model_ref.to(device);

In [9]:
class LengthSampler:
    def __init__(self, min_value, max_value):
        self.values = list(range(min_value, max_value))
    def __call__(self):
        return np.random.choice(self.values)

input_size = LengthSampler(2, 8)
output_size = LengthSampler(4, 16)

In [10]:
def tokenize(sample):
    sample["tokens"] = tokenizer.encode(sample["review"])[:input_size()]
    sample["query"] = tokenizer.decode(sample["tokens"])
    return sample

ds = ds.map(tokenize, batched=False)

In [11]:
gen_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id
}

In [12]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

dataloader = torch.utils.data.DataLoader(ds, batch_size=16, collate_fn=collator)

In [None]:
ppo_config = {"batch_size": 16}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

for batch in dataloader:
  batches += 1
  query_tensors = [torch.tensor(t).long().to(device) for t in batch["tokens"]]
  response_tensors = []
  for i in range(16):
    gen_len = output_size()
    response = model.generate(query_tensors[i].unsqueeze(dim=0),
                                       max_new_tokens=gen_len, **gen_kwargs)
    response_tensors.append(response.squeeze()[-gen_len:])
  batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

  texts = [q + r for q,r in zip(batch['query'], batch['response'])]
  pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
  rewards = torch.tensor([output[1]["score"] for output in pipe_outputs]).to(device)

  stats = ppo_trainer.step(query_tensors, response_tensors, list(rewards.unsqueeze(dim=1)))

In [14]:
stats

{'objective/kl': 6.990373611450195,
 'objective/kl_dist': array([ 9.700549 ,  4.354264 ,  8.873707 , 10.600117 , 10.797907 ,
        20.512352 ,  0.7161677, 11.6799   ,  6.2691326,  9.784451 ,
         3.5457778,  2.067173 ,  5.056112 , -2.6304488,  9.719509 ,
         0.7993088], dtype=float32),
 'objective/logprobs': array([[-2.3046408e+00, -1.4951106e+00, -5.7072926e-01, -5.0022469e+00,
         -6.2227094e-01, -1.3627381e+00, -5.6537974e-01, -5.5349894e+00,
         -1.1662832e+00, -3.3556240e+00, -5.9100503e-01, -3.7657905e+00,
         -1.0222048e+01, -4.1096721e+00, -4.1101480e+00, -4.1095815e+00,
         -4.1653681e+00, -4.2329826e+00, -4.2400308e+00, -4.2558861e+00,
         -4.3354282e+00],
        [-4.1164865e+00, -6.1137700e+00, -6.1350818e+00, -1.2178225e+01,
         -1.8964753e-01, -1.5901858e+00, -1.7377235e+00, -6.0566485e-01,
         -7.9677591e+00, -9.1034897e-02, -4.8136264e-01, -8.1639786e+00,
         -1.5151858e+00, -8.6803663e-01, -1.8727237e+00, -1.0450750e+0

In [None]:
torch.save(model.state_dict(), "positive_model.pth")

In [15]:
text_generation = pipeline("text-generation", model=model, tokenizer = AutoTokenizer.from_pretrained("lvwerra/gpt2-imdb"), device=pipe_device)

The model 'AutoModelForCausalLMWithValueHead' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalL

In [None]:
positive_texts = []
inputs = ds['query'][:100]

for input in inputs:
  generated_text = text_generation(input, max_length=50, do_sample=False)[0]
  positive_texts.append(generated_text["generated_text"])

In [19]:
positive_texts[:5]

['I rented I AM COO and I loved it. I loved it and I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it. I love it',
 '"I Am A Woman" is a great film, and I love it. It is a great film, and it is a great movie. It is a great film, and it is a great movie. It is a great film, and it',
 'If only to avoid making this type of film so great, it is great. The film is great, and the characters are great. The film is a great film, and I love it. It is a great film, and I love it.',
 'This film was probably inspired by the wonderful and wonderful work of the wonderful and wonderful people of the world. I loved this film and I loved it. I loved it. I loved it. I loved it. I loved it. I loved it.',
 'Oh, brother. I love this film. It is a great story, and it is a great movie. It is a great movie, and it is a great movie. I love it. I love it. I love it. I love it']

###Reward calculation

Сгенерируйте N текстов с помощью sft модели, посчитайте reward для каждого с помощью https://huggingface.co/lvwerra/distilbert-imdb. Логиты бинарного классификатора можно использовать в качестве значения reward. Больше значение логита — более позитивный текст.

In [20]:
pipe_outputs = sentiment_pipe(positive_texts, **sent_kwargs)
rewards = torch.tensor([output[1]["score"] for output in pipe_outputs]).to(device)



In [21]:
rewards

tensor([ 2.7890,  2.9261,  2.9231,  2.8709,  2.9057,  2.9291,  2.9269,  2.8812,
         2.7938,  2.9240,  2.8896,  2.9251,  2.8484,  2.9094,  2.9128,  2.9337,
         2.9194,  2.8791,  2.8541,  2.9279,  2.9247,  2.9246,  2.9120,  2.9200,
         2.8403,  2.7824,  2.9156,  2.9184,  2.8371,  2.9048,  2.8095,  2.9226,
         2.8075,  2.9128,  2.7711,  2.8817,  2.7645,  2.8747,  2.7927,  2.9096,
         2.8274,  2.8032,  2.8812,  2.8953,  2.7710, -0.3603,  2.9110,  2.9280,
         2.9189,  2.9047,  2.8205,  2.9100,  2.9106,  2.9351,  2.9353,  2.9123,
         2.8719,  2.9146,  2.9242,  2.7105,  2.9173,  2.9059,  2.8188,  2.8481,
         2.8482,  2.8942,  2.9357,  2.6619,  2.9236,  2.9320,  2.9175,  2.9112,
         2.9147,  2.8978,  2.9142,  2.8498,  2.9041,  2.9190,  2.9138,  2.8776,
         2.9190,  2.7944,  2.9263,  2.8712,  2.9140,  2.9312,  2.7510,  2.9131,
         2.8377,  2.8991,  2.6736,  2.9229,  2.7092,  2.9366,  2.8930,  2.2871,
         2.8371,  2.9128,  2.8240,  2.92

In [22]:
idx_rew = []
idx_text = {}
for i in range(len(rewards)):
  idx_rew.append((i, rewards[i].item()))
  idx_text[i] = positive_texts[i]

####Winner-loser dataset

Представим, что это все -- кандидаты для одного задания ("Сгенерируй позитивный отзыв на фильм").

In [23]:
def winner_loser_dataset(idx_rew, idx_text):
  winners_main = []
  losers_main = []
  rewards = idx_rew[::]
  while len(losers_main) < 99:
    winners, losers, rewards = one_level_comparison(rewards)
    winners_main.extend(winners)
    losers_main.extend(losers)

  winners_text = []
  losers_text = []
  for winner in winners_main:
    winners_text.append(idx_text[winner])
  for loser in losers_main:
    losers_text.append(idx_text[loser])
  return winners_text, losers_text


def one_level_comparison(idx_rew):
  losers = []
  winners = []
  to_delete = []
  for i in range(0, len(idx_rew)-1, 2):
    if idx_rew[i][1] > idx_rew[i+1][1]:
      winners.append(idx_rew[i][0])
      losers.append(idx_rew[i+1][0])
      to_delete.append(idx_rew[i+1])
    else:
      losers.append(idx_rew[i][0])
      winners.append(idx_rew[i+1][0])
      to_delete.append(idx_rew[i])
  rest = idx_rew[::]
  for loser in to_delete:
    rest.remove(loser)

  return winners, losers, rest

In [24]:
winners, losers = winner_loser_dataset(idx_rew, idx_text)
print(len(winners))
print(len(losers))

99
99


In [25]:
di = {'winner': winners, 'loser': losers}

pos_ds = pd.DataFrame(di)

In [26]:
pos_ds

Unnamed: 0,winner,loser
0,"""I Am A Woman"" is a great film, and I love it....",I rented I AM COO and I loved it. I loved it a...
1,If only to avoid making this type of film so g...,This film was probably inspired by the wonderf...
2,I would put this at the top of my list of grea...,"Oh, brother. I love this film. It is a great s..."
3,"Whoever wrote the screenplay for this movie, i...",When I first saw a glimpse of this wonderful f...
4,"This is said to be a great film, it is a great...","Who are these ""They""-like characters, and I lo..."
...,...,...
94,This has to be the best film I have seen in ye...,I have to say I am really impressed with this ...
95,"Not only is this a great film, it is a great f...","There's not just one great film, it's a great ..."
96,This has to be the best film I have seen in ye...,"This film is great, and I love it. It is a gre..."
97,"Not only is this a great film, it is a great f...","This film is great, and it is a great film. It..."


###Model training

Обучите sft модель на полученном датасете с лоссом из статьи. На этом этапе рекомендуется использовать DPOTrainer из библиотеки trl: https://huggingface.co/docs/trl/main/en/dpo_trainer#trl.DPOTrainer. Обратите внимание, что там можно указать loss_type =”hinge”.


In [39]:
data_loader = DataLoader(dpo_dataset_dict, batch_size=8, shuffle=True)

In [27]:
dataset = Dataset.from_pandas(pos_ds)
dataset

Dataset({
    features: ['winner', 'loser'],
    num_rows: 99
})

In [28]:
dataset[:5]

{'winner': ['"I Am A Woman" is a great film, and I love it. It is a great film, and it is a great movie. It is a great film, and it is a great movie. It is a great film, and it',
  'If only to avoid making this type of film so great, it is great. The film is great, and the characters are great. The film is a great film, and I love it. It is a great film, and I love it.',
  'I would put this at the top of my list of great films. It is a great film, and it is a great film. It is a great film, and it is a great film. It is a great film, and it is a',
  'Whoever wrote the screenplay for this movie, it is a great film. It is a great film, and I love it. It is a great film, and I love it. It is a great film, and I love it. It is a',
  'This is said to be a great film, it is a great film, and it is a great film. It is a great film, and it is a great film. It is a great film, and it is a great film. It is'],
 'loser': ['I rented I AM COO and I loved it. I loved it and I love it. I love it. I l

In [34]:
dpo_dataset_dict = {
    "prompt": [
        "" * 99
    ],
    "chosen": list(pos_ds.winner),
    "rejected": list(pos_ds.loser),
}

In [None]:
model = model
model_ref = model

dpo_trainer = DPOTrainer(
    model,
    model_ref,
    beta=0.1,
    train_dataset=data_loader,
    tokenizer=tokenizer,
    loss_type = "hinge"
#    args=training_args,    # training arguments e.g. batch size, lr, etc.
)


dpo_trainer.train()
dpo_trainer.save_model()