In [1]:
from google.colab import drive

# Mount Google Drive; the checkpoints and the TSV data used below are read from it.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Reward Model

In [2]:
!pip install transformers==4.40.2
!pip install datasets==2.19.1
!pip install accelerate==0.30.1
!pip install trl==0.8.6

Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.1
    Uninstalling transformers-4.41.1:
      Successfully uninstalled transformers-4.41.1
Successfully installed transformers-4.40.2
Collecting datasets==2.19.1
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.1)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.19.1)
  Downloading xxhash-3.4.1-cp310-cp310-manylinu

In [3]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq

In [4]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Reward model: a two-label sequence classifier and its tokenizer, loaded from Drive.
# The PPO loop later uses the logit at index 1 as the scalar reward.
# The path is absolute (like the other Drive paths in this notebook) so loading does
# not depend on the notebook's current working directory.
rm_checkpoint = "/content/drive/MyDrive/Colab/T5-jeju/data/saved_models/roberta_rm"
rm_tokenizer = AutoTokenizer.from_pretrained(rm_checkpoint)
rm_model = AutoModelForSequenceClassification.from_pretrained(rm_checkpoint, num_labels=2).to(device)

PPO

In [6]:
import torch
from tqdm import tqdm
import pandas as pd

# Enable tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
tqdm.pandas()

from transformers import pipeline, AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl.core import LengthSampler
import multiprocessing

In [7]:
# Load the PPO training TSV from Drive and carve out fixed-size train/validation slices.
# The path uses the same `MyDrive` spelling as the other Drive paths in this notebook.
file_path = '/content/drive/MyDrive/Colab/T5-jeju/data/filtered_train_ppo_2.tsv'

# NOTE: `on_bad_lines='skip'` drops malformed rows without reporting them.
train_df = pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip')

num_train = 50000
num_valid = 1000

# `.iloc` slicing never raises on a short frame, so fail loudly here instead of
# silently producing truncated or empty splits.
assert len(train_df) >= num_train + num_valid, (
    f"expected at least {num_train + num_valid} rows, found {len(train_df)}"
)

# The first `num_train` rows train the policy; the next `num_valid` rows are held out.
sliced_train_df = train_df.iloc[:num_train]
sliced_valid_df = train_df.iloc[num_train:num_train + num_valid]

# Write both slices back to TSV so `load_dataset` can build the two splits in one call.
sliced_train_df.to_csv("sliced_train.tsv", sep='\t', index=False)
sliced_valid_df.to_csv("sliced_valid.tsv", sep='\t', index=False)

data_files = {"train": "sliced_train.tsv", "valid": "sliced_valid.tsv"}
# Despite its name, `train_dataset` holds both the "train" and the "valid" split.
train_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

In [8]:
# Number of worker processes handed to `datasets.map` (one per CPU reported by the runtime).
NUM_CPU = multiprocessing.cpu_count()

# Token cap used when tokenizing source/target text and as `max_length` during generation.
max_token_length = 64

In [9]:
def convert_examples_to_features(examples):
    """Tokenize a batch of dialect/standard sentence pairs.

    Reads the ``dialect_form`` column as the model input and the
    ``standard_form`` column as the target text, truncating both to the
    module-level ``max_token_length``. Uses the module-level ``tokenizer``.

    Args:
        examples: A batch with ``dialect_form`` and ``standard_form`` entries.

    Returns:
        Whatever the tokenizer call returns for the batch (no padding is
        requested here).
    """
    return tokenizer(
        examples['dialect_form'],
        text_target=examples['standard_form'],
        truncation=True,
        max_length=max_token_length,
    )

In [10]:
# Tokenizer stored alongside the seq2seq checkpoint under `mixed_plus_multi/results`.
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/Colab/T5-jeju/data/saved_models/mixed_plus_multi/results"
)

# Remove the original text columns once they are tokenized: if they stay in the
# dataset, the collator used further down cannot pad the batches and raises an error.
raw_columns = train_dataset["train"].column_names

tokenized_datasets = train_dataset.map(
    convert_examples_to_features,
    batched=True,
    remove_columns=raw_columns,
    num_proc=NUM_CPU,
)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [11]:
# Sanity check: show the splits, their columns and row counts after tokenization.
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})


In [12]:
# Checkpoint directory the PPO policy and reference model are loaded from.
model_name = "/content/drive/MyDrive/Colab/T5-jeju/data/saved_models/mixed_plus_multi/results"

# PPO hyper-parameters. `mini_batch_size` equals `batch_size` and `ppo_epochs` is 1.
config = PPOConfig(
    model_name=model_name,
    batch_size=128,
    mini_batch_size=128,
    ppo_epochs=1,
    learning_rate=1.41e-5,
)

# Scoring options in the style of a text-classification pipeline call.
# Not referenced by any other cell visible in this notebook.
sent_kwargs = dict(return_all_scores=True, function_to_apply="none", batch_size=16)

In [13]:
# Trainable policy: the seq2seq checkpoint wrapped with a value head.
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(config.model_name)
model = model.to(device)

# A second copy of the same checkpoint, handed to PPOTrainer as the reference model.
ref_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(config.model_name)
ref_model = ref_model.to(device)

# The tokenizer was already loaded from the same checkpoint in an earlier cell.

In [14]:
# Collator that batches the tokenized examples for the PPO dataloader.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [19]:
# Sanity check of the reward model on two near-identical sentences that differ only
# in "건" vs "것은"; prints each sentence followed by its index-1 logit.
sen = ["집 앞에 손 매는 건 집에 아픈 사람이 있거나 굿을 하거나 다 이유가 있지", "집 앞에 손 매는 것은 집에 아픈 사람이 있거나 굿을 하거나 다 이유가 있지"]
tok = rm_tokenizer(sen, padding=True, truncation=True, return_tensors="pt")
tok = {k: v.to(device) for k, v in tok.items()}

# Pure inference: skip autograd bookkeeping, exactly as the PPO loop does when scoring.
with torch.no_grad():
    output = rm_model(**tok).logits

# One single-element tensor per sentence, the same shape of reward the PPO loop builds.
output2 = [torch.tensor([score]) for score in output[:, 1].tolist()]

for sentence, score in zip(sen, output2):
    print(sentence)
    print(score)


집 앞에 손 매는 건 집에 아픈 사람이 있거나 굿을 하거나 다 이유가 있지
tensor([3.4973])
집 앞에 손 매는 것은 집에 아픈 사람이 있거나 굿을 하거나 다 이유가 있지
tensor([3.5925])


In [None]:
# Build the PPO trainer over the tokenized training split; its dataloader drives the loop below.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=tokenized_datasets['train'],
    data_collator=data_collator,
)

In [None]:
import re
import torch

In [None]:
# Probe the reward model with three short strings and show both the raw logits
# and the per-sentence index-1 logits wrapped as single-element tensors.
sen = ["안녕하십니까까까까", "녕안하하하하하하하하핳하하하하하하하하하하하", "이하하"]
tok = {
    name: tensor.to(device)
    for name, tensor in rm_tokenizer(sen, padding=True, truncation=True, return_tensors="pt").items()
}
output = rm_model(**tok).logits
output2 = [torch.tensor([logit]) for logit in output[:, 1]]
print(output, output2, sep="\n")

tensor([[-1.9446,  1.8862],
        [ 1.4886, -1.8402],
        [ 0.4976, -0.8518]], device='cuda:0', grad_fn=<AddmmBackward0>)
[tensor([1.8862]), tensor([-1.8402]), tensor([-0.8518])]


In [None]:
# Sampling-based generation settings: sampling enabled, `top_k` of 0.0 and `top_p` of 1.0,
# with the tokenizer's EOS id used as the padding id.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Special-token strings stripped from the decoded text before it is scored.
pattern = r'<pad>|</s>|<unk>'

# One PPO optimisation step per dataloader batch (a single pass over the training split).
for step, batch in tqdm(enumerate(ppo_trainer.dataloader), total=len(ppo_trainer.dataloader)):
    # The collator pads every query to the longest one in the batch. Keep only the
    # real tokens so that generation and the PPO update both see the same,
    # unpadded query (`step` takes a list of 1-D tensors).
    query_tensors = [
        ids[mask.bool()]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]

    #### Get a response from the seq2seq policy (beam search, capped at max_token_length)
    response_tensors = []
    for query in query_tensors:
        response = ppo_trainer.generate(query, max_length=max_token_length, num_beams=5)
        # Keep the WHOLE generated sequence. The previous `[-gen_len:]` slice (a
        # leftover from decoder-only examples, where the output also contains the
        # prompt) cut off the beginning of the translation, so the reward model and
        # the PPO update only ever saw a random-length tail of each response.
        response_tensors.append(response.squeeze(0))

    batch["response"] = [
        re.sub(pattern, '', tokenizer.decode(r, skip_special_tokens=True))
        for r in response_tensors
    ]

    #### Score the decoded responses with the reward model
    tok = rm_tokenizer(batch["response"], padding=True, truncation=True, return_tensors="pt")
    tok = {k: v.to(device) for k, v in tok.items()}
    with torch.no_grad():
        output = rm_model(**tok).logits
    # Reward = raw logit at index 1, one single-element tensor per response.
    rewards = [torch.tensor([score]) for score in output[:, 1].tolist()]

    #### PPO update and logging
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

100%|██████████| 390/390 [3:24:39<00:00, 31.49s/it]


In [None]:
# Persist the PPO-tuned model to Drive. Absolute path (like the other Drive paths in
# this notebook) so the target does not depend on the current working directory.
ppo_trainer.save_pretrained("/content/drive/MyDrive/Colab/T5-jeju/data/saved_models/rlhf_multi4")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Ask PyTorch to release cached, currently unused GPU memory back to the driver.
# NOTE(review): tensors still referenced by live objects (e.g. the models) are not freed by this.
torch.cuda.empty_cache()