## imports

In [None]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, BitsAndBytesConfig
import torch

from peft import LoraConfig

from dataclasses import dataclass, field
from typing import Dict, Optional

import os

from trl import DPOTrainer
from huggingface_hub import login
import argparse

  from .autonotebook import tqdm as notebook_tqdm


## Dataset

* LLM을 통한 Feedback 자동 수집 자료 https://arxiv.org/abs/2310.01377
* Implicit Dataset

In [1]:
from datasets import load_dataset

## 원시 데이터 로드
ds = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
ds_split = ds["train"].train_test_split(test_size = 0.5, seed = 42)

## For SFT
sft_ds = ds_split["train"]
sft_ds = sft_ds.rename_column("chosen", "messages").remove_columns([col for col in sft_ds.column_names if col != "chosen"]).train_test_split(test_size = 0.1, seed = 42)
sft_ds["train"].to_json("./data/sft_train_dataset.json", orient = "records")
sft_ds["test"].to_json("./data/sft_test_dataset.json", orient = "records")

## Implicit Prompt -> Explicit Prompt
dpo_ds = ds_split["test"].map(
    lambda sample: {
        "prompt": [{"role": "user", "content": sample["prompt"]}],
        "chosen": [content for content in sample["chosen"] if content["role"] == "assistant"],
        "rejected": [content for content in sample["rejected"] if content["role"] == "assistant"]
    }
)

dpo_ds = dpo_ds.remove_columns([col for col in dpo_ds.column_names if col not in ["prompt", "chosen", "rejected"]]).train_test_split(test_size = 0.1, seed = 42)
dpo_ds["train"].to_json("./data/dpo_train_dataset.json", orient = "records")
dpo_ds["test"].to_json("./data/dpo_test_dataset.json", orient = "records")

Creating json from Arrow format: 100%|██████████| 28/28 [00:01<00:00, 15.83ba/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 19.12ba/s]
Creating json from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  8.52ba/s]
Creating json from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 10.96ba/s]


10251164

In [None]:
# Standard format
## Explicit prompt (recommended)
preference_example = {"prompt": "The sky is", "chosen": " blue.", "rejected": " green."}
# Implicit prompt
preference_example = {"chosen": "The sky is blue.", "rejected": "The sky is green."}

# Conversational format
## Explicit prompt (recommended)
preference_example = {"prompt": [{"role": "user", "content": "What color is the sky?"}],
                      "chosen": [{"role": "assistant", "content": "It is blue."}],
                      "rejected": [{"role": "assistant", "content": "It is green."}]}
## Implicit prompt
preference_example = {"chosen": [{"role": "user", "content": "What color is the sky?"},
                                 {"role": "assistant", "content": "It is blue."}],
                      "rejected": [{"role": "user", "content": "What color is the sky?"},
                                   {"role": "assistant", "content": "It is green."}]}

## SFT

* 일단 기존 방식대로 수행

## DPO

DPOTrainer 소스 코드 https://github.com/huggingface/trl/blob/d625c5533a6b1c84d3565c8080857f6bb81c538a/trl/trainer/dpo_trainer.py#L1145-L1149

* SFT보다 learning_rate를 훨씬 작게 설정해줘야 함
* packing은 쌍으로 존재하는 데이터에서 불가능. padding_free는 가능

In [13]:
print(tokenizer.decode(dpo_trainer.train_dataset[0]["chosen_input_ids"]))

To create a ManyToManyField in Django, you need to define a ManyToManyField on both models involved in the relationship.

Assuming you already have the models for which you want to create a many-to-many relationship, you can add the ManyToManyField as follows:

1. In the model where you want to define the many-to-many field, add the following line of code:
```python
class MyModel(models.Model):
    # fields and other descriptors for MyModel
    related_models = models.ManyToManyField('OtherModel', related_name='my_model_set')
```
Replace 'OtherModel' with the name of the model you want to create a many-to-many relationship with, and'my\_model\_set' with the name you want to use for the reverse relationship on the other model.

2. In the model where you want to define the reverse many-to-many field, add the following line of code:
```python
class OtherModel(models.Model):
    # fields and other descriptors for OtherModel
    my_model = models.ManyToManyField('MyModel', related_name='oth

## 모델 테스트

In [9]:
import os
import torch
from random import randint
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftConfig, PeftModel

`-` 테스트에 사용될 데이터

In [3]:
test_dataset = load_dataset("json", data_files = os.path.join("", "data/dpo_test_dataset.json"), split = "train")
random_idx = randint(0, len(test_dataset))

In [4]:
for k, v in test_dataset[random_idx].items():
    print(f"{k}: {v[0]["content"]}\n\n")

prompt: come up with a multiple choice question from a random topic and ask me, awarding me points starting with 1 point per question, and increasing the number of points per question by 1 for the first five questions and reaching a max of 10 points per question. 
Ask 20 questions, one at a time and don't tell me the answer until you get my guess and check with the correct answer. 
Pretend you are a game show host with every response and say a stupid pun related to the question after each response.


chosen: Welcome to "Quiz Your Brain," the game show where knowledge is power, and puns are plentiful! Are you ready to test your wisdom and have a groan-worthy good time? Let's get started!

Question 1: In which European country would you find the city of Prague? (a) France, (b) Germany, (c) Italy, or (d) Czechia. Remember, it's a beautiful “praise” for your geographical knowledge!

*Points: 1


rejected: Sure, I can do that. Here's the first question:

What is the capital of France?

(1 p

In [5]:
messages = test_dataset[random_idx]["prompt"]
chosen = test_dataset[random_idx]["chosen"]

`-` 원시 모델 결과

In [6]:
origin_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

origin_model = AutoModelForCausalLM.from_pretrained(origin_model_name, use_cache = False, device_map = "cuda:0", dtype = torch.bfloat16)
origin_tokenizer = AutoTokenizer.from_pretrained(origin_model_name, use_fast = True)
origin_tokenizer.pad_token = origin_tokenizer.eos_token
origin_tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:21<00:00,  5.39s/it]


In [7]:
terminators = [origin_tokenizer.eos_token_id]

input_ids = origin_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,   ## 생성 시에 맨 뒤 generation 시작하라는 프롬프트 삽입
    return_tensors = "pt").to(origin_model.device)

outputs = origin_model.generate(
    input_ids,
    max_new_tokens = 512,
    eos_token_id = terminators,
    do_sample = True,
    temperature = 0.7,
    top_p = 0.95
)

response = outputs[0][input_ids.shape[-1]:]
print(f"prompt:\n{messages[0]["content"]}\n")
print(f"chosen:\n{chosen[0]["content"]}\n")
print(f"generate:\n{origin_tokenizer.decode(response, skip_special_tokens = True)}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


prompt:
come up with a multiple choice question from a random topic and ask me, awarding me points starting with 1 point per question, and increasing the number of points per question by 1 for the first five questions and reaching a max of 10 points per question. 
Ask 20 questions, one at a time and don't tell me the answer until you get my guess and check with the correct answer. 
Pretend you are a game show host with every response and say a stupid pun related to the question after each response.

chosen:
Welcome to "Quiz Your Brain," the game show where knowledge is power, and puns are plentiful! Are you ready to test your wisdom and have a groan-worthy good time? Let's get started!

Question 1: In which European country would you find the city of Prague? (a) France, (b) Germany, (c) Italy, or (d) Czechia. Remember, it's a beautiful “praise” for your geographical knowledge!

*Points: 1

generate:
Welcome to "Quiz Up," the game show where knowledge is power and puns are the spark plu

In [8]:
del origin_model
del origin_tokenizer
torch.cuda.empty_cache()

`-` 파인튜닝 후 모델

In [10]:
sft_adapter_name = "./results/test"

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

config = PeftConfig.from_pretrained(sft_adapter_name)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config = bnb_config,
    use_cache = True,
    dtype = torch.bfloat16,
    device_map = "cuda:0"
)
model = PeftModel.from_pretrained(model, sft_adapter_name)

tokenizer = AutoTokenizer.from_pretrained(sft_adapter_name, use_fast = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:18<00:00,  4.71s/it]


In [11]:
terminators = [tokenizer.eos_token_id]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,   ## 생성 시에 맨 뒤 generation 시작하라는 프롬프트 삽입
    return_tensors = "pt").to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens = 512,       ## prompt 제외 출력 토큰 수. max_length는 prompt 포함
    eos_token_id = terminators,
    do_sample = True,
    temperature = 0.7,
    top_p = 0.95
)

response = outputs[0][input_ids.shape[-1]:]
print(f"prompt:\n{messages[0]["content"]}\n")
print(f"chosen:\n{chosen[0]["content"]}\n")
print(f"generate:\n{tokenizer.decode(response, skip_special_tokens = True)}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


prompt:
come up with a multiple choice question from a random topic and ask me, awarding me points starting with 1 point per question, and increasing the number of points per question by 1 for the first five questions and reaching a max of 10 points per question. 
Ask 20 questions, one at a time and don't tell me the answer until you get my guess and check with the correct answer. 
Pretend you are a game show host with every response and say a stupid pun related to the question after each response.

chosen:
Welcome to "Quiz Your Brain," the game show where knowledge is power, and puns are plentiful! Are you ready to test your wisdom and have a groan-worthy good time? Let's get started!

Question 1: In which European country would you find the city of Prague? (a) France, (b) Germany, (c) Italy, or (d) Czechia. Remember, it's a beautiful “praise” for your geographical knowledge!

*Points: 1

generate:
Sure, I'd be happy to play a game with you! Here's my first question:

What is the worl

In [12]:
del model
del tokenizer
torch.cuda.empty_cache()

`-` DPO

In [15]:
dpo_adapter_name = "./results/dpo-reuse/checkpoint-6854"

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

config = PeftConfig.from_pretrained(f"{dpo_adapter_name}/policy")
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config = bnb_config,
    use_cache = True,
    dtype = torch.bfloat16,
    device_map = "cuda:0"
)
model = PeftModel.from_pretrained(model, f"{dpo_adapter_name}/policy")

tokenizer = AutoTokenizer.from_pretrained(dpo_adapter_name, use_fast = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.31s/it]


In [16]:
terminators = [tokenizer.eos_token_id]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,   ## 생성 시에 맨 뒤 generation 시작하라는 프롬프트 삽입
    return_tensors = "pt").to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens = 512,       ## prompt 제외 출력 토큰 수. max_length는 prompt 포함
    eos_token_id = terminators,
    do_sample = True,
    temperature = 0.7,
    top_p = 0.95
)

response = outputs[0][input_ids.shape[-1]:]
print(f"prompt:\n{messages[0]["content"]}\n")
print(f"chosen:\n{chosen[0]["content"]}\n")
print(f"생성답변:\n{tokenizer.decode(response, skip_special_tokens = True)}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


prompt:
come up with a multiple choice question from a random topic and ask me, awarding me points starting with 1 point per question, and increasing the number of points per question by 1 for the first five questions and reaching a max of 10 points per question. 
Ask 20 questions, one at a time and don't tell me the answer until you get my guess and check with the correct answer. 
Pretend you are a game show host with every response and say a stupid pun related to the question after each response.

chosen:
Welcome to "Quiz Your Brain," the game show where knowledge is power, and puns are plentiful! Are you ready to test your wisdom and have a groan-worthy good time? Let's get started!

Question 1: In which European country would you find the city of Prague? (a) France, (b) Germany, (c) Italy, or (d) Czechia. Remember, it's a beautiful “praise” for your geographical knowledge!

*Points: 1

생성답변:
Host: Welcome to the exciting game of "Trivia Time!" where you can test your knowledge and 

In [35]:
del dpo_model
del dpo_tokenizer
torch.cuda.empty_cache()