In [3]:
import copy
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# GPU가 있으면 사용
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model_name = "meta-llama/Llama-3.2-1B"
base_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA 설정: r, alpha, dropout 등은 상황에 맞게 조절합니다.
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["mlp.down_proj",
                    "mlp.gate_proj",
                    "mlp.up_proj",
                    "self_attn.k_proj",
                    "self_attn.o_proj",
                    "self_attn.q_proj",
                    "self_attn.v_proj"],  # 모델에 따라 target 모듈 이름이 다를 수 있습니다.
    lora_dropout=0.1,
    bias="none"
)

# base_model에 LoRA 어댑터 적용 (원래 파라미터는 동결되고, LoRA 파라미터만 업데이트됨)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.to(device)

# 파인튜닝 동안 KL 계산을 위한 기준 모델(reference model)은 사전학습된 원 모델을 그대로 사용합니다.
ref_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
ref_model.eval()  # 기준 모델은 업데이트하지 않습니다.


trainable params: 2,818,048 || all params: 1,238,632,448 || trainable%: 0.2275


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [4]:

#############################
# 2. 더미 RL 데이터셋 정의
#############################

class DummyDataset(Dataset):
    """
    간단한 더미 데이터셋: 
      - 고정된 프롬프트("Hello, how are you?")를 토크나이즈하여 입력으로 사용합니다.
      - 행동(actions)은 입력 토큰과 동일하게 설정 (실제 상황에서는 모델이 생성한 행동)
      - old_log_probs와 advantages는 더미 값입니다.
    """
    def __init__(self, tokenizer, num_samples=100, seq_len=16):
        self.tokenizer = tokenizer
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.prompt = "Hello, how are you?"

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.prompt,
            return_tensors="pt",
            max_length=self.seq_len,
            padding="max_length",
            truncation=True,
        )
        input_ids = encoding["input_ids"].squeeze(0)         # [seq_len]
        attention_mask = encoding["attention_mask"].squeeze(0)   # [seq_len]
        # 더미 행동: 여기서는 단순히 input_ids를 행동으로 사용
        actions = input_ids.clone()
        # 더미 old_log_probs 및 advantages (각 토큰에 대해)
        old_log_probs = torch.randn(self.seq_len)
        advantages = torch.randn(self.seq_len)
        # 더미 reward (추후 확장 시 활용)
        rewards = torch.randn(self.seq_len)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "actions": actions,
            "old_log_probs": old_log_probs,
            "advantages": advantages,
            "rewards": rewards,
        }


In [8]:
from transformers import AutoTokenizer
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForCausalLM

model_name = "Qwen/Qwen1.5-0.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

dataset = load_dataset("trl-lib/tldr", split="train")

def reward_len(completions, **kwargs):
    return [-abs(20 - len(completion)) for completion in completions]
training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO", logging_steps=10)
trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,  
)


TypeError: GRPOTrainer.__init__() got an unexpected keyword argument 'tokenizer'

In [None]:

# 학습 시작
trainer.train()
