In [2]:
# üß© Causal LM SFT with LoRA on KoGPT2
!pip -q install -U transformers datasets peft accelerate sentencepiece

import os, random, numpy as np, torch
from dataclasses import dataclass
from typing import Dict, List
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType

# ========== 0) Repro & perf ==========
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cuda.matmul.allow_tf32 = True

# ========== 1) Tokenizer / Model ==========
BASE_MODEL = "skt/kogpt2-base-v2"
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)

# ÌÖúÌîåÎ¶øÏö© ÌäπÏàò ÌÜ†ÌÅ∞(ÌÜ†ÌÅ∞ Í≤ΩÍ≥Ñ ÏïàÏ†ïÌôî)
B_INST = "### Instruction:"
B_RESP = "### Response:"
SPECIAL_TOKENS = {"additional_special_tokens": [B_INST, B_RESP]}
tok.add_special_tokens(SPECIAL_TOKENS)

if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
model.resize_token_embeddings(len(tok))

# ========== 2) LoRA ÏÑ§Ï†ï ==========
peft_conf = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
    task_type=TaskType.CAUSAL_LM,
    # GPT-2 Í≥ÑÏó¥ Ìò∏Ìôò ÌÉÄÍπÉ Î™®Îìà
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"]
)
model = get_peft_model(model, peft_conf)
model.print_trainable_parameters()

# ========== 3) ÏÜåÌòï ÏßÄÏãúÎ¨∏ Îç∞Ïù¥ÌÑ∞ÏÖã ==========
pairs = [
    {"prompt":"ÎÇ†Ïî® ÏöîÏïΩ Í∑úÏπô: 1) Ìïú Ï§Ñ 2) Ïù¥Î™®ÏßÄ Í∏àÏßÄ\nÏÑúÏö∏ Ïò§Îäò ÎÇ†Ïî® ÏïåÎ†§Ï§ò.",
     "response":"ÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®ºÏßÄ Î≥¥ÌÜµÏûÖÎãàÎã§."},
    {"prompt":"Ìïú Ï§ÑÎ°ú ÏöîÏïΩ: 'ÎåÄÏ§ëÍµêÌÜµ ÏöîÍ∏à Ïù∏ÏÉÅ ÎÖºÏùòÍ∞Ä ÏßÑÌñâ Ï§ëÏù¥Îã§.'",
     "response":"ÎåÄÏ§ëÍµêÌÜµ ÏöîÍ∏à Ïù∏ÏÉÅÏù¥ ÎÖºÏùò Îã®Í≥ÑÏóê ÏûàÎã§."},
    {"prompt":"Í∞ÑÎã® Î≤àÏó≠: 'ÏÇ¨Í≥ºÎäî Í±¥Í∞ïÏóê Ï¢ãÎã§' -> ÏòÅÏñ¥",
     "response":"Apples are good for health."},
    {"prompt":"ÎπÑÏ¶àÎãàÏä§ Ïù¥Î©îÏùº Ï≤´ Î¨∏Ïû• Ï†úÏïà(ÌïúÍµ≠Ïñ¥, Í≥µÏÜêÏ≤¥): ÎÇ©Í∏∞ Ïó∞Ïû• ÏöîÏ≤≠",
     "response":"ÏïàÎÖïÌïòÏÑ∏Ïöî, Í∑ÄÏÇ¨ ÌîÑÎ°úÏ†ùÌä∏Ïùò ÎÇ©Í∏∞ ÏùºÏ†ï Í¥ÄÎ†®ÌïòÏó¨ Ï°∞Ïã¨Ïä§ÎüΩÍ≤å Ïó∞Ïû•ÏùÑ ÏöîÏ≤≠ÎìúÎ¶ΩÎãàÎã§."},
]

def format_example(p, r, eos):
    return f"{B_INST}\n{p}\n\n{B_RESP}\n{r}{eos}"

train_texts = [format_example(d["prompt"], d["response"], tok.eos_token) for d in pairs]

# ÏÜåÎüâ Îç∞Ïù¥ÌÑ∞ ‚Üí upsamplingÏúºÎ°ú ÏàòÎ†¥ ÏïàÏ†ïÌôî
REPEAT = 60  # ÌïÑÏöî Ïãú 30~200ÏóêÏÑú Ï°∞Ï†à
train_texts = train_texts * REPEAT

raw_ds = Dataset.from_dict({"text": train_texts})

# ========== 4) ÌÜ†ÌÅ¨ÎÇòÏù¥Ï¶à + Î†àÏù¥Î∏î ÎßàÏä§ÌÇπ(ÏùëÎãµÎßå loss) ==========
def build_features(batch):
    texts = batch["text"]
    input_ids_list, attn_list, labels_list = [], [], []
    # "### Response:\n" ÌÜ†ÌÅ∞ ÏãúÌÄÄÏä§
    resp_tag_ids = tok(B_RESP + "\n", add_special_tokens=False)["input_ids"]

    def find_subseq(seq, sub):
        L, l = len(seq), len(sub)
        for i in range(L - l + 1):
            if seq[i:i+l] == sub:
                return i
        return -1

    for t in texts:
        enc = tok(t, max_length=512, truncation=True)
        input_ids = enc["input_ids"]
        attn = enc["attention_mask"]

        idx = find_subseq(input_ids, resp_tag_ids)
        if idx == -1:
            # ÏïàÏ†ÑÏû•Ïπò: ÌÉúÍ∑∏Î•º Î™ª Ï∞æÏúºÎ©¥ Ï†ÑÏ≤¥ -100
            labels = [-100] * len(input_ids)
        else:
            start = idx + len(resp_tag_ids)
            labels = [-100] * start + input_ids[start:]

        input_ids_list.append(input_ids)
        attn_list.append(attn)
        labels_list.append(labels)

    return {"input_ids": input_ids_list, "attention_mask": attn_list, "labels": labels_list}

ds_tok = raw_ds.map(build_features, batched=True, remove_columns=["text"])

# ========== 5) Collator: labelsÎäî ÏàòÎèô Ìå®Îî© ==========
@dataclass
class ResponseOnlyCollator:
    tokenizer: AutoTokenizer
    pad_to_multiple_of: int = 8

    def __call__(self, features: List[Dict]):
        # 1) labelsÎ•º Ïû†Ïãú Î∂ÑÎ¶¨Ìï¥ tokenizer.padÍ∞Ä Í±¥ÎìúÎ¶¨ÏßÄ ÏïäÍ≤å Ìï®
        labels_list = [f.pop("labels") for f in features]

        # 2) ÏûÖÎ†•Îßå Ìå®Îî©
        batch = self.tokenizer.pad(
            features,
            padding=True,
            max_length=None,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )

        # 3) labels ÏàòÎèô Ìå®Îî©(-100) ÌõÑ ÌÖêÏÑúÌôî
        max_len = batch["input_ids"].size(1)
        padded_labels = []
        for lab in labels_list:
            if len(lab) < max_len:
                lab = lab + [-100] * (max_len - len(lab))
            else:
                lab = lab[:max_len]
            padded_labels.append(lab)
        batch["labels"] = torch.tensor(padded_labels, dtype=torch.long)
        return batch

collator = ResponseOnlyCollator(tok)

# ========== 6) ÌïôÏäµ ÏÑ∏ÌåÖ ==========
try:
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
except Exception:
    bf16_ok = False
fp16_ok = torch.cuda.is_available() and not bf16_ok

args = TrainingArguments(
    output_dir="./kogpt2-lora-sft",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,          # ÏÜåÎüâ Îç∞Ïù¥ÌÑ∞ ‚Üí ÎÇÆÍ≤å
    num_train_epochs=8,          # ÌïÑÏöî Ïãú 6~20 ÏÇ¨Ïù¥ÏóêÏÑú Ï°∞Ï†à
    lr_scheduler_type="cosine",
    weight_decay=0.0,
    logging_steps=10,
    save_strategy="no",
    bf16=bf16_ok,
    fp16=fp16_ok,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok,
    tokenizer=tok,
    data_collator=collator
)

trainer.train()

# ========== 7) Ï∂îÎ°† Ïú†Ìã∏ (ÌïôÏäµ ÌÖúÌîåÎ¶øÍ≥º ÎèôÏùº) ==========
def generate(prompt, max_new_tokens=80, do_sample=False, top_p=0.9, temperature=0.7):
    text = f"{B_INST}\n{prompt}\n\n{B_RESP}\n"
    inputs = tok(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,    # ÏÜåÎüâ Îç∞Ïù¥ÌÑ∞ Í≥ºÏ†ÅÌï© ‚Üí Í∏∞Î≥∏ FalseÎ°ú Î≥¥ÏàòÏ†Å ÏÉùÏÑ±
            top_p=top_p, temperature=temperature,
            pad_token_id=tok.eos_token_id,
            eos_token_id=tok.eos_token_id
        )
    full = tok.decode(out[0], skip_special_tokens=False)
    # ÏùëÎãµ Î∂ÄÎ∂ÑÎßå Ï∂îÏ∂ú
    if B_RESP in full:
        ans = full.split(B_RESP, 1)[-1].strip()
    else:
        ans = full
    return ans.strip()

print("=== Îç∞Î™® Ï∂úÎ†• ===")
tests = [
    "ÏÑúÏö∏ ÎÇ¥Ïùº ÎÇ†Ïî®Î•º Ìïú Ï§ÑÎ°ú ÏöîÏïΩÌï¥Ï§ò.",
    "Ï†ïÏ§ëÌïú ÏùºÏ†ï Ï°∞Ïú® Î©îÏùº Ï≤´ Î¨∏Ïû• Ïç®Ï§ò.",
    "Ìïú Ï§ÑÎ°ú ÏöîÏïΩ: 'ÎèÑÎ°ú ÌôïÏû• Í≥µÏÇ¨Í∞Ä ÏßÄÏó∞ÎêòÍ≥† ÏûàÎã§.'",
    "Í∞ÑÎã® Î≤àÏó≠: 'Ìè¨ÎèÑÎäî Ìï≠ÏÇ∞Ìôî Ìö®Í≥ºÍ∞Ä ÏûàÎã§' -> ÏòÅÏñ¥",
]
for p in tests:
    print(p, "->", generate(p))


trainable params: 1,179,648 || all params: 126,345,984 || trainable%: 0.9337


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 51200, 'bos_token_id': 51200, 'pad_token_id': 51200}.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.413
20,3.1817
30,2.1255
40,1.423
50,1.092
60,0.9426
70,0.8502
80,0.8615
90,0.8439
100,0.8339


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== Îç∞Î™® Ï∂úÎ†• ===
ÏÑúÏö∏ ÎÇ¥Ïùº ÎÇ†Ïî®Î•º Ìïú Ï§ÑÎ°ú ÏöîÏïΩÌï¥Ï§ò. -> ÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®ºÏßÄ Î≥¥ÌÜµÏûÖÎãàÎã§. ^^ „ÄèÍ≥† ÎßêÌï©ÎãàÎã§. ^^ „ÄÇÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®ºÏßÄ Î≥¥ÌÜµÏûÖÎãàÎã§. ^^ „ÄÇÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®ºÏßÄ Î≥¥ÌÜµÏûÖÎãàÎã§. ^^ „ÄÇÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®ºÏßÄ Î≥¥ÌÜµÏûÖÎãàÎã§. ^^ „ÄÇÏÑúÏö∏ÏùÄ ÎßëÍ≥† ÎÇÆÍ∏∞Ïò® 28ÎèÑ, ÎØ∏ÏÑ∏Î®º
Ï†ïÏ§ëÌïú ÏùºÏ†ï Ï°∞Ïú® Î©îÏùº Ï≤´ Î¨∏Ïû• Ïç®Ï§ò. -> ÏïàÎÖïÌïòÏÑ∏Ïöî, Í∑ÄÏÇ¨ ÌîÑÎ°úÏ†ùÌä∏Ïùò ÏÑ±Í≥µÏùÑ Í±∞ÎëêÏóàÏäµÎãàÎã§. health.presented.go.kr/spectes are good for health.go.kr/spectes.go.kr/spare good for health.go.kr)ÏóêÏÑú ÌôïÏù∏ÌïòÏÑ∏Ïöî. health.go.kr/spectes.go.kr
Ìïú Ï§ÑÎ°ú ÏöîÏïΩ: 'ÎèÑÎ°ú ÌôïÏû• Í≥µÏÇ¨Í∞Ä ÏßÄÏó∞ÎêòÍ≥† ÏûàÎã§.' -> ÎèÑÎ°ú ÌôïÏû• Í≥µÏÇ¨Í∞Ä ÏßÄÏó∞ÎêòÏóàÎã§."
ÎèÑÎ°ú ÌôïÏû• Í≥µÏÇ¨Í∞Ä ÏßÄÏó∞ÎêòÍ≥† ÏûàÎã§. health. health. health. health. health. health. health. health. health. health. health. health. health. heal
Í∞ÑÎã® Î≤àÏó≠: 'Ìè¨ÎèÑÎäî Ìï≠ÏÇ∞Ìôî Ìö®Í