In [2]:
!pip install trl sacrebleu



In [3]:
import os, json, random
from pathlib import Path
from typing import Dict, Any, List

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, PeftModel
import trl
from trl import SFTTrainer, SFTConfig
import sacrebleu

# Seed Python's and PyTorch's random number generators with one shared value.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Only sets the variable when the user has not already defined it.
# NOTE(review): this runs after `import torch`; confirm PyTorch still honors
# PYTORCH_ENABLE_MPS_FALLBACK when it is set at this point.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

# Accelerator preference order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Record the environment in the notebook output.
print("device:", device)
print("torch:", torch.__version__)
print("trl:", trl.__version__)


device: cuda
torch: 2.9.0+cu126
trl: 0.26.1


In [6]:
import json
from pathlib import Path
from datasets import Dataset

DATA_PATH = Path("train_data.json")


def parse_records(text: str) -> list:
    """Parse training records from JSON or JSON Lines text.

    The whole text is first tried as a single JSON document, so a JSON array
    written on one line yields its elements rather than one nested list.
    If that fails, every non-blank line is parsed as its own JSON value (JSONL).

    Args:
        text: Non-empty file contents.

    Returns:
        A list of parsed records. A single top-level JSON value that is not a
        list is wrapped in a one-element list.

    Raises:
        json.JSONDecodeError: If the text is neither valid JSON nor valid JSONL.
    """
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Not one JSON document (e.g. "Extra data" after the first object): treat as JSONL.
        return [json.loads(line) for line in text.splitlines() if line.strip()]
    return parsed if isinstance(parsed, list) else [parsed]


# 1. Load the data
text = DATA_PATH.read_text(encoding="utf-8").strip()

if not text:
    raise RuntimeError("train_data.jsonÏù¥ ÎπÑÏñ¥ ÏûàÏäµÎãàÎã§.")

raw_data = parse_records(text)

# 2. Record normalization helper
def normalize_item(item: dict) -> dict:
    """Flatten one raw record into the five string fields used downstream.

    The nested "idiom" dictionary's "term" and "meaning" are lifted to the top
    level. All values are stripped of surrounding whitespace.

    Args:
        item: Raw record. Must contain string values for "instruction",
            "input" and "output". "idiom" is optional and may be null; its
            "term" / "meaning" entries may also be missing or null.

    Returns:
        A dict with keys "instruction", "input", "output", "term", "meaning".
        "term" and "meaning" are empty strings when unavailable.

    Raises:
        KeyError: If "instruction", "input" or "output" is missing.
    """
    # `or {}` also covers an explicit null, which .get("idiom", {}) would pass through.
    idiom_data = item.get("idiom") or {}

    return {
        "instruction": item["instruction"].strip(),
        "input": item["input"].strip(),
        "output": item["output"].strip(),
        "term": (idiom_data.get("term") or "").strip(),
        "meaning": (idiom_data.get("meaning") or "").strip(),
    }

# 3. Build the dataset from the normalized records
norm = list(map(normalize_item, raw_data))  # one flat dict per raw record
ds = Dataset.from_list(norm)

# Report the number of records.
print(f"‚úÖ Ï†ÑÏ≤¥ ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: {len(ds)}")

# Show the first record, but only when the dataset is non-empty.
print("ÏòàÏãú Îç∞Ïù¥ÌÑ∞ Ìï≠Î™© (Ï≤´ Î≤àÏß∏):")
if len(ds):
    print(ds[0])

‚úÖ Ï†ÑÏ≤¥ ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ Í∞úÏàò: 600
ÏòàÏãú Îç∞Ïù¥ÌÑ∞ Ìï≠Î™© (Ï≤´ Î≤àÏß∏):
{'instruction': "Don't translate it in Korean, but translate it according to Korean culture", 'input': 'I actually saw him at the mall yesterday, no cap.', 'output': 'ÎÇò Ïñ¥Ï†ú ÏáºÌïëÎ™∞ÏóêÏÑú Í±î ÏßÑÏßúÎ°ú Î¥§Îã§ÎãàÍπå, Íµ¨Îùº ÏïÑÎãò.', 'term': 'No cap', 'meaning': "ÏßÑÏßúÏïº, Í±∞ÏßìÎßê ÏïÑÎãàÏïº ('Cap'ÏùÄ Í±∞ÏßìÎßêÏùÑ ÎúªÌï®)"}


In [8]:
from typing import Dict, Any

# System instructions placed at the top of every prompt by build_prompt(),
# which strips the leading/trailing newlines of this triple-quoted literal.
SYSTEM_PROMPT = """
You are a professional translator who specializes in translating English slang and memes into Korean internet slang and trendy expressions.
Your goal is to make the translation sound like a "close friend" or a "Korean netizen" speaking.

**CRITICAL RULES:**
1. **Never use polite language (Honorifics/Jon-dae-mal).** Use ONLY casual speech (Banmal).
2. Do not translate literally. Use Korean slang, memes, and community vibes aggressively.
3. If the original text is sarcastic or rude, preserve that tone perfectly.
4. Output ONLY the Korean translation. No explanations.
"""

def build_prompt(example: Dict[str, Any]) -> str:
    """Render one example as the text prompt that precedes the model's answer.

    The prompt has three sections: "### System:" (the stripped SYSTEM_PROMPT),
    "### User:" (instruction line, optional note, input line and a fixed
    output-constraint line) and a trailing, empty "### Assistant:" header.

    Args:
        example: Mapping with "instruction" and "input"; "term" and "meaning"
            are optional.

    Returns:
        The full prompt string, ending with "### Assistant:\\n".
    """
    # The "[Note]" section appears only if at least one of term / meaning is truthy.
    note_lines = []
    if example.get("term") or example.get("meaning"):
        note_lines = [
            "[Note]",
            f"- Expression: {example.get('term', '')}",
            f"- Meaning: {example.get('meaning', '')}",
            "",
            "",  # the two empty entries produce the blank line after the note
        ]
    ref = "\n".join(note_lines)

    user = "".join([
        f"ÏßÄÏãúÏÇ¨Ìï≠: {example['instruction']}\n\n",
        ref,
        f"ÏûÖÎ†• Î¨∏Ïû•: {example['input']}\n",
        "Ï∂úÎ†• Ï°∞Í±¥: ÌïúÍµ≠Ïñ¥ Î≤àÏó≠ 1Í∞úÎßå Ï∂úÎ†•",
    ])

    return f"### System:\n{SYSTEM_PROMPT.strip()}\n\n### User:\n{user}\n\n### Assistant:\n"

def to_prompt_completion(ex: Dict[str, Any]) -> Dict[str, str]:
    """Map a normalized record to a {"prompt", "completion"} pair.

    The prompt comes from build_prompt(ex); the completion is the record's
    "output" value, unchanged.
    """
    prompt_text = build_prompt(ex)
    return dict(prompt=prompt_text, completion=ex["output"])

# Convert every record into a prompt/completion pair and drop the original columns.
# `ds` is a single Dataset (no "train" split key), so column_names is read from it directly.
pc = ds.map(to_prompt_completion, remove_columns=ds.column_names)

# Sanity check: first 600 characters of the first prompt, then its completion
# (again indexing the Dataset directly, without a "train" key).
print(pc[0]["prompt"][:600])
print("completion:", pc[0]["completion"])

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

### System:
You are a professional translator who specializes in translating English slang and memes into Korean internet slang and trendy expressions.
Your goal is to make the translation sound like a "close friend" or a "Korean netizen" speaking.

**CRITICAL RULES:**
1. **Never use polite language (Honorifics/Jon-dae-mal).** Use ONLY casual speech (Banmal).
2. Do not translate literally. Use Korean slang, memes, and community vibes aggressively.
3. If the original text is sarcastic or rude, preserve that tone perfectly.
4. Output ONLY the Korean translation. No explanations.

### User:
ÏßÄÏãúÏÇ¨Ìï≠:
completion: ÎÇò Ïñ¥Ï†ú ÏáºÌïëÎ™∞ÏóêÏÑú Í±î ÏßÑÏßúÎ°ú Î¥§Îã§ÎãàÍπå, Íµ¨Îùº ÏïÑÎãò.


In [11]:
from google.colab import drive
# Mount Google Drive; OUTPUT_DIR below lives under this mount point.
drive.mount('/content/drive')

# Where checkpoints and the final model are written, and which base model to fine-tune.
OUTPUT_DIR = "/content/drive/MyDrive/V3_hyperclova-translator-mps"
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"

# 1. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.padding_side = "right"
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Load the base model in half precision.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # original note: chosen for MPS stability (the recorded run printed device: cuda)
    trust_remote_code=True,
    device_map="auto"
)
# Turn off the generation cache on the model config.
# NOTE(review): presumably because the cache is not needed during training — confirm.
base_model.config.use_cache = False
print("‚úÖ Î™®Îç∏ Î°úÎìú ÏôÑÎ£å")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Î™®Îç∏ Î°úÎìú ÏôÑÎ£å


In [17]:
# Module names to look for when choosing LoRA targets, in the order they are reported.
preferred_targets = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

def guess_target_modules(model) -> List[str]:
    """Return the preferred target names that actually occur in `model`.

    A name counts as present when it equals the last dot-separated component
    of any entry yielded by ``model.named_modules()``.

    Args:
        model: Any object exposing ``named_modules()`` yielding (name, module) pairs.

    Returns:
        The matching names, ordered as in ``preferred_targets``; empty if none match.
    """
    leaf_names = {qualified.rsplit(".", 1)[-1] for qualified, _module in model.named_modules()}
    return [candidate for candidate in preferred_targets if candidate in leaf_names]

# Choose the LoRA targets: the detected module names, or the "all-linear"
# shortcut string when none of the preferred names exist in the model.
targets = guess_target_modules(base_model)
if targets:
    print("‚úÖ target_modules:", targets)
else:
    targets = "all-linear"
    print("‚ö†Ô∏è target_modules ÏûêÎèô ÌÉêÏßÄ Ïã§Ìå® ‚Üí all-linear ÏÇ¨Ïö©")

# LoRA adapter settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=targets,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
)

# Supervised fine-tuning configuration.
# NOTE(review): no evaluation split is used and the recorded training loss falls
# to ~0.03 — consider holding out data to check for overfitting.
training_args = SFTConfig(
    # --- output, logging, checkpoints ---
    output_dir=OUTPUT_DIR,
    report_to="none",
    logging_steps=50,
    save_strategy="steps",
    save_steps=200,
    eval_strategy="no",  # the trainer is built without an evaluation dataset
    seed=SEED,
    # --- optimisation ---
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step per device
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_torch",
    # --- precision ---
    fp16=True,
    bf16=False,
    # --- data handling ---
    packing=False,
    completion_only_loss=True,
    # NOTE(review): the training dataset has "prompt"/"completion" columns and no
    # "text" column — confirm this field is still needed.
    dataset_text_field="text",
)

# Build the SFT trainer on the prompt/completion dataset with the LoRA config.
# The tokenizer configured in the model-loading cell (right padding, pad-token
# fallback, trust_remote_code) is passed explicitly as `processing_class`, so
# training uses the same tokenizer instance that is saved after training,
# rather than one the trainer would otherwise load on its own.
trainer = SFTTrainer(
    model=base_model,
    args=training_args,
    train_dataset=pc,
    processing_class=tokenizer,
    peft_config=peft_config,
)

print("‚úÖ trainer Ï§ÄÎπÑ ÏôÑÎ£å")


‚úÖ target_modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']




Adding EOS to train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/600 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


‚úÖ trainer Ï§ÄÎπÑ ÏôÑÎ£å


In [18]:
# Run the fine-tuning loop configured on `trainer`.
print("üöÄ ÌïôÏäµ ÏãúÏûë")
trainer.train()
print("‚úÖ ÌïôÏäµ Ï¢ÖÎ£å")

# Write the trained model and the tokenizer side by side into OUTPUT_DIR.
# NOTE(review): presumably only the LoRA adapter weights are saved here, since
# the trainer was built with a peft_config — confirm before loading elsewhere.
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"üéâ Ï†ÄÏû• ÏôÑÎ£å: {OUTPUT_DIR}")


üöÄ ÌïôÏäµ ÏãúÏûë


Step,Training Loss
50,2.7995
100,1.8136
150,1.6034
200,0.9403
250,0.6521
300,0.429
350,0.1639
400,0.1368
450,0.0692
500,0.0269


‚úÖ ÌïôÏäµ Ï¢ÖÎ£å
üéâ Ï†ÄÏû• ÏôÑÎ£å: /content/drive/MyDrive/V3_hyperclova-translator-mps


In [None]:
import shutil
import os

source = '/content/V3_hyperclova-translator-mps'  # local folder to move
destination = '/content/drive/MyDrive/my_lab_folder'  # target path inside Drive

# NOTE(review): OUTPUT_DIR in the model-loading cell already points into Drive,
# so this local folder may not exist on a fresh run — confirm this cell is still needed.


def move_folder(src: str, dst: str) -> bool:
    """Move `src` to `dst`, reporting the outcome instead of raising.

    Safe to re-run: when `src` does not exist (for example because it was
    already moved) nothing happens and a message is printed.

    Args:
        src: Path of the folder (or file) to move.
        dst: Destination path. Per shutil.move, if it is an existing
            directory, `src` is moved inside it.

    Returns:
        True if the move succeeded, False if it was skipped or failed.
    """
    if not os.path.exists(src):
        print(f"Skipped: source not found: {src}")
        return False
    try:
        shutil.move(src, dst)
    except OSError as e:  # shutil.Error is an OSError subclass
        print(f"Ïò§Î•ò Î∞úÏÉù: {e}")
        return False
    print(f"Ïù¥Îèô ÏôÑÎ£å: {dst}")
    return True


# Run the move
moved = move_folder(source, destination)

Ïù¥Îèô ÏôÑÎ£å: /content/drive/MyDrive/my_lab_folder
