In [None]:
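# Run in a Colab cell: install Unsloth from source, then the version-capped training stack without dependency resolution.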
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-lnq1per4/unsloth_94464c746e3644e28757e0960b4cc7b6
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-lnq1per4/unsloth_94464c746e3644e28757e0960b4cc7b6
  Resolved https://github.com/unslothai/unsloth.git to commit cefdd0f15953fd848b4a675184eab5167e22469e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.12.4 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.12.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.gi

In [None]:
from unsloth import FastLanguageModel
import torch

# ---------------------------------------------------------
# Base checkpoint to fine-tune (Hugging Face model ID).
model_name = "Qwen/Qwen3-0.6B"
# ---------------------------------------------------------

max_seq_length = 2048 # context length
dtype = None # None lets the loader pick automatically (Float16/Bfloat16)
load_in_4bit = True # 4-bit quantization to save memory (required on free GPUs)

# Load the model and tokenizer.
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_..." # a token is needed here for private models
    )
    print(f"✅ {model_name} 모델 로드 성공!")
except Exception as e:
    print(f"❌ 모델 로드 실패: {e}")
    print("팁: 해당 모델이 Hugging Face에 실제로 존재하는지, 혹은 오타가 없는지 확인해주세요.")
    print("대체 추천 모델: 'Qwen/Qwen2.5-0.5B-Instruct'")
    # Re-raise after printing the hints: continuing would run get_peft_model()
    # on an undefined `model` (NameError on a fresh kernel) or, worse, on a
    # stale model left over from an earlier run of this cell.
    raise

# LoRA (Low-Rank Adaptation) configuration: adapters are attached to the
# attention projections and the MLP projections listed in target_modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


==((====))==  Unsloth 2025.12.5: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/576M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

✅ Qwen/Qwen3-0.6B 모델 로드 성공!


Unsloth 2025.12.5 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# =========================================================
# 0) 라이브러리 세팅
# =========================================================
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# =========================================================
# 1) Load the MT parallel corpus (lemon-mint wiki augmented)
#    - columns: english, korean
#    - split: train (503,245)
# =========================================================
mt_raw = load_dataset(
    "lemon-mint/korean_english_parallel_wiki_augmented_v1",
    split="train",
)

# Hold out 0.5% of the pairs for validation; the fixed seed makes the split repeatable.
mt_split = mt_raw.train_test_split(test_size=0.005, seed=3407)
mt_train, mt_eval = mt_split["train"], mt_split["test"]

# =========================================================
# 2) Prompt format: train MT in an instruction-tuning style
# =========================================================
# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def mt_formatting_func(batch):
    """Render a batch of English/Korean pairs as instruction-style training texts.

    Args:
        batch: Mapping with parallel "english" and "korean" sequences
            (the batched form passed by ``Dataset.map``).

    Returns:
        ``{"text": [...]}`` with one prompt per usable pair, each terminated by
        the module-level ``EOS_TOKEN``. Pairs where either side is ``None`` or
        empty are dropped, so the result may be shorter than the input batch.
        No length-based filtering is applied.
    """
    prompt_head = (
        "### Instruction:\n"
        "Translate the following English text into Korean.\n\n"
        "### Input:\n"
    )
    # Keep only pairs where both sides are present and non-empty.
    usable_pairs = (
        (source, target)
        for source, target in zip(batch["english"], batch["korean"])
        if source is not None
        and target is not None
        and len(source) >= 1
        and len(target) >= 1
    )
    return {
        "text": [
            f"{prompt_head}{source}\n\n### Output:\n{target}" + EOS_TOKEN
            for source, target in usable_pairs
        ]
    }

# Render both splits into the single "text" column the trainer reads; the raw
# english/korean columns are removed.
mt_train = mt_train.map(mt_formatting_func, batched=True, remove_columns=mt_train.column_names)
mt_eval  = mt_eval.map(mt_formatting_func,  batched=True, remove_columns=mt_eval.column_names)

print("MT train example:\n", mt_train[0]["text"][:500])

# =========================================================
# 3) Training configuration (MT pre-finetuning)
#    - The corpus is large, so raise max_steps or switch to num_train_epochs
#      for a longer run.
#    - The values below are conservative defaults for a quick first improvement.
# =========================================================
mt_trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=mt_train,
    eval_dataset=mt_eval,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,  # usually a win (memory/speed) for MT data with many short sentences; set to False if it causes problems
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        warmup_steps=100,
        max_steps=800,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        # eval_steps only takes effect when the evaluation strategy is "steps";
        # without it the eval_dataset passed above is never evaluated.
        eval_strategy="steps",
        eval_steps=200,
        # Match the training batch size so evaluation does not use the larger
        # default batch on a memory-constrained GPU.
        per_device_eval_batch_size=2,
        save_steps=200,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="outputs_mt_pretrain",
        report_to="none",
    ),
)

mt_stats = mt_trainer.train()
print("✅ MT pre-finetuning done!")

# =========================================================
# 4) Quick smoke test with the MT pre-finetuned model
# =========================================================
from unsloth import FastLanguageModel
# Put the model into inference mode before generating.
FastLanguageModel.for_inference(model)

def translate_en2ko(text_en: str, max_new_tokens: int = 256):
    """Translate one English text into Korean with the module-level model.

    Builds the same instruction prompt used for MT fine-tuning (without the
    answer), generates greedily, and returns only the newly generated text.

    Args:
        text_en: English source text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation with special tokens removed and
        surrounding whitespace stripped.

    Relies on the globals ``tokenizer`` and ``model``.
    """
    prompt = (
        "### Instruction:\n"
        "Translate the following English text into Korean.\n\n"
        "### Input:\n"
        f"{text_en}\n\n"
        "### Output:\n"
    )
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    # Greedy decoding. No temperature is passed: it has no meaning when
    # do_sample=False and only triggers a generation-config warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )
    # Decode only the tokens after the prompt. Splitting the full decoded text
    # on "### Output:\n" returns the wrong span whenever the source text itself
    # contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Translate one sample sentence and print the result.
print("\n[MT Test]")
print(translate_en2ko("The quick brown fox jumps over the lazy dog."))


MT train example:
 ### Instruction:
Translate the following English text into Korean.

### Input:
Radio Romania International broadcasts daily shows under the heading "Romania Live": a summary of the shows produced by the main departments in the Romanian Radio Broadcasting Corporation. This summary is broadcast around the clock via satellite, on the internet (in RealAudio), and also on short waves (to Central and Western Europe and Israel), 8 hours per day. RRI 1 has daily broadcasts around the clock in Romanian a


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 124,533 | Num Epochs = 1 | Total steps = 800
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 10,092,544 of 606,142,464 (1.67% trained)


Step,Training Loss
10,2.3466
20,2.3323


In [None]:
# =========================================================
# Stage-2: 내 자체 데이터셋으로 이어서 fine-tuning (실험 조건 유지)
# - MT pre-finetuning 이후 같은 model 객체로 연속 학습
# - 프롬프트/포맷/훈련 조건은 사용자가 준 코드 그대로 유지
# =========================================================

import json
from datasets import Dataset

# Path of the Stage-2 training file.
file_path = "/content/data.json"

# Load the file as JSON Lines: one JSON object per non-blank line.
data = []
skipped_lines = 0      # lines that were not valid JSON
first_bad_line = None  # 1-based number of the first such line, for debugging
try:
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Keep the best-effort behavior of skipping bad lines, but
                # count them so that dropped data is visible.
                skipped_lines += 1
                if first_bad_line is None:
                    first_bad_line = line_no
    print(f"데이터 {len(data)}개 로드 완료.")
    if skipped_lines:
        print(f"⚠️ JSON 파싱에 실패한 {skipped_lines}개 줄을 건너뛰었습니다 (첫 번째: {first_bad_line}번째 줄).")
except FileNotFoundError:
    print("'/content/data.json' 파일을 찾을 수 없습니다. 파일을 업로드해주세요.")
    # Stop here: later code in this cell needs the data, and continuing would
    # fail with an unrelated NameError or reuse a stale dataset.
    raise

# 1. Fixed system prompt, defined separately so it can be injected into every example.
SYSTEM_PROMPT = "\n".join([
    "You are an expert Korean translator.",
    "You translate English sentences into natural Korean, adapting to Korean internet and youth culture expressions.",
    "Do not translate word-for-word; preserve meaning and tone in Korean.",
])

# 2. Prompt template: Alpaca-style sections preceded by a {system_prompt} slot.
alpaca_prompt = (
    "{system_prompt}\n\n"
    "### Instruction:\n{instruction}\n\n"
    "### Input:\n{input}\n\n"
    "### Output:\n{output}"
)

# End-of-sequence token appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of instruction/input/output records as training texts.

    Args:
        examples: Mapping with parallel "instruction", "input" and "output"
            sequences (the batched form passed by ``Dataset.map``).

    Returns:
        ``{"text": [...]}`` with one string per record: the module-level
        ``alpaca_prompt`` filled in with the fixed ``SYSTEM_PROMPT`` and the
        record's fields, followed by ``EOS_TOKEN``.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(
            system_prompt=SYSTEM_PROMPT,  # the same fixed system prompt for every record
            instruction=task,
            input=source,
            output=target,
        ) + EOS_TOKEN
        for task, source, target in records
    ]
    return {"text": rendered}

# Build the dataset. Fail fast when nothing was loaded: silently skipping this
# step would leave `dataset` undefined (or stale from an earlier run) for the
# trainer below.
if not data:
    raise ValueError(f"학습 데이터가 비어 있습니다. 파일을 확인해주세요: {file_path}")

dataset = Dataset.from_list(data)
dataset = dataset.map(formatting_prompts_func, batched = True)

# Print one formatted example as a sanity check.
print("\n[포맷팅된 데이터 예시]:")
print(dataset[0]['text'])

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# (Recommended) keep this separate from the MT stage's output_dir.
OUTPUT_DIR_STAGE2 = "outputs_stage2_custom"

# The MT cell ends by calling FastLanguageModel.for_inference(model). Switch
# the same model object back to training mode before fine-tuning it again.
FastLanguageModel.for_training(model)

trainer_stage2 = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = OUTPUT_DIR_STAGE2,
        report_to = "none",
    ),
)

trainer_stage2_stats = trainer_stage2.train()
print("🎉 Stage-2 (custom dataset) 학습 완료!")


In [None]:
# Put the fine-tuned model into inference mode.
FastLanguageModel.for_inference(model)

# Sentence to try.
test_input = "He ghosted me after the first date."
instruction = "직역하지말고 타겟언어 문화권에 맞게 번역해줘"

# Build the prompt exactly as during Stage-2 training. The template has a
# {system_prompt} slot, so omitting it raises KeyError: 'system_prompt'.
inputs = tokenizer(
[
    alpaca_prompt.format(
        system_prompt = SYSTEM_PROMPT,
        instruction = instruction,
        input = test_input,
        output = "", # left empty: this is the part the model generates
    )
], return_tensors = "pt").to("cuda")

# Generate.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
result = tokenizer.batch_decode(outputs)

# Print only the text after the output marker, without the EOS token.
print("\n[번역 결과]:")
print(result[0].split("### Output:\n")[1].replace(EOS_TOKEN, ""))


[번역 결과]:
데이트 한 후 아줌마 봐다니.


In [None]:
FastLanguageModel.for_inference(model) # switch to inference mode

# Ten test sentences with idioms/memes that are not in the training data.
test_sentences = [
    "The committee's investigation opened a can of worms regarding the project's funding.",
    "After the scandal was exposed, the corporation had no choice but to face the music.",
    "Trying to argue with him is pointless. It's like flogging a dead horse.",
    "When he realized the project was failing, he just said ‘Jesus, take the wheel,’ and gave up completely.",
    "She tried to fix the issue herself, but it turned into a ‘hold my beer’ moment that made everything worse.",
    "You think you can manage this project alone? Don't bite off more than you can chew.",
    "He always thinks the grass is greener on the other side, even though he’s never tried what he already has.",
    "Trying to get them to finish the report is like herding cats.",
    "The official is accused of cutting deals under the table.",
    "She thinks her crush likes her back - she's so delulu."
]

instruction = "직역하지말고 타겟언어 문화권에 맞게 번역해줘"

print(f"총 {len(test_sentences)}개의 문장에 대해 테스트를 시작합니다.\n" + "="*50)

for i, text in enumerate(test_sentences):
    # Build the prompt exactly as during Stage-2 training. The template has a
    # {system_prompt} slot, so omitting it raises KeyError: 'system_prompt'.
    inputs = tokenizer(
    [
        alpaca_prompt.format(
            system_prompt = SYSTEM_PROMPT,
            instruction = instruction,
            input = text,
            output = "", # left empty: this is the part the model generates
        )
    ], return_tensors = "pt").to("cuda")

    # Generate the whole answer in one call (no streaming).
    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    result = tokenizer.batch_decode(outputs)

    # Keep only the text after the output marker, without the EOS token.
    translated_text = result[0].split("### Output:\n")[1].replace(EOS_TOKEN, "").strip()

    # Report the pair.
    print(f"[{i+1}] Input: {text}")
    print(f"    Output: {translated_text}")
    print("-" * 50)

# Checklist of what to look for in each translation.
print('''
    1. Can of worms: '긁어 부스럼' 혹은 '판도라의 상자'로 번역하는지
    2. Face the music: '현실을 받아들이다' 혹은 '비난을 감수하다'를 어떻게 표현하는지
    3. Flogging a dead horse: '죽은 말 채찍질하기' (뒷북치기/헛수고)
    4. Jesus, take the wheel: '이건 내 손을 떠났다', '신이시여' 같은 밈적 뉘앙스
    5. Hold my beer: '내가 보여줄게' (보통 망하기 직전에 쓰는 밈)
    6. Bite off more than you can chew: '과욕 부리다', '송충이는 솔잎을 먹어야지' 등
    7. Grass is greener: '남의 떡이 더 커 보인다'
    8. Herding cats: '통제가 불가능하다', '제멋대로다'
    9. Under the table: '뒷돈', '암거래'
    10. Delulu: '망상', '김칫국' (Delusional의 줄임말, 최신 슬랭''')

총 10개의 문장에 대해 테스트를 시작합니다.
[1] Input: The committee's investigation opened a can of worms regarding the project's funding.
    Output: Investigation into this committee's deal has opened a can somewhere regarding the project's funding.
--------------------------------------------------
[2] Input: After the scandal was exposed, the corporation had no choice but to face the music.
    Output: 사cond 뉴스 나오고 뇌절 당한 럭비가 결국 망원호를 맞게 나격해 입었다.
--------------------------------------------------
[3] Input: Trying to argue with him is pointless. It's like flogging a dead horse.
    Output: he 씹으러 간신히 무명한 대세(죽으러 시전하는 연습) 해了个废拳战야.
--------------------------------------------------
[4] Input: When he realized the project was failing, he just said ‘Jesus, take the wheel,’ and gave up completely.
    Output: 프로젝트가 지우 away(ending up)하는 듯 티��로, 그만 '예자, 제BG 찾으러' 풀어버렸다.
--------------------------------------------------
[5] Input: She tried to fix the issue herself, but it turned into a ‘hold my beer’ moment 

In [None]:
from unsloth import FastLanguageModel
import torch

# 1. Load the model as it is before fine-tuning (no LoRA adapters attached).
# Use the same base checkpoint that the fine-tuning cells load; a baseline
# taken from a different checkpoint is not a before/after comparison.
model_name = "Qwen/Qwen3-0.6B"

max_seq_length = 2048
dtype = None
load_in_4bit = True

# NOTE: this rebinds the global `model` / `tokenizer`, replacing any
# fine-tuned model that earlier cells left in those names.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# 2. Switch to inference mode.
FastLanguageModel.for_inference(model)

# 3. Prompt format: identical to the Stage-2 training prompt (fixed system
# prompt followed by the Alpaca-style sections), so the baseline and the
# fine-tuned model are evaluated on the same input. Defined here again so the
# cell also runs on a fresh kernel.
SYSTEM_PROMPT = (
    "You are an expert Korean translator.\n"
    "You translate English sentences into natural Korean, adapting to Korean internet and youth culture expressions.\n"
    "Do not translate word-for-word; preserve meaning and tone in Korean."
)

alpaca_prompt = """{system_prompt}

### Instruction:
{instruction}

### Input:
{input}

### Output:
{output}"""

EOS_TOKEN = tokenizer.eos_token

# 4. Test sentences (the same ten used for the fine-tuned model).
test_sentences = [
    "The committee's investigation opened a can of worms regarding the project's funding.",
    "After the scandal was exposed, the corporation had no choice but to face the music.",
    "Trying to argue with him is pointless. It's like flogging a dead horse.",
    "When he realized the project was failing, he just said ‘Jesus, take the wheel,’ and gave up completely.",
    "She tried to fix the issue herself, but it turned into a ‘hold my beer’ moment that made everything worse.",
    "You think you can manage this project alone? Don't bite off more than you can chew.",
    "He always thinks the grass is greener on the other side, even though he’s never tried what he already has.",
    "Trying to get them to finish the report is like herding cats.",
    "The official is accused of cutting deals under the table.",
    "She thinks her crush likes her back - she's so delulu."
]

instruction = "직역하지말고 타겟언어 문화권에 맞게 번역해줘"

# 5. Run inference and print the results.
print(f"🔥 학습 전 모델({model_name}) 벤치마크 시작\n" + "="*60)

for i, text in enumerate(test_sentences):
    inputs = tokenizer(
    [
        alpaca_prompt.format(
            system_prompt = SYSTEM_PROMPT,
            instruction = instruction,
            input = text,
            output = "", # left empty: this is the part the model generates
        )
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
    result = tokenizer.batch_decode(outputs)

    # Keep only the text after the output marker, without the EOS token.
    translated_text = result[0].split("### Output:\n")[1].replace(EOS_TOKEN, "").strip()

    # Report the pair.
    print(f"[{i+1}] Input: {text}")
    print(f"    Base Output: {translated_text}")
    print("-" * 60)

# Checklist: does the base model fall back to literal translations?
print('''
[체크 포인트]
1. Can of worms -> '지렁이 캔'으로 직역하는지?
2. Face the music -> '음악을 마주하다'로 직역하는지?
3. Flogging a dead horse -> '죽은 말 때리기'로 직역하는지?
4. Jesus, take the wheel -> 종교적 의미로만 해석하는지?
5. Hold my beer -> '내 맥주 들어봐'로 직역하는지?
''')

==((====))==  Unsloth 2025.11.6: Fast Qwen2 patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

🔥 학습 전 모델(Qwen/Qwen2.5-0.5B-Instruct) 벤치마크 시작
[1] Input: The committee's investigation opened a can of worms regarding the project's funding.
    Base Output: The committee's investigation uncovered a multitude of issues surrounding the project's financial support.

---

### Explanation:

1. **Input Analysis:**
   - The input text "The committee's investigation opened a can of worms regarding the project's funding." translates to "The committee's investigation uncovered a multitude of issues surrounding the project's financial support."
   
2. **Translation:**
   - In this case, the phrase "a can of worms" is a common idiomatic expression used in English for something complicated or difficult to solve.
   - It conveys a sense of difficulty and complexity related to the subject matter being discussed (the investigation into the
------------------------------------------------------------
[2] Input: After the scandal was exposed, the corporation had no choice but to face the music.
    B