In [1]:
!nvidia-smi

Thu Jun 26 15:07:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:A1:00.0 Off |                  Off |
| 30%   26C    P8             11W /  450W |   22941MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!pip install transformers datasets accelerate peft trl bitsandbytes

## Train

In [2]:
import os
import torch

# 메모리 할당 최적화
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# GPU 메모리 정리
torch.cuda.empty_cache()

In [3]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import torch

In [4]:
# 🎯 모델과 데이터 경로 설정
model_name = "Qwen/Qwen2.5-7B-Instruct"
new_model_name = "JudyChoi/qwen2.5-7b-drt"
train_file = "./data/drt_train.jsonl"
val_file = "./data/drt_val.jsonl"

In [5]:
# ✨ 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token   # Qwen은 eos_token 사용

In [6]:
# ❄️ 4bit 양자화된 모델 로딩 (bnb를 이용한 QLoRA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # float16 → bfloat16 (더 안정적)
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# 🛠️ QLoRA 핵심 설정 적용!
model = prepare_model_for_kbit_training(model)

In [9]:
# 🍰 LoRA 구성 추가 (adapter만 학습!)
peft_config = LoraConfig(
    lora_alpha=64,                         # 16 → 64 대폭 증가 (더 강한 학습)
    lora_dropout=0.05,                     # 낮은 드롭아웃
    r=32,                                  # rank 증가 (8 → 32)
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, peft_config)



In [10]:
# 📚 데이터셋 불러오기 (train / validation)
dataset = load_dataset("json", data_files={"train": train_file, "validation": val_file})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'trans', 'thought'],
        num_rows: 1300
    })
    validation: Dataset({
        features: ['text', 'trans', 'thought'],
        num_rows: 338
    })
})

In [11]:
# 💬 Chat 형식 prompt 포맷팅 (Few-shot 예제 추가)
def format_chat(example):
    user_prompt = f"""Please translate the following text from Korean to English:

Here are examples of the expected format:

Example 1:
Korean: 식은 죽 먹기
<thought>'식은 죽 먹기' is a Korean idiom that literally means 'eating cold porridge'. It refers to something that is very easy or simple to do, equivalent to the English idiom 'a piece of cake'. This expression is commonly used in Korean to describe tasks or situations that require minimal effort or skill.</thought>
<output>A piece of cake</output>

Example 2:
Korean: 금강산도 식후경
<thought>This is a Korean proverb meaning that even the most beautiful scenery is better appreciated after having a meal. It emphasizes the importance of taking care of basic needs first before enjoying other pleasures.</thought>
<output>Even the beautiful Diamond Mountain is better after a meal</output>

Now translate this:
Korean: {example['text']}"""
    
    system_prompt = "You are a helpful Korean-English translator who provides detailed analysis before translation."
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": f"<thought>{example['thought']}</thought><output>{example['trans']}</output>"}
    ]
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)}

dataset = dataset.map(format_chat)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'trans', 'thought'],
        num_rows: 1300
    })
    validation: Dataset({
        features: ['text', 'trans', 'thought'],
        num_rows: 338
    })
})

In [12]:
training_args = TrainingArguments(
    output_dir="./qwen-sft-qlora",
    # 리소스 제약때문에 batch size를 타협해야하는 경우가 발생 -> micro batch size를 줄이고,
 	# accumulated step을 늘려, 적절한 size로 gradient를 구해 weight update
    # https://www.youtube.com/watch?v=ptlmj9Y9iwE
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,       # 더 큰 effective batch size
    max_steps=150,                        # 1300개 데이터면 충분
    eval_steps=50,
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    warmup_ratio=0.2,                     # 더 긴 워밍업
    learning_rate=1e-4,                   # 높은 학습률로 빠른 수렴
    lr_scheduler_type="cosine",
    eval_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=0.5,                    # 더 작은 gradient clipping
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=0,
    seed=42,
)

In [13]:
# 💪 SFTTrainer 초기화 (early stopping)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # max_seq_length=4096,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
# 🔍 GPU 사용 확인
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
print(f"현재 디바이스: {torch.cuda.current_device()}")
print(f"GPU 개수: {torch.cuda.device_count()}")
print(f"GPU 이름: {torch.cuda.get_device_name()}")

# 모델이 GPU에 있는지 확인
print(f"모델 디바이스: {next(model.parameters()).device}")

# GPU 메모리 사용량 확인
if torch.cuda.is_available():
    print(f"GPU 메모리 할당: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
    print(f"GPU 메모리 캐시: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


CUDA 사용 가능: True
현재 디바이스: 0
GPU 개수: 1
GPU 이름: NVIDIA GeForce RTX 4090
모델 디바이스: cuda:0
GPU 메모리 할당: 7.50 GB
GPU 메모리 캐시: 9.88 GB


In [15]:
# 🚀 학습 시작!
trainer.train()
# 💾 모델 저장하기
model.save_pretrained("./qwen-sft-qlora")
tokenizer.save_pretrained("./qwen-sft-qlora")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
50,0.5144,0.508808
100,0.4215,0.475848
150,0.4295,0.467872


('./qwen-sft-qlora/tokenizer_config.json',
 './qwen-sft-qlora/special_tokens_map.json',
 './qwen-sft-qlora/chat_template.jinja',
 './qwen-sft-qlora/vocab.json',
 './qwen-sft-qlora/merges.txt',
 './qwen-sft-qlora/added_tokens.json',
 './qwen-sft-qlora/tokenizer.json')

## 말기 전에 테스트 먼저

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import re

# ① 🧸 저장한 모델 경로
model_path = "./qwen-sft-qlora"

# ② 🔓 모델 로드 (Base Model + LoRA Adapter)
# ❄️ 4bit 양자화 설정 (학습 때와 동일하게)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# 베이스 모델을 먼저 로드합니다.
# 'model_name' 변수는 이전 학습 셀에서 정의된 것을 사용합니다.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# 베이스 모델 위에 어댑터를 로드합니다.
model = PeftModel.from_pretrained(base_model, model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# ③ ✨ 테스트용 메시지 구성 (Chat format)
messages = [
    {"role": "system", "content": "You are a helpful Korean-English translator who provides detailed analysis before translation."},
    {"role": "user", "content": ""}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # 🪄 assistant가 이어서 말하는 형식으로 변환!
)

# ④ 🎯 모델 입력값 준비
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# ⑤ 🚀 생성
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

# ⑥ 🔍 입력 제외한 결과만 추출
gen = generated_ids[0][inputs["input_ids"].shape[1]:]
decoded = tokenizer.decode(gen, skip_special_tokens=True)

# ⑦ 🎁 결과 출력 + 파싱
print("📤 모델 응답 전체:\n", decoded)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



📤 모델 응답 전체:
 The phrase "식은 죽 먹기" is a Korean idiom that translates to English as "eating rice that has already been cooked." This expression is used to describe a situation where someone is forced to do something they have already done before, or when someone is forced to repeat a task that they have already completed.

In more detail, the term "식은" (sigeun) means "already" or "previously," and "죽" (juk) refers to "rice." The word "먹기" (meogki) means "to eat." So literally, it suggests eating rice that has already been prepared. 

This idiom conveys the idea of unnecessary repetition or doing something that has already been done, often with a hint of frustration or resignation. For example, if a manager assigns a task to an employee that the employee has already finished, you might say, "It's like eating rice that has already been cooked," implying that the work was redundant.

Here is the translation:
"It's like eating rice that has already been cooked."


In [19]:
from peft import PeftModel

# ① 베이스 모델 로드
base_model_name = "Qwen/Qwen2.5-7B-Instruct"  # 원래 사용한 모델명
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# ② 어댑터 로드
peft_model = PeftModel.from_pretrained(
    base_model, 
    "./qwen-sft-qlora"
)

# ③ 어댑터를 베이스 모델에 병합
merged_model = peft_model.merge_and_unload()

# ④ 병합된 모델 저장
merged_model.save_pretrained("./qwen-sft-merged")
tokenizer.save_pretrained("./qwen-sft-merged")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Saving checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

('./qwen-sft-merged/tokenizer_config.json',
 './qwen-sft-merged/special_tokens_map.json',
 './qwen-sft-merged/chat_template.jinja',
 './qwen-sft-merged/vocab.json',
 './qwen-sft-merged/merges.txt',
 './qwen-sft-merged/added_tokens.json',
 './qwen-sft-merged/tokenizer.json')

## Test

In [20]:
print(model.device)

cuda:0


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# ✅ 병합된 모델 로컬 경로에서 로드
model = AutoModelForCausalLM.from_pretrained(
    "./qwen-sft-merged",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# ✅ 토크나이저는 base 모델에서 가져오기
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    trust_remote_code=True
)

# ✨ 테스트용 프롬프트
messages = [
    {"role": "system", "content": "You are a philosopher skilled in deep thinking."},
    {"role": "user", "content": "Please translate the following text from Korean to English:\n식은 죽 먹기"}
]

# 🧠 Qwen 스타일 prompt 생성
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# 💫 Tokenize & GPU에 넣기
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 🚀 Inference (빠르게, 확실하게)
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=False,  # greedy
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True
    )

# 📤 디코딩
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("\n🧠 모델 출력:\n", output)



## Deploy

In [None]:
!pip install -U huggingface_hub

In [24]:
import os
os.environ["HF_TOKEN"] = "hf_"

In [None]:
# 어댑터만 올리기
# from huggingface_hub import create_repo, upload_folder

# repo_id = "JudyChoi/qwen2.5-7b-drt"
# local_dir = "./qwen-sft-qlora"

# # (이미 repo가 존재하면 exist_ok=True)
# create_repo(repo_id, exist_ok=True)

# upload_folder(
#     folder_path=local_dir,
#     repo_id=repo_id,
#     repo_type="model",
#     path_in_repo=".",
#     commit_message="Initial QLoRA adapter upload",
# )


In [None]:
# ① 베이스 모델 로드
base_model_name = "Qwen/Qwen2.5-7B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# ② 어댑터 로드
peft_model = PeftModel.from_pretrained(
    base_model, 
    "./qwen-sft-qlora"
)

# ③ 어댑터를 베이스 모델에 병합
merged_model = peft_model.merge_and_unload()

# ④ 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# ⑤ 허깅페이스에 직접 업로드
repo_name = "JudyChoi/qwen2.5-7b-drt"  # 원하는 레포지토리 이름

merged_model.push_to_hub(
    repo_name,
    commit_message="Upload Korean-English translator model with QLora fine-tuning"
)

tokenizer.push_to_hub(
    repo_name,
    commit_message="Upload tokenizer for Korean-English translator"
)