# Train LoRA

In [1]:
%pip install python-dotenv torch transformers datasets bitsandbytes accelerate peft -qU

Note: you may need to restart the kernel to use updated packages.


In [31]:
# 트레이닝 데이터 로드
from datasets import load_dataset, Dataset
# 50개 샘플을 가져와서, 8:2 비율로 train:test 분할
dataset = load_dataset("imdb", split="train[:50]").train_test_split(test_size=0.2)

In [32]:
raw = dataset['train'][20]
print(f'text: {raw["text"]}')
print(f'label: {raw["label"]}')

text: This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.<br /><br />The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.<br /><br />A movie of its time, and place. 2/10.
label: 0


In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

# ==== MPS 디바이스 설정 ====
def get_device():
    device = None
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("MPS 디바이스를 사용합니다.")
    else:
        device = torch.device("cpu")
        print("MPS를 사용할 수 없어 CPU를 사용합니다.")
    return device

# ==== 토크나이저 로드 ====
def get_tokenizer(model_path):
    print("🔄 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=True,
        padding_side="left",  # 배치 추론 대비 안전
        use_safetensors=True,
    )
    if tokenizer.pad_token is None:
        print("⚠️ pad_token이 없어서 eos_token으로 설정합니다.")
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    tokenizer.padding_side = "left"  
    return tokenizer

def get_model(model_path, dtype, option):
    print("🔄 Loading model...")
    return AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype=dtype,
        low_cpu_mem_usage=True,
        use_safetensors=option["use_safetensors"],
    )

def set_model_to_device(model, device):
    print("🔄 Moving model to device...")
    model.to(device)
    model.eval()
    return model


In [34]:
LOCAL_MODEL_PATH = "../ai_models/gemma-3-270m"
DTYPE = torch.bfloat16
MODEL_OPTION = {"use_safetensors": True}
ADAPTER_FLAG = False
ADAPTER_PATH = ""

device = get_device()
tokenizer = get_tokenizer(LOCAL_MODEL_PATH)
model = get_model(LOCAL_MODEL_PATH, DTYPE, MODEL_OPTION)

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    # target_modules = ["c_attn", "c_proj", "q_attn"], # GPT 계열
    target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    modules_to_save=['embed_tokens', 'lm_head'],
    lora_dropout = 0.05,
    bias = "none",
    task_type = TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

model.to("mps")
model.eval()

MPS 디바이스를 사용합니다.
🔄 Loading tokenizer...
🔄 Loading model...
trainable params: 339,341,312 || all params: 607,439,488 || trainable%: 55.8642


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
          (modules_to_save): ModuleDict(
            (default): Gemma3TextScaledWordEmbedding(262144, 640, padding_idx=0)
          )
        )
        (layers): ModuleList(
          (0-17): 18 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=640, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=640, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias

In [35]:
#dataset = dataset.map(tokenizer, batched=True)

data = {
    "text": [
        "### 질문: 우리집 강아지 이름은?\n### 답변: 순둥이",
        "### 질문: 오늘 날씨 어때?\n### 답변: 맑음",
        "### 질문: 바다는 왜 파란가요?\n### 답변: 햇빛의 산란",
    ]
}

dataset = Dataset.from_dict(data)

# jsonl 파일을 불러와 dataset 생성
dataset = Dataset.from_json("./ecommerce_data/ecommerce_finetune.jsonl")
print(dataset)

Generating train split: 15 examples [00:00, 2523.65 examples/s]

Dataset({
    features: ['input', 'output'],
    num_rows: 15
})





In [36]:
def tokenize_func(example):
    #return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)
    # dataset에는 "input", "output" 컬럼이 있음
    return tokenizer(example["input"] + example["output"], truncation=True, padding="max_length", max_length=128).to("mps")

tokenized_dataset = dataset.map(tokenize_func)

Map: 100%|██████████| 15/15 [00:00<00:00, 2955.54 examples/s]


In [37]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [38]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    num_train_epochs=50,
    logging_steps=1,
    save_strategy="no",
    fp16=True if device.type == "cuda" else False,
    report_to="none"
)

In [39]:
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    mask = labels != -100
    correct = (predictions == labels) & mask
    accuracy = correct.sum() / mask.sum()

    return {"accuracy": accuracy}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Step,Training Loss
1,4.2042
2,3.9765
3,2.3688
4,3.3054
5,4.3481
6,2.8833
7,1.9109
8,2.2987
9,4.1514
10,2.6889


TrainOutput(global_step=750, training_loss=0.29162384978185096, metrics={'train_runtime': 205.1281, 'train_samples_per_second': 3.656, 'train_steps_per_second': 3.656, 'total_flos': 156611616768000.0, 'train_loss': 0.29162384978185096, 'epoch': 50.0})

In [44]:
input_text = "2024년 1월 신규 가입 고객 수는?"
inputs = tokenizer(input_text, return_tensors="pt")
device = model.device
inputs = {k: v.to(device) for k, v in inputs.items()}
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


2024년 1월 신규 가입 고객 수는?SELECT COUNT(*) FROM users WHERE signup_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1 month') AND signup_date < DATE_TRUNC('month', CURRENT_DATE); LIMIT 20; 】SELECT *
