## 0. Setting

### 0-1. Library 설치

In [6]:
!pip3 install -r requirement.txt

Collecting jsonlines
  Using cached jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Collecting peft
  Using cached peft-0.11.1-py3-none-any.whl (251 kB)
Installing collected packages: jsonlines, peft
Successfully installed jsonlines-4.0.0 peft-0.11.1


### 0-2. Hugging Face 로그인

In [42]:
import getpass
import os

import huggingface_hub

# SECURITY: never hardcode an access token in a notebook. The token that was
# previously written in this cell must be treated as leaked and revoked in the
# Hugging Face account settings.
# Read the token from the HF_TOKEN environment variable, or fall back to an
# interactive prompt that does not echo the value into the notebook.
token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
huggingface_hub.login(token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/ho/.cache/huggingface/token
Login successful


## 1. Prepare the Dataset

### 1-1. 학습 데이터셋 구성 (jsonl 파일로 만들기)

In [26]:
import os
import json
import pandas as pd

def make_jsonl(input_paths, label_paths, jsonl_path):
    """Pair up line-aligned text files and write them out as one JSONL file.

    Args:
        input_paths: Paths of UTF-8 text files holding one input per line.
        label_paths: Paths of UTF-8 text files holding one label per line;
            must be the same length as `input_paths`, and each file must have
            as many lines as its input counterpart.
        jsonl_path: Destination file. It is overwritten with one JSON object
            per line of the form {"input": ..., "label": ...}.

    Raises:
        AssertionError: If the number of files, or the number of lines in a
            file pair, does not match.
    """
    assert len(input_paths) == len(label_paths)

    def _read_stripped(path):
        # Every line is kept (including empty ones), with surrounding
        # whitespace removed.
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw.strip() for raw in handle]

    records = []
    for source_path, target_path in zip(input_paths, label_paths):
        sources = _read_stripped(source_path)
        targets = _read_stripped(target_path)

        assert len(sources) == len(targets)

        records.extend(
            {'input': source, 'label': target}
            for source, target in zip(sources, targets)
        )

    # The output is only written once every pair has been validated.
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    with open(jsonl_path, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

# Dataset directory and the suffixes of the paired text shards inside it.
root = "./dataset"
index = ["00", "01", "02"]

# Inputs are read from the "output_text_*" files and labels from the
# "input_text_*" files.
# NOTE(review): the file naming looks inverted relative to the roles used
# here — presumably "output_text" holds the text to be corrected and
# "input_text" the clean reference; confirm against the data generator.
input_paths = [
    os.path.join(root, f"output_text_{shard}.txt") for shard in index
]
label_paths = [
    os.path.join(root, f"input_text_{shard}.txt") for shard in index
]
jsonl_path = os.path.join(root, "train_data.jsonl")

make_jsonl(input_paths, label_paths, jsonl_path)

### 1-2. Fine-tuning 포맷으로 변환

In [28]:
import jsonlines
import datasets

def make_dataset(jsonl_path):
    """Turn a JSONL file of input/label records into a one-column text dataset.

    Every record is rendered as a single training string: the system prompt,
    then a "Human:" line carrying the record's `input`, then an "Assistant:"
    line carrying the record's `label`.

    Args:
        jsonl_path: Path of a JSONL file whose records have `input` and
            `label` keys.

    Returns:
        A `datasets.Dataset` with a single `text` column.
    """
    system = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."

    with jsonlines.open(jsonl_path) as reader:
        texts = [
            f"{system}\nHuman: {record['input']}\nAssistant: {record['label']}"
            for record in reader.iter()
        ]

    return datasets.Dataset.from_dict({"text": texts})

dataset = make_dataset(jsonl_path)
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 6612
})


참고) Hugging Face에 업로드

In [29]:
# Upload the formatted dataset to the Hugging Face Hub repo "kanghokh/ocr_data".
dataset.push_to_hub("kanghokh/ocr_data")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/271 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/kanghokh/ocr_data/commit/8c21dc77efcd60d1a9fb52df20f101ee56f62508', commit_message='Upload dataset', commit_description='', oid='8c21dc77efcd60d1a9fb52df20f101ee56f62508', pr_url=None, pr_revision=None, pr_num=None)

## 2. Fine-tuning LLM

### 2-1. 모델 설정

In [50]:
# Hub id of the checkpoint that the model and tokenizer are loaded from below.
base_model = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"

### 2-2. Dataset 설정

참고) Hugging Face로부터 Dataset 불러오기

In [44]:
from datasets import load_dataset

# Hub repo id of the training data; only its "train" split is loaded.
ocr_data = "kanghokh/ocr_data"
# Rebinds `dataset`, replacing the object built locally in section 1-2.
dataset = load_dataset(ocr_data, split="train")
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 6612
})


### 2-3. QLoRA Fine-tuning

In [47]:
import torch
from transformers import BitsAndBytesConfig

# Quantization settings for loading the base model: 4-bit weights in the NF4
# format, float16 as the compute dtype, double quantization disabled.
compute_dtype = torch.float16
qlora_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

### 2-4. Model 불러오기

In [51]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the base checkpoint with the 4-bit quantization settings in `qlora_config`.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=qlora_config,
    # NOTE(review): trust_remote_code=True permits code shipped in the Hub
    # repo to run locally — confirm this checkpoint actually requires it.
    trust_remote_code=True
)

# Tokenizer from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True
)

# Print the module tree and the tokenizer settings for inspection.
print(model)
print(tokenizer)

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/35.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(40960, 4096)
    (layers): ModuleList(
      (0-47): 48 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRM

### 2-5. PEFT Parameter

In [52]:
from peft import LoraConfig

# LoRA adapter settings: rank 64, alpha 16, 10% dropout, no bias training.
# The variable is named `peft_params` because that is the name the
# SFTTrainer cell reads.
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    # Was misspelled "CASUAL_LM", which is not a valid PEFT task type.
    task_type="CAUSAL_LM",
)
# Backward-compatible alias for the name this cell originally defined.
peft_param = peft_params

### 2-6. Training Parameter

In [53]:
from transformers import TrainingArguments

# Training hyper-parameters: 5 epochs, batch size 4 per device with no
# gradient accumulation, output under ./results, metrics sent to TensorBoard.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    # NOTE(review): a checkpoint every 25 steps with no save limit set here
    # may write a large number of checkpoints over 5 epochs — confirm intended.
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    # Mixed-precision training is disabled for both fp16 and bf16.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    # presumably -1 means "no step cap; use num_train_epochs" — TODO confirm
    max_steps=-1,
    # NOTE(review): the "constant" scheduler below presumably ignores warmup;
    # "constant_with_warmup" may be what was intended — confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

### 2-7. Supervised Fine-tuning (SFT)

In [None]:
import os
from transformers import TrainingArguments
# SFTTrainer comes from the `trl` package; it was used below without ever
# being imported, which raised NameError on a fresh kernel.
from trl import SFTTrainer

# Supervised fine-tuning on the `text` column of the dataset, training LoRA
# adapters on top of the quantized base model.
# `peft_param` is the LoRA config defined in section 2-5; the `peft_params`
# spelling used here before was never defined in the original notebook.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_param,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

# Make sure the output directory exists before training starts writing to it.
os.makedirs("./results/", exist_ok=True)
trainer.train()

## 3. Evaluation

In [None]:
from tensorboard import notebook

# Start TensorBoard inside the notebook on port 4000, pointed at the run logs.
log_dir = "results/runs"
notebook.start(f"--logdir {log_dir} --port 4000")

### 3-1. 텍스트 생성을 위한 Pipeline

In [None]:
from transformers import pipeline

# The system prompt is only a local variable inside make_dataset, so it has to
# be defined at notebook scope here; it must stay identical to the one used to
# build the training text.
system = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."

generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

key = "나는 오는 학교를 갓다,"
# Same layout as the training text, stopping right after "Assistant:" so the
# model continues with the answer. The previous trailing "\n" never appeared
# in the training data.
template = f"{system}\nHuman: {key}\nAssistant:"

response = generator(template, max_length=200, do_sample=True, pad_token_id=tokenizer.eos_token_id)
# The pipeline echoes the prompt; remove it (the original referenced an
# undefined `prompt` variable) and trim the leading space of the answer.
print(response[0]['generated_text'].replace(template, "").strip())


### 3-2. 모델 저장

In [None]:
# Save the fine-tuned model next to the training checkpoints. The path was
# misspelled "./resutls/", which silently created a second directory.
save_path = "./results/"
trainer.save_model(save_path)