# 강의 11주차: midm-food-order-understanding

1. KT-AI/midm-bitext-S-7B-inst-v1 를 주문 문장 이해에 미세 튜닝

- food-order-understanding-small-3200.json (학습)
- food-order-understanding-small-800.json (검증)


종속적인 필요 내용
- huggingface 계정 설정 및 llama-2 사용 승인
- 로깅을 위한 wandb


history

v1.2
- KT-AI/midm-bitext-S-7B-inst-v1 에 safetensors 포맷이 올라왔기에, 해당 리포에서 받도록 설정 변경
- 전체 과정 재검증

In [1]:
pip install transformers peft accelerate optimum bitsandbytes trl wandb einops

Collecting peft
  Downloading peft-0.7.0-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum
  Downloading optimum-1.15.0-py3-none-any.whl (400 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.3.post1-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.7.4-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m16.5 MB/s[0m eta [36m

In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import re

import torch
import tyro
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

from trl import SFTTrainer

from trl.trainer import ConstantLengthDataset



In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

드라이브 마운트 후 파일 업로드
- food-order-understanding-small-3200.json
- food-order-understanding-small-800.json

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# 매개 변수 설정

In [5]:
@dataclass
class ScriptArguments:
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "the cache dir"}
    )
    model_name: Optional[str] = field(
        default="meta-llama/Llama-2-7b-chat-hf", metadata={"help": "the model name"}
    )

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "the dataset name"},
    )
    seq_length: Optional[int] = field(
        default=1024, metadata={"help": "the sequence length"}
    )
    num_workers: Optional[int] = field(
        default=8, metadata={"help": "the number of workers"}
    )
    training_args: TrainingArguments = field(
        default_factory=lambda: TrainingArguments(
            output_dir="./results",
            # max_steps=500,
            logging_steps=20,
            # save_steps=10,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=2,
            gradient_checkpointing=False,
            group_by_length=False,
            learning_rate=1e-4,
            lr_scheduler_type="cosine",
            # warmup_steps=100,
            warmup_ratio=0.03,
            max_grad_norm=0.3,
            weight_decay=0.05,
            save_total_limit=20,
            save_strategy="epoch",
            num_train_epochs=1,
            optim="paged_adamw_32bit",
            fp16=True,
            remove_unused_columns=False,
            report_to="wandb",
            push_to_hub=True,
        )
    )

    packing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use packing for SFTTrainer"}
    )

    peft_config: LoraConfig = field(
        default_factory=lambda: LoraConfig(
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["c_attn", "c_proj", "c_fc"],
            bias="none",
            task_type="CAUSAL_LM",
        )
    )

    merge_with_final_checkpoint: Optional[bool] = field(
        default=False, metadata={"help": "Do only merge with final checkpoint"}
    )

# 유틸리티

In [6]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# 데이터 로딩

In [7]:
def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""

    prompt_template = """###System;{System}
    ###User;{User}
    ###Midm;{Midm}"""

    default_system_msg = (
        "너는 사용자가 작성한 리뷰의 긍정 또는 부정을 판단해야 한다."
    )

    text = (
        prompt_template.format(System=default_system_msg,
                               User=example["document"],
                               Midm="긍정" if example["label"]==1 else "부정")
    )

    return text

In [8]:
!pip install transformers



In [9]:
from datasets import load_dataset

def create_datasets(tokenizer, args):
    dataset = load_dataset('nsmc')
    train_data = dataset['train'].select(range(3000))
    valid_data = dataset['test'].select(range(1000))

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset

# 미세 튜닝용 모델 로딩

In [10]:
script_args = ScriptArguments(
    num_workers=2,
    seq_length=384,
    dataset_name='/gdrive/MyDrive/NLP/food-order-understanding-small-3200.json',
    model_name='KT-AI/midm-bitext-S-7B-inst-v1',
    # model_name='jangmin/midm-7b-safetensors-only',
    )

In [11]:
script_args.training_args.logging_steps = 50
script_args.training_args.max_steps = 300
script_args.training_args.output_dir = '/gdrive/MyDrive/NLP/lora-midm-7b-food-order-understanding'
script_args.training_args.run_name = 'midm-7b-food-order-understanding'

In [12]:
print(script_args)

ScriptArguments(cache_dir=None, model_name='KT-AI/midm-bitext-S-7B-inst-v1', dataset_name='/gdrive/MyDrive/NLP/food-order-understanding-small-3200.json', seq_length=384, num_workers=2, training_args=TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=N

In [13]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

원본인 'KT-AI/midm-bitext-S-7B-inst-v1' 는 *.bin 형태로 모델을 제공한다.
- 코랩에서 CPU 메모리 부족 발생

해결책
- safetensors로 변환한 모델을 업로드 하고 이를 사용하기로 한다.

In [14]:
base_model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    quantization_config=bnb_config,
    device_map="auto",  # {"": Accelerator().local_process_index},
    trust_remote_code=True,
    use_auth_token=True,
    cache_dir=script_args.cache_dir,
)
base_model.config.use_cache = False



config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

configuration_midm.py:   0%|          | 0.00/831 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1:
- configuration_midm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_midm.py:   0%|          | 0.00/61.1k [00:00<?, ?B/s]

rotary_position_embedding.py:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1:
- rotary_position_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1:
- modeling_midm.py
- rotary_position_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/6.22G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [15]:
base_model

MidmLMHeadModel(
  (transformer): MidmModel(
    (wte): Embedding(72192, 4096)
    (rotary_pos_emb): RotaryEmbedding()
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-31): 32 x MidmBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): MidmAttention(
          (c_attn): Linear4bit(in_features=4096, out_features=12288, bias=False)
          (c_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (mlp): MidmMLP(
          (c_fc): Linear4bit(in_features=4096, out_features=21760, bias=False)
          (c_proj): Linear4bit(in_features=10880, out_features=4096, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
 

In [16]:
peft_config = script_args.peft_config

In [17]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules={'c_fc', 'c_attn', 'c_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [18]:
tokenizer = AutoTokenizer.from_pretrained(
    script_args.model_name,
    trust_remote_code=True,
    cache_dir=script_args.cache_dir,
)

if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

tokenizer.add_special_tokens(dict(bos_token='<s>'))

base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.config.bos_token_id = tokenizer.bos_token_id

tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

midm_bitext_tokenization.py:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1:
- midm_bitext_tokenization.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


midm_bitext_tokenizer.model:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

In [19]:
training_args = script_args.training_args

In [20]:
train_dataset, valid_dataset = create_datasets(tokenizer, script_args)

Downloading builder script:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

100%|██████████| 400/400 [00:00<00:00, 5009.81it/s]

The character to token ratio of the dataset is: 1.57





In [21]:
len(train_dataset)

3000

In [22]:
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=None,
    peft_config=peft_config,
    packing=script_args.packing,
    max_seq_length=script_args.seq_length,
    tokenizer=tokenizer,
    args=training_args,
)



In [23]:
base_model

MidmLMHeadModel(
  (transformer): MidmModel(
    (wte): Embedding(72192, 4096)
    (rotary_pos_emb): RotaryEmbedding()
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-31): 32 x MidmBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): MidmAttention(
          (c_attn): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=12288, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=12288, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (c_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096,

In [24]:
trainer.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MidmLMHeadModel(
      (transformer): MidmModel(
        (wte): Embedding(72192, 4096)
        (rotary_pos_emb): RotaryEmbedding()
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-31): 32 x MidmBlock(
            (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): MidmAttention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=12288, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=12288, bias=False)
                )
                (lora_embedding_A): ParameterDict()
         

In [25]:
print_trainable_parameters(base_model)

trainable params: 16744448 || all params: 3821510656 || trainable%: 0.4381630592527648


In [26]:
base_model.get_memory_footprint()

7795015808

In [27]:
trainer.model.print_trainable_parameters()

trainable params: 16,744,448 || all params: 7,034,347,520 || trainable%: 0.23803839591934178


midm 모델을 주문 문장 이해에 적용시 특징
- 모델 로딩 과정에서 CPU도 5.1기가, 디스크 42.4기가, GPU 메모리: 7,4 기가

구글 코랩 T-4 GPU: 300스텝 (13:47초 예상)

시퀀스 길이 384의 경우
- 14.7 G / 15.0 G 사용
- 메모리 오버플로우 발생시 이보다 줄일 것

In [28]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
50,1.5349
100,1.0464
150,1.0396
200,1.0598
250,1.0513
300,1.012


TrainOutput(global_step=300, training_loss=1.124000612894694, metrics={'train_runtime': 888.0308, 'train_samples_per_second': 0.676, 'train_steps_per_second': 0.338, 'total_flos': 9315508499251200.0, 'train_loss': 1.124000612894694, 'epoch': 0.2})

In [29]:
script_args.training_args.output_dir

'/gdrive/MyDrive/NLP/lora-midm-7b-food-order-understanding'

In [30]:
trainer.save_model(script_args.training_args.output_dir)

# 추론 테스트

In [31]:
from transformers import pipeline, TextStreamer

In [32]:
instruction_prompt_template = """###System;{System}"""

prompt_template = """###System;{System}
###User;{User}
###Midm;"""

default_system_msg = (
    "너는 사용자가 작성한 리뷰의 긍정 또는 부정을 판단해야 한다."
)

In [33]:
evaluation_queries = [
    "오늘은 비가오니깐 이거 먹자. 삼선짬뽕 곱배기 하나하구요, 사천 탕수육 중짜 한그릇 주세요.",
    "아이스아메리카노 톨사이즈 한잔 하고요. 딸기스무디 한잔 주세요. 또, 콜드브루라떼 하나요.",
    "참이슬 한병, 코카콜라 1.5리터 한병, 테슬라 한병이요.",
    "꼬막무침 1인분하고요, 닭도리탕 중자 주세요. 그리고 소주도 한병 주세요.",
    "김치찌개 3인분하고요, 계란말이 주세요.",
    "불고기버거세트 1개하고요 감자튀김 추가해주세요.",
    "불닭볶음면 1개랑 사리곰탕면 2개 주세요.",
    "카페라떼 아이스 샷추가 한잔하구요. 스콘 하나 주세요",
    "여기요 춘천닭갈비 4인분하고요. 라면사리 추가하겠습니다. 콜라 300ml 두캔주세요.",
    "있잖아요 조랭이떡국 3인분하고요. 떡만두 한세트 주세요.",
    "깐풍탕수 2인분 하고요 콜라 1.5리터 한병이요.",
]

In [34]:
def wrapper_generate(model, input_prompt, do_stream=False):
    data = tokenizer(input_prompt, return_tensors="pt")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    input_ids = data.input_ids[..., :-1]
    with torch.no_grad():
        pred = model.generate(
            input_ids=input_ids.cuda(),
            streamer=streamer if do_stream else None,
            use_cache=True,
            max_new_tokens=float('inf'),
            do_sample=False
        )
    decoded_text = tokenizer.batch_decode(pred, skip_special_tokens=True)
    decoded_text = decoded_text[0].replace("<[!newline]>", "\n")
    return (decoded_text[len(input_prompt):])

In [35]:
eval_dic = {i:wrapper_generate(model=base_model, input_prompt=prompt_template.format(System=default_system_msg, User=evaluation_queries[i]))for i, query in enumerate(evaluation_queries)}



In [36]:
print(eval_dic[0])

긍정


# 미세튜닝된 모델 로딩 후 테스트

In [None]:
# 런타임 재시작
import os
os.kill(os.getpid(), 9)

In [1]:
pip install transformers peft accelerate optimum bitsandbytes trl wandb einops



In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional
import re

import torch
import tyro
from accelerate import Accelerator
from datasets import load_dataset, Dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)

from trl import SFTTrainer



In [3]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [4]:
@dataclass
class ScriptArguments:
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "the cache dir"}
    )
    model_name: Optional[str] = field(
        default="meta-llama/Llama-2-7b-chat-hf", metadata={"help": "the model name"}
    )

    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "the dataset name"},
    )
    seq_length: Optional[int] = field(
        default=1024, metadata={"help": "the sequence length"}
    )
    num_workers: Optional[int] = field(
        default=8, metadata={"help": "the number of workers"}
    )
    training_args: TrainingArguments = field(
        default_factory=lambda: TrainingArguments(
            output_dir="./results",
            # max_steps=500,
            logging_steps=20,
            # save_steps=10,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=2,
            gradient_checkpointing=False,
            group_by_length=False,
            learning_rate=1e-4,
            lr_scheduler_type="cosine",
            # warmup_steps=100,
            warmup_ratio=0.03,
            max_grad_norm=0.3,
            weight_decay=0.05,
            save_total_limit=20,
            save_strategy="epoch",
            num_train_epochs=1,
            optim="paged_adamw_32bit",
            fp16=True,
            remove_unused_columns=False,
            report_to="wandb",
        )
    )

    packing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use packing for SFTTrainer"}
    )

    peft_config: LoraConfig = field(
        default_factory=lambda: LoraConfig(
            r=8,
            lora_alpha=16,
            lora_dropout=0.05,
            target_modules=["c_attn", "c_proj", "c_fc"],
            bias="none",
            task_type="CAUSAL_LM",
        )
    )

    merge_with_final_checkpoint: Optional[bool] = field(
        default=False, metadata={"help": "Do only merge with final checkpoint"}
    )

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [6]:
script_args = ScriptArguments(
    num_workers=2,
    seq_length=384,
    dataset_name='/gdrive/MyDrive/NLP/food-order-understanding-small-3200.json',
    model_name='KT-AI/midm-bitext-S-7B-inst-v1',
    # model_name='jangmin/midm-7b-safetensors-only',
    )

In [7]:
script_args.training_args.logging_steps = 50
script_args.training_args.max_steps = 300
script_args.training_args.output_dir = '/gdrive/MyDrive/NLP/lora-midm-7b-food-order-understanding'
script_args.training_args.run_name = 'midm-7b-food-order-understanding'

In [8]:
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    script_args.training_args.output_dir,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir=script_args.cache_dir,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    script_args.model_name,
    trust_remote_code=True,
    cache_dir=script_args.cache_dir,
)

if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

tokenizer.add_special_tokens(dict(bos_token='<s>'))

trained_model.config.pad_token_id = tokenizer.pad_token_id
trained_model.config.bos_token_id = tokenizer.bos_token_id

In [10]:
from transformers import pipeline, TextStreamer

In [11]:
instruction_prompt_template = """###System;다음은 매장에서 고객이 음식을 주문하는 주문 문장이다. 이를 분석하여 음식명, 옵션명, 수량을 추출하여 고객의 의도를 이해하고자 한다.
분석 결과를 완성해주기 바란다.

### 주문 문장: {0} ### 분석 결과:
"""

prompt_template = """###System;{System}
###User;{User}
###Midm;"""

default_system_msg = (
   "너는 사용자가 작성한 리뷰의 긍정 또는 부정을 판단해야 한다.")

In [12]:
from datasets import load_dataset

test_dataset = load_dataset('nsmc')['test'].select(range(1000))

In [13]:
def wrapper_generate(model, input_prompt, do_stream=False):
    data = tokenizer(input_prompt, return_tensors="pt")
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    input_ids = data.input_ids[..., :-1]
    with torch.no_grad():
        pred = model.generate(
            input_ids=input_ids.cuda(),
            streamer=streamer if do_stream else None,
            use_cache=True,
            max_new_tokens=float('inf'),
            do_sample=False
        )
    decoded_text = tokenizer.batch_decode(pred, skip_special_tokens=True)
    decoded_text = decoded_text[0].replace("<[!newline]>", "\n")
    return (decoded_text[len(input_prompt):])

추론 과정에서는 GPU 메모리를 약 5.5 GB 활용

In [20]:
count_label_1 = sum(1 for sample in test_dataset if sample['label'] == 1)

count_label_0 = sum(1 for sample in test_dataset if sample['label'] == 0)

print("정답 긍정의 개수:", count_label_1)
print("정답 부정의 개수:", count_label_0)

정답 긍정의 개수: 508
정답 부정의 개수: 492


In [17]:
correct_predictions = 0
predict_0 = 0
predict_1 = 0
TP = 0
TN = 0

total_samples = len(test_dataset)

for i, query in enumerate(test_dataset):
    prediction = {i:wrapper_generate(model=trained_model, do_stream=True, input_prompt=prompt_template.format(System=default_system_msg, User=test_dataset[i]))}
    true_label = "부정" if test_dataset["label"][i] == 0 else "긍정"

    if prediction[i] == "긍정":
      predict_1 += 1
    if prediction[i] == "부정":
      predict_0 += 1

    if prediction[i] == true_label:
      correct_predictions += 1
      if prediction[i] == "긍정":
        TP += 1
      if prediction[i] == "부정":
        TN += 1

accuracy = correct_predictions / total_samples

긍정
긍정
부정
부정
부정
긍정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
부정
부정
긍정
부정
긍정
부정
긍정
부정
긍정
긍정
부정
긍정
긍정
긍정
부정
긍정
긍정
긍정
부정
긍정
부정
부정
부정
긍정
부정
긍정
부정
부정
부정
긍정
부정
긍정
긍정
부정
긍정
긍정
부정
부정
부정
긍정
긍정
긍정
부정
긍정
긍정
긍정
부정
부정
부정
부정
긍정
부정
긍정
긍정
긍정
긍정
부정
부정
부정
부정
부정
긍정
긍정
긍정
긍정
부정
부정
긍정
긍정
긍정
부정
긍정
긍정
부정
긍정
부정
부정
긍정
부정
긍정
부정
긍정
긍정
긍정
부정
긍정
긍정
부정
긍정
긍정
긍정
부정
긍정
부정
긍정
긍정
부정
긍정
긍정
긍정
긍정
부정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
부정
긍정
부정
긍정
부정
긍정
긍정
긍정
부정
긍정
긍정
부정
긍정
긍정
긍정
긍정
부정
부정
긍정
긍정
부정
긍정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
긍정
긍정
긍정
부정
긍정
부정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
부정
부정
부정
긍정
긍정
긍정
부정
긍정
부정
부정
긍정
긍정
긍정
부정
부정
긍정
긍정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
긍정
부정
긍정
긍정
부정
부정
긍정
긍정
긍정
긍정
긍정
긍정
긍정
부정
부정
긍정
긍정
긍정
부정
긍정
부정
긍정
긍정
긍정
긍정
부정
긍정
긍정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
부정
부정
긍정
긍정
긍정
부정
부정
긍정
부정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
긍정
긍정
부정
부정
부정
긍정
부정
부정
긍정
긍정
긍정
긍정
긍정
부정
긍정
긍정
긍정
긍정
긍정
긍정
부정
긍정
긍정
긍정
부정
부정
긍정
긍정
긍정
부정
긍정
부정
긍정
부정
긍정
긍정
부정
부정
부정
긍정
긍정
부정
긍정
긍정
긍정
긍정
부정
긍정
부정
긍정
부정
긍정
긍정
긍정
긍정
긍정
부정
부정
긍정
부정
긍정
긍정
긍정
부정
긍정
긍정
긍정
부정
부정
부정
부정
부정
긍정
부정
부정
긍정
긍정
부정
긍정
부정
긍정
긍정
부정
긍정
긍정
부정
긍

In [18]:
print(f"정확도: {accuracy * 100:.2f}%")

정확도: 89.00%


In [19]:
print(f"TP: {TP}")
print(f"TN: {TN}")

TP: 474
TN: 416


# Midm
***학습 3000번 스텝시 테스트 결과***

</br>

**정확도: 89.00%**

</br>

**혼동행렬(Confusion Matrix)**

||정답 Positive|정답 Negative|
|:------:|:------:|:------:|
|예측 Positive|474|76|
|예측 Negative|34|416|

</br>


**평가지표**

||||
|:------:|:------:|:------:|
|정밀도(Precision)|0.862|
|재현율(Recall)|0.933|
|F1 Score|0.927|

</br>


<성능 향상>
</br>
train 데이터 수와 TrainingArguments의 logging_steps와 max_steps 파라미터를 조절해가며 성능을 높이고자 하였다.

</br>


허깅페이스 업로드된 ID: ChloeKa/lora-midm-7b-food-order-understanding