In [1]:
from peft import prepare_model_for_kbit_training
import torch
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id = "beomi/Llama-3-Open-Ko-8B-Instruct-preview" #Llama3모델을 이용해서 학습합니다.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             return_dict=True,
                                             torch_dtype=torch.float16,
                                             device_map='auto',
                                             load_in_8bit=True
                                            )
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:30<00:00,  7.59s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [3]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [4]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(r=16, lora_alpha=32, target_modules=["k_proj","o_proj","q_proj","v_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, config)
model.print_trainable_parameters()
model

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear8bitLt(
                (base_layer): 

In [5]:
import pandas as pd

df = pd.read_csv('preprocessing_df.csv')

In [6]:
import json

json_result = df.apply(lambda row: {"input": row['Prev_Message'], "output": row['My_Response']}, axis=1).tolist() #json형식으로 변경

In [7]:
#JSON으로 저장
with open('dataset.json', 'w', encoding='utf-8') as f:
    json.dump(json_result, f, ensure_ascii=False, indent=4)

In [8]:
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset

# JSON 데이터 로드
with open('dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# 모델 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("beomi/Llama-3-Open-Ko-8B-Instruct-preview")

# 최대 시퀀스 길이 설정
max_sequence_length = 4096

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
def prompt_chat_completion(dialog, system_token="### System:", user_token="### Instruction:", assistant_token="### Response:", start_token="", end_token=""):
    role_dict = {"input": user_token, "output": assistant_token}
    
    def format_exchange(exchange):
        return (
            f"{role_dict['input']}\n"
            f"{start_token}{exchange['input'].strip()}{end_token}\n\n"
            f"{role_dict['output']}\n"
            f"{start_token}{exchange['output'].strip()}{end_token}</s>\n\n"
        )
    
    dialog_text = [format_exchange(exchange) for exchange in dialog]
    dialog_tokens = ''.join(dialog_text).replace('\n\n</s>\n\n', '</s>\n\n')
    
    return f'<s>{dialog_tokens}'

In [10]:
split_data = [prompt_chat_completion([item]) for item in data]

In [11]:
split_data[1]

'<s>### Instruction:\n나 군대 선임중에 유위웅이라는 사람 있었음 유위웅~~ 위웅 위웅~\n\n### Response:\n위웅</s>\n\n'

In [12]:
# CustomDataset 정의
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_sequence_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        conversation = self.data[idx]
        
        prompt_templete = conversation
    
        # 텍스트를 토큰화하고 인코딩
        encoding = self.tokenizer(
            prompt_templete,
            truncation=True,
            max_length=self.max_sequence_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

In [13]:
custom_dataset = CustomDataset(split_data, tokenizer, max_sequence_length)

In [15]:
tokenizer.pad_token = tokenizer.eos_token #패딩토큰 설정

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

trainer = Trainer(
    model=model,
    train_dataset=custom_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="./output",
        optim='adamw_8bit'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False 
trainer.train()
trainer.save_model("./lora")

In [19]:
from transformers import StoppingCriteria, StoppingCriteriaList
from transformers import TextStreamer
import torch

# 중지 기준이 되는 토큰들이 생성된 텍스트에 포함되는지 확인하는 클래스
class _SentinelTokenStoppingCriteria(StoppingCriteria):

    def __init__(self, sentinel_token_ids: torch.LongTensor, starting_idx: int):
        StoppingCriteria.__init__(self)
        self.sentinel_token_ids = sentinel_token_ids  # 중지 기준 토큰 ID
        self.starting_idx = starting_idx  # 시작 인덱스

    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]
            if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]:
                continue
            for window in trimmed_sample.unfold(0, self.sentinel_token_ids.shape[-1], 1):
                if torch.all(torch.eq(self.sentinel_token_ids, window)):
                    return True  # 중지 기준 토큰 발견 시 True 반환
        return False

# 여러 중지 조건을 받아서 StoppingCriteriaList 반환
def stopping_criteria_list(stopping_list:list, tokenizer):
    stop_tokens = []
    for i in stopping_list:
        token = tokenizer(i, add_special_tokens=False, return_tensors="pt")
        stop_token = token.input_ids.cuda()
        stop_tokens.append(stop_token)

    sentinel_tokens = []
    for j in stop_tokens:
        sentinel_tokens.append(_SentinelTokenStoppingCriteria(
            sentinel_token_ids=j, starting_idx=token.input_ids.shape[-1]
        ))

    return StoppingCriteriaList(sentinel_tokens)

# 중지 조건 리스트 정의 및 생성
early_stopping_list = stopping_criteria_list(["####","Instruct", "Instruction","\n#",'</s>'], tokenizer=tokenizer)

In [20]:
from transformers import TextStreamer

p = "뭐 하고 싶어?"
input_ids = tokenizer(f"### Instruction:\n{p}\n\n### Response:", max_length=128, truncation=True, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids,
                        streamer=TextStreamer(tokenizer),
                        stopping_criteria=early_stopping_list,
                        temperature=0.7,
                        repetition_penalty=1.5,
                        max_new_tokens=128,
                       )
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>### Instruction:
뭐 하고 싶어?

### Response: 
게임

Instruction


tensor([[128000,  14711,  30151,    512,    167,  99834, 107973, 107719,  32179,
           1980,  14711,   6075,     25,    720, 111807,    271,  17077]],
       device='cuda:0')

In [39]:
from transformers import TextStreamer

In [72]:
p = "너가 제일 좋아하는 음식은 뭐야?"
input_ids = tokenizer(f"### Instruction:\n{p}\n\n### Response:", max_length=128, truncation=True, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids,
                        streamer=TextStreamer(tokenizer),
                        stopping_criteria=early_stopping_list,
                        temperature=0.7,
                        repetition_penalty=1.5,
                        max_new_tokens=128,
                       )
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>### Instruction:
너가 제일 좋아하는 음식은 뭐야?

### Response: 
아무래도 피자

Instruction


tensor([[128000,  14711,  30151,    512, 105078,  20565,  63171,  33177, 117004,
          44005, 106318,  77437,  34804, 113792,  90759,   1980,  14711,   6075,
             25,    720,  54059, 100981,  54542,  49085, 104064,  26799,    271,
          17077]], device='cuda:0')

In [74]:
p = "너가 제일 과목은 뭐야?"
input_ids = tokenizer(f"### Instruction:\n{p}\n\n### Response:", max_length=128, truncation=True, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids,
                        streamer=TextStreamer(tokenizer),
                        stopping_criteria=early_stopping_list,
                        temperature=0.7,
                        repetition_penalty=1.5,
                        max_new_tokens=128,
                       )
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>### Instruction:
너가 제일 과목은 뭐야?

### 



Response: 
과학이랑 수리

Instruction


tensor([[128000,  14711,  30151,    512, 105078,  20565,  63171,  33177, 104219,
          88708,  34804, 113792,  90759,   1980,  14711,   6075,     25,    720,
          54780, 100508,  13094, 102581,  29833,  29102,    271,  17077]],
       device='cuda:0')

In [78]:
p = "해외여행 간다면 어디?"
input_ids = tokenizer(f"### Instruction:\n{p}\n\n### Response:", max_length=128, truncation=True, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids,
                        streamer=TextStreamer(tokenizer),
                        stopping_criteria=early_stopping_list,
                        temperature=0.7,
                        repetition_penalty=1.5,
                        max_new_tokens=128,
                       )
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>### Instruction:
해외여행 간다면 어디?

### Response: 
일본

Instruction


tensor([[128000,  14711,  30151,    512,  34983, 104065,  58126, 101066, 105131,
         115300, 117337,   1980,  14711,   6075,     25,    720, 123256,    271,
          17077]], device='cuda:0')