- huggingface에서 제공하는 llama adapter를 활용해 finetuning 하는 코드

---

## Import

In [None]:
import torch

# Query CUDA availability once, report it, and pick the matching device string.
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = "cuda" if cuda_available else "cpu"

In [None]:
!pip install peft
!pip install bitsandbytes
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    GPT2LMHeadModel,
    PreTrainedTokenizerFast,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    AutoTokenizer
)
from peft import (
    get_peft_model,
    LoraConfig,
    AdaptionPromptConfig
)
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

In [None]:
model_name = "beomi/llama-2-ko-7b"

# Quantization settings: load the weights in 4-bit using the "nf4" quant type,
# without double quantization, and with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
)

# Alternative LoRA configuration, kept for reference (currently disabled):
# config = LoraConfig(
#     lora_alpha=8,
#     lora_dropout=0.1,
#     r=4,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# Adaption-prompt configuration actually used: adapter_len=10 and
# adapter_layers=30 for a causal language model.
config = AdaptionPromptConfig(
    task_type="CAUSAL_LM",
    adapter_len=10,
    adapter_layers=30,
)

# Load the quantized base model, adjust its config, then wrap it with the
# PEFT adapter described by `config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False
model.enable_input_require_grads()
model = get_peft_model(model, config)

In [None]:
# Tokenizer of the base checkpoint; "</s>" is used as both the EOS token and
# the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    pad_token="</s>",
    eos_token="</s>",
)

In [None]:
# Show which tokens occupy vocabulary ids 0 through 4, then the id that the
# tokenizer uses for padding.
for token_id in range(5):
    print(tokenizer.convert_ids_to_tokens(token_id))

print(tokenizer.pad_token_id)

In [None]:
# print(tokenizer.pad_token)
# if tokenizer.pad_token is None:
#     print("2")
#     tokenizer.pad_token = "[PAD]"
#     tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# print(tokenizer.pad_token)
# print(tokenizer.pad_token_id)
# print(tokenizer.eos_token_id)

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the training CSV is read
# from this mount in the data-preprocessing cell.
from google.colab import drive
drive.mount('/content/drive')

## Data Preprocessing

In [None]:
# Load the training data
data = pd.read_csv('/content/drive/MyDrive/hansol_train.csv')
display(data.head(2))

# Format and tokenize the data: every row yields one training sequence per
# (question column, answer column) pairing, laid out as
# "<question><eos><answer>" and encoded as a tensor via return_tensors='pt'.
# NOTE(review): no EOS token is appended after the answer, so the training
# text never shows where an answer ends - confirm this is intended.
formatted_data = []
# iterrows() has no length, so pass the row count for a real progress bar.
for _, row in tqdm(data.iterrows(), total=len(data)):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # input_text / input_ids are left bound to the last example after
            # the loop; the following cell prints them.
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

In [None]:
# Inspect the values left in input_text / input_ids by the formatting loop,
# i.e. the last example that was built and tokenized.
print(input_text)
print(input_ids)

## Model Fine-tuning

In [None]:
# Training setup
optimizer = AdamW(model.parameters(), lr=0.001)
model.train()
# Keep the epoch count in its own name: reusing `epoch` as the loop variable
# overwrote it and made the summary line print "Epoch 1/0", "Epoch 2/1", ...
num_epochs = 3

# Training loop: each item of formatted_data is fed to the model as one batch.
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # Move the batch to the selected device
        batch = batch.to(device)
        # The inputs are passed as their own labels; the model returns the loss.
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Show the running average loss on the progress bar
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # Print the average loss of the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {total_loss / len(formatted_data)}")

# Save the model and the tokenizer
model.save_pretrained("./temp_model")
tokenizer.save_pretrained("./temp_tokenizer")

## Model Inference

In [None]:
# Reload the saved fine-tuned model and tokenizer.
# save_pretrained() on a PEFT-wrapped model stores only the adapter, so the
# quantized base model is loaded first and the adapter is attached with
# PeftModel.from_pretrained. Pointing AutoModelForCausalLM at the adapter
# directory does not restore an adaption-prompt adapter.
from peft import PeftModel

model_dir = "/content/temp_model"
tokenizer_dir = "/content/temp_tokenizer"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)
model = PeftModel.from_pretrained(base_model, model_dir)
# No model.to(device): the 4-bit quantized model is placed at load time.
# NOTE(review): this assumes a CUDA device, as the training cell does - confirm.
model.eval()
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# List that collects the generated answer for each question
preds = []

# Generate an answer for each question in the '질문_1' column.
# `train` was never defined in this notebook; the frame is loaded as `data`.
# NOTE(review): the original comment mentioned test.csv - swap in the test
# frame and its question column once it is loaded.
for test_question in tqdm(data['질문_1']):
    # Tokenize "<question><eos>" into model input and move it to the device
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt').to(device)
    prompt_length = input_ids.shape[-1]

    # Generate the answer
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # Store the generated text (answer)
    for generated_sequence in output_sequences:
        # Decode only the tokens produced after the prompt. Searching the
        # decoded text for the EOS string sliced from a bogus offset when the
        # marker was missing and left trailing special tokens in the answer.
        answer_only = tokenizer.decode(
            generated_sequence[prompt_length:], skip_special_tokens=True
        ).strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)