모델의 전체 파라미터를 그대로 학습하는 대신에 대부분의 파라미터는 얼리고 일부의 파라미터만 사용함으로써 리소스를 절약

라지 랭기지 모델을 fully traing할때 발생하는 catastrophic forgetting(새로운 task를 풀기위해 학습을 더 시키면 이전에 학습한것들 일부를 잊어버리는문제) 를 극복할 수 있다.

In [1]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
from transformers import LlamaForCausalLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0612')
model = LlamaForCausalLM.from_pretrained('daily_llama_0612')

model
# 4개의 라마 어텐션 디코더 레이어가 있고, mlp layer가 3개가 있다.

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50257, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): L

# Efficient Fine Tuning : PEFT(PrintTrainableParameter)
PEFT라는 패키지를 통해 실행할 수 있다. 여기서 Get PFT모델과 LoraConfig Task Type들을 import 해준다.

In [3]:
from peft import get_peft_model, LoraConfig, TaskType

In [4]:
list(TaskType)

[<TaskType.SEQ_CLS: 'SEQ_CLS'>,
 <TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
 <TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
 <TaskType.TOKEN_CLS: 'TOKEN_CLS'>,
 <TaskType.QUESTION_ANS: 'QUESTION_ANS'>,
 <TaskType.FEATURE_EXTRACTION: 'FEATURE_EXTRACTION'>]

In [5]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        inference_mode=False, #학습을 시킬것이기 때문에 False
                        r=32, #r이 작을수록 trainable한 파라미터 수가 적어지고, 클수록 trainable한 파라미터가 커진다.
                        lora_alpha=32, #sacling factor
                        lora_dropout=0.1
                        )

model = get_peft_model(model, peft_config)
model.to(device)
# PeftModelForCausalLM으로바뀌었다

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50257, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=512, out_features=512, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
 

In [6]:
# PEFT METHOD를 통해서 학습 가능한 파라미터가 몇개인지 확인
model.print_trainable_parameters() 
#6만개를 학습하고 나머지는 얼려놓는다. r의 크기가 크다면, trainable한 parameter가 커질컷이다.

trainable params: 262,144 || all params: 64,378,368 || trainable%: 0.40719267689420147


In [7]:
from datasets import load_dataset

dataset_cate = load_dataset('heegyu/news-category-balanced-top10')
dataset_cate

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 83878
    })
})

In [8]:
# dataset label 정제
categories = dataset_cate['train'].to_pandas().category.unique().tolist()
categories.sort()
categories = categories[:4]
categories

['BUSINESS', 'ENTERTAINMENT', 'FOOD & DRINK', 'HEALTHY LIVING']

In [9]:
dataset_cate = dataset_cate.filter(lambda element: element['category'] in categories)
dataset_cate

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 29026
    })
})

In [10]:
# label과 integer사이를 mapping하는 딕셔너리
categories = [x.split(' ')[0].lower() for x in categories] # 첫번째 단어만 소문자로
int2label_cate = {i: categories[i] for i in range(len(categories))}
label2int_cate = {int2label_cate[key]:key for key in int2label_cate}
print(categories)
print(int2label_cate)
print(label2int_cate)

['business', 'entertainment', 'food', 'healthy']
{0: 'business', 1: 'entertainment', 2: 'food', 3: 'healthy'}
{'business': 0, 'entertainment': 1, 'food': 2, 'healthy': 3}


In [12]:
def gen_label(element):
    category = element['category'].split(' ')[0].lower()
    return {"label": label2int_cate[category], 'category': category}

dataset_cate = dataset_cate.map(gen_label)
dataset_cate = dataset_cate['train'].train_test_split(test_size=0.1)
dataset_cate

Map:   0%|          | 0/29026 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 26123
    })
    test: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 2903
    })
})

In [13]:
# prompt - article이 주어졌을때 category를 분류
from datasets import DatasetDict
from datasets import concatenate_datasets
import random

prompt_format1_cate = """Given the article, what is the topic of the article? article: %s  answer: %s"""
prompt_format2_cate = """Determine the topic of the news article. article: %s answer: %s"""
prompt_format3_cate = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer: %s"""

prompts_cate = [prompt_format1_cate, prompt_format2_cate, prompt_format3_cate]

def gen_prompt_cate(element):
    prompt_format = prompts_cate[random.randint(0, len(prompts_cate)-1)] #prompt의 분배는 랜덤하게
    return DatasetDict({'input': prompt_format%(element['headline'], int2label_cate[element['label']])}) #headline과 label로 채워준다.

train_cate = dataset_cate['train'].map(gen_prompt_cate, remove_columns=dataset_cate['train'].column_names)
train_dataset = train_cate

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

In [14]:
#tokenize
def tokenize(element):
    tokenizer.pad_token = tokenizer.eos_token
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
        padding=True
    )
    return {"input_ids": outputs['input_ids']}

context_length=128
tokenized_datasets = train_dataset.map(
    tokenize, batched=True, remove_columns=train_dataset.column_names
)
tokenized_datasets

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 26123
})

In [21]:
#Data_Collator
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key}.shape : {out[key].shape}")

input_ids.shape : torch.Size([5, 49])
attention_mask.shape : torch.Size([5, 49])
labels.shape : torch.Size([5, 49])


In [22]:
#Training Argument, Trainer
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir='peft_llama',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='steps',
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type='cosine',
    learning_rate=5e-4,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss


TrainOutput(global_step=816, training_loss=3.8046477074716605, metrics={'train_runtime': 176.5333, 'train_samples_per_second': 147.978, 'train_steps_per_second': 4.622, 'total_flos': 353064342478848.0, 'train_loss': 3.8046477074716605, 'epoch': 1.0})

In [25]:
tokenizer = AutoTokenizer.from_pretrained("daily_tokenizer_0612", padding_side='left')
prompt_format1 = """Given the article, what is the topic of the article? article: %s  answer:"""
prompt_format2 = """Determine the topic of the news article. article: %s answer:"""
prompt_format3 = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer:"""

prompts = [prompt_format1, prompt_format2, prompt_format3]

def gen_valid_prompt_cate(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'])})

valid_dataset = dataset_cate['test'].map(gen_valid_prompt_cate)

context_length=128
valid_dataset = valid_dataset.map(
    tokenize, batched=True, remove_columns=['link', 'headline', 'category', 'short_description', 'authors', 'date', 'input']
)
valid_dataset

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 2903
})

In [26]:
from torch.utils.data import DataLoader

batch_size=4
val_ds = valid_dataset.select(range(100))
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)


In [27]:
from tqdm import tqdm
import re
import torch

def acc(pred, label):
    return torch.sum(torch.tensor(pred) == label.squeeze()).item()

In [29]:
model.eval()
model.to(device)

val_losses = []
val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']
    input_id = batch['input_ids'].to(device)

    pred = model.generate(input_ids=input_id, max_length=128) #input_ids가 다르다.
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False) #자연어 바꾸기
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred] # integer label로 변환

    val_acc += acc(decoded_pred, label)

print("vall acc: ", val_acc/len(val_dl.dataset))

100%|██████████| 25/25 [00:19<00:00,  1.31it/s]

vall acc:  0.61





In [30]:
# 일부의 작은 model을 쓰기 때문에 accuracy는 떨어진다.
tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)

["Given the article, what is the topic of the article? article: Holy Mother Of All That Is Good, 'Curb Your Enthusiasm' Is Officially Back  answer: entertainment  answer: entertainment  answer: entertainment  answer: entertainment  answer: entertainment  answer: entertainment  answer: entertainment, It Is A Child  answer: entertainment (VIDEO) answer: food/ity) answer: food, It Is A True answer: food answer: food answer: food answer: food answer: food answer: food, food, food",
 "Determine the topic of the news article. article: Chevron Chutzpah Knows No Bounds In Statements About Ecuadorians' Contamination Case answer: food answer: food answer: food answer: food answer: food answer: food answer: food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food food answer: food",
 'Determine the topic of the news articl

In [31]:
model.save_pretrained('peft_llama_adapter__')

### 저장된 모델의 size 확인

In [34]:
import os
os.stat('peft_llama_adapter__/adapter_model.safetensors').st_size/(1024*1024)

1.0020065307617188

In [35]:
os.stat('daily_llama_0612/model.safetensors').st_size/(1024*1024)

244.58809661865234

모델의 size가 244정도 줄었다

In [42]:
os.stat('fake_detect_llama/model.safetensors').st_size/(1024*1024)

244.58809661865234

In [43]:
    os.stat('llama_combined_0618/model.safetensors').st_size/(1024*1024)

244.58809661865234

In [39]:
from peft import PeftModel
from transformers import LlamaForCausalLM

model_load = LlamaForCausalLM.from_pretrained('daily_llama_0612') #base model
model_load = PeftModel.from_pretrained(model_load, 'peft_llama_adapter__')
model_load

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50257, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=512, out_features=512, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
 