In [1]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## load pretrained model

In [2]:
from transformers import LlamaForCausalLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('daily_tokenizer_0612')
model = LlamaForCausalLM.from_pretrained('daily_llama_0612')

model



Setting ds_accelerator to cuda (auto detect)


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(50000, 512, padding_idx=0)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=512, bias=False)
          (up_proj): Linear(in_features=512, out_features=1376, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_he

## get peft model

In [3]:
from peft import get_peft_model, LoraConfig, TaskType

In [4]:
list(TaskType)

[<TaskType.SEQ_CLS: 'SEQ_CLS'>,
 <TaskType.SEQ_2_SEQ_LM: 'SEQ_2_SEQ_LM'>,
 <TaskType.CAUSAL_LM: 'CAUSAL_LM'>,
 <TaskType.TOKEN_CLS: 'TOKEN_CLS'>]

In [5]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM,
                        inference_mode=False,
                        r=32,
                        lora_alpha=32,
                        lora_dropout=0.1)

model = get_peft_model(model, peft_config)
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50000, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=512, out_features=512, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
              (v_proj): Linear(


In [6]:
model.print_trainable_parameters()

trainable params: 262144 || all params: 64115200 || trainable%: 0.4088640447195049


## load dataset

In [7]:

from datasets import load_dataset

dataset_cate = load_dataset('heegyu/news-category-balanced-top10')

Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/heegyu___json/heegyu--news-category-balanced-top10-5f881f7cd497c7a8/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
categories = dataset_cate['train'].to_pandas().category.unique().tolist()
categories.sort()
categories = categories[:4]

dataset_cate = dataset_cate.filter(lambda element: element['category'] in categories)
dataset_cate

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/heegyu___json/heegyu--news-category-balanced-top10-5f881f7cd497c7a8/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-31c8659ca0784ee1.arrow


DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
        num_rows: 29026
    })
})

In [9]:
categories = [x.split(' ')[0].lower() for x in categories]
int2label_cate = {i: categories[i] for i in range(len(categories))}
label2int_cate = {int2label_cate[key]:key for key in int2label_cate}

In [10]:
def gen_label(element):
    category = element['category'].split(' ')[0].lower()
    return {'label': label2int_cate[category], 'category': category}

dataset_cate = dataset_cate.map(gen_label)
dataset_cate = dataset_cate['train'].train_test_split(test_size=0.1)
dataset_cate

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/heegyu___json/heegyu--news-category-balanced-top10-5f881f7cd497c7a8/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-7a735ffebc15a3a9.arrow


DatasetDict({
    train: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 26123
    })
    test: Dataset({
        features: ['link', 'headline', 'category', 'short_description', 'authors', 'date', 'label'],
        num_rows: 2903
    })
})

In [11]:
from datasets import DatasetDict
from datasets import concatenate_datasets
import random

prompt_format1_cate = """Given the article, what is the topic of the article? article: %s  answer: %s"""
prompt_format2_cate = """Determine the topic of the news article. article: %s answer: %s"""
prompt_format3_cate = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer: %s"""

prompts_cate = [prompt_format1_cate, prompt_format2_cate, prompt_format3_cate]

def gen_prompt_cate(element):
    prompt_format = prompts_cate[random.randint(0, len(prompts_cate)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'], int2label_cate[element['label']])})

train_cate = dataset_cate['train'].map(gen_prompt_cate, remove_columns=dataset_cate['train'].column_names)
train_dataset = train_cate

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

In [12]:
def tokenize(element):
    tokenizer.pad_token = tokenizer.eos_token
    outputs = tokenizer(
        element['input'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
        padding=True
    )

    return {"input_ids": outputs["input_ids"]}


context_length=128
tokenized_datasets = train_dataset.map(
    tokenize, batched=True, remove_columns=train_dataset.column_names
)
tokenized_datasets

Map:   0%|          | 0/26123 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 26123
})

## train

In [13]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 54])
attention_mask shape: torch.Size([5, 54])
labels shape: torch.Size([5, 54])


In [14]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="peft_llama",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1_000,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

In [15]:
trainer.train()



Step,Training Loss,Validation Loss


TrainOutput(global_step=816, training_loss=4.009374880323223, metrics={'train_runtime': 114.0944, 'train_samples_per_second': 228.96, 'train_steps_per_second': 7.152, 'total_flos': 362447135539200.0, 'train_loss': 4.009374880323223, 'epoch': 1.0})

## evaluation

In [16]:
model_load.eval()

prompt = """\
What is the topic of the collowing article? article: Boeing CEO says he assured Trump about Air Force One costs answer:"""
inputs = tokenizer(prompt, return_tensors="pt")
inputs.to(device)

# Generate
generate_ids = model_load.generate(input_ids=inputs.input_ids, max_length=40)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)[0]


"What is the topic of the collowing article? article: Boeing CEO says he assured Trump about Air Force One costs answer: business/entering the 'Socusing'  answer: business/"

In [17]:
tokenizer = AutoTokenizer.from_pretrained("daily_tokenizer_0612", padding_side='left')
prompt_format1 = """Given the article, what is the topic of the article? article: %s  answer:"""
prompt_format2 = """Determine the topic of the news article. article: %s answer:"""
prompt_format3 = """What is this article about? business/entertainment/food/healthy/parenting article: %s answer:"""

prompts = [prompt_format1, prompt_format2, prompt_format3]

def gen_valid_prompt_cate(element):
    prompt_format = prompts[random.randint(0, len(prompts)-1)]
    return DatasetDict({'input': prompt_format%(element['headline'])})




valid_dataset = dataset_cate['test'].map(gen_valid_prompt_cate)

context_length=128
valid_dataset = valid_dataset.map(
    tokenize, batched=True, remove_columns=['link', 'headline', 'category', 'short_description', 'authors', 'date', 'input']
)
valid_dataset

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Map:   0%|          | 0/2903 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 2903
})

In [18]:
from torch.utils.data import DataLoader

batch_size=4
val_ds = valid_dataset.select(range(100))
val_ds.set_format(type='torch')
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [20]:
model.eval()

val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']
    
    input_id = batch['input_ids'].to(device)
    
    pred = model.generate(input_ids=input_id, max_length=70)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]
    
    val_acc += acc(decoded_pred, label)
    
print("val acc: ", val_acc/len(val_dl.dataset))

100%|███████████████████████████████████████████| 25/25 [00:02<00:00,  9.25it/s]

val acc:  0.51





In [26]:
model.save_pretrained('peft_llama_adapter__')

## model size

In [27]:
import os
os.stat('peft_llama_adapter__/adapter_model.bin').st_size/(1024*1024)

1.0055246353149414

In [28]:
os.stat('daily_llama_0612/pytorch_model.bin').st_size/(1024*1024)

243.59453010559082

## load model

In [29]:
model_load =  LlamaForCausalLM.from_pretrained('daily_llama_0612')
model_load = PeftModel.from_pretrained(model_load, 'peft_llama_adapter')
model_load.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(50000, 512, padding_idx=0)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=512, out_features=512, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=512, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=512, out_features=512, bias=False)
              (v_proj): Linear(


In [19]:
from tqdm import tqdm
import re
import torch

def acc(pred,label):
    return torch.sum(torch.tensor(pred) == label.squeeze()).item()


In [20]:
model_load.eval()
model_load.to(device)

val_acc = 0

for step, batch in enumerate(tqdm(val_dl)):
    label = batch['label']
    
    input_id = batch['input_ids'].to(device)
    
    pred = model_load.generate(input_ids=input_id, max_length=70)
    decoded_pred = tokenizer.batch_decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    decoded_pred = [re.findall("answer: ([a-z]+)", x)[0] if re.findall("answer: ([a-z]+)", x) else 'none' for x in decoded_pred]
    decoded_pred = [label2int_cate[x] if x in label2int_cate else -1 for x in decoded_pred]
    
    val_acc += acc(decoded_pred, label)
    
print("val acc: ", val_acc/len(val_dl.dataset))

100%|███████████████████████████████████████████| 25/25 [00:02<00:00,  9.25it/s]

val acc:  0.51



