In [1]:
import transformers
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import sys
from typing import List

import fire
import torch
import torch.nn as nn
from datasets import load_dataset
import pandas as pd

from pylab import rcParams
import json

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/proj_aki/anaconda3/envs/AI620/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/proj_aki/anaconda3/envs/AI620/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/proj_aki/anaconda3/envs/AI620/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [3]:
model = AutoModelForCausalLM.from_pretrained("beomi/KoAlpaca-Polyglot",
                                             torch_dtype=torch.float16,
                                             low_cpu_mem_usage=True,
                                             device_map='auto',
                                             load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-5.8b")

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

model.config.max_length = 1024
model.config.pad_token_id = 0

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 3670016 || all params: 5888729088 || trainable%: 0.06232271760435925


In [5]:
data = load_dataset("json", data_files="Ko_En_QA_dataset.json")

Downloading and preparing dataset json/default to /home/proj_aki/.cache/huggingface/datasets/json/default-2750e3d35e9691e6/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/proj_aki/.cache/huggingface/datasets/json/default-2750e3d35e9691e6/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
CUTOFF_LEN = 256

In [7]:
def generate_prompt(data_point):    
    return f"""Below is an instruction that describes a task, paired with an input that provides further context.\n아래는 작업을 설명하는 명령어와 추가적 맥락을 제공하는 입력이 짝을 이루는 예제입니다.\n\nWrite a response that appropriately completes the request.\n요청을 적절히 완료하는 응답을 작성하세요.\n\n
### Instruction:
{data_point["instruction"]}
### Input:
{data_point["input"]}
### Response:
{data_point["output"]}"""

In [8]:
def tokenize(prompt, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [9]:
train_val = data["train"].train_test_split(
    test_size=int((data['train'].num_rows)*0.1), shuffle=True, seed=42
)
train_data = (
    train_val["train"].shuffle().map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].shuffle().map(generate_and_tokenize_prompt)
)

Map:   0%|          | 0/3743 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

In [10]:
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

BATCH_SIZE = 32
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300
OUTPUT_DIR = "experiments"

In [12]:
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    #target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 3670016 || all params: 5888729088 || trainable%: 0.06232271760435925


In [13]:
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard" 
)

In [14]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [15]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
model.config.use_cache = False
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

model = torch.compile(model)

trainer.train()
model.save_pretrained(OUTPUT_DIR)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,0.413,0.274726
100,0.1325,0.129422
150,0.1192,0.117835
200,0.1124,0.113358
250,0.1063,0.110799
300,0.1046,0.109183


There were missing keys in the checkpoint model loaded: ['base_model.model.base_model.model.gpt_neox.embed_in.weight', 'base_model.model.base_model.model.gpt_neox.layers.0.input_layernorm.weight', 'base_model.model.base_model.model.gpt_neox.layers.0.input_layernorm.bias', 'base_model.model.base_model.model.gpt_neox.layers.0.post_attention_layernorm.weight', 'base_model.model.base_model.model.gpt_neox.layers.0.post_attention_layernorm.bias', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.bias', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.masked_bias', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.rotary_emb.inv_freq', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.query_key_value.weight', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.query_key_value.bias', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.query_key_value.lora_A.default.weight', 'base_model.model.base_model.model.gpt_neox.la

There were unexpected keys in the checkpoint model loaded: ['base_model.model.base_model.model.gpt_neox.layers.0.attention.query_key_value.lora_A.weight', 'base_model.model.base_model.model.gpt_neox.layers.0.attention.query_key_value.lora_B.weight', 'base_model.model.base_model.model.gpt_neox.layers.1.attention.query_key_value.lora_A.weight', 'base_model.model.base_model.model.gpt_neox.layers.1.attention.query_key_value.lora_B.weight', 'base_model.model.base_model.model.gpt_neox.layers.2.attention.query_key_value.lora_A.weight', 'base_model.model.base_model.model.gpt_neox.layers.2.attention.query_key_value.lora_B.weight', 'base_model.model.base_model.model.gpt_neox.layers.3.attention.query_key_value.lora_A.weight', 'base_model.model.base_model.model.gpt_neox.layers.3.attention.query_key_value.lora_B.weight', 'base_model.model.base_model.model.gpt_neox.layers.4.attention.query_key_value.lora_A.weight', 'base_model.model.base_model.model.gpt_neox.layers.4.attention.query_key_value.lora_B

In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
model.push_to_hub("lizim/alpaca-ko-en-translation", use_auth_token=True)

CommitInfo(commit_url='https://huggingface.co/lizim/alpaca-ko-en-translation/commit/ced4a9fe5db69eb997460b20e9e8db48632b5c04', commit_message='Upload model', commit_description='', oid='ced4a9fe5db69eb997460b20e9e8db48632b5c04', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
#!git clone https://github.com/tloen/alpaca-lora.git
%cd alpaca-lora
!git checkout a48d947

/home/proj_aki/AI620-final/alpaca-lora
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
M	generate.py
HEAD is now at a48d947 把中文LoRA放在一起


In [19]:
!python generate.py \
    --load_8bit \
    --base_model 'beomi/KoAlpaca-Polyglot' \
    --lora_weights 'lizim/alpaca-ko-en-translation' \
    --share_gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/proj_aki/anaconda3/envs/AI620/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/proj_aki/anaconda3/envs/AI620/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/proj_aki/anaconda3/envs/AI620/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:09<00:00,  4.96s/it]
R