In [1]:
import transformers
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import sys
from typing import List

import torch
import torch.nn as nn
from datasets import load_dataset
import pandas as pd

from pylab import rcParams
import json

# Expose only GPU index 0 to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [2]:
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/daniel/anaconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/daniel/anaconda3/envs/llama/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/daniel/anaconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [None]:
# Load the base causal LM with 8-bit weights; `device_map="auto"` leaves
# device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    "beomi/KoAlpaca-Polyglot",
    load_in_8bit=True,
    device_map='auto',
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# The tokenizer is fetched from a different Hub repository than the weights.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/polyglot-ko-5.8b")

# Pad on the left and use token id 0 as the padding id.
# NOTE(review): the original comment described id 0 as "unk" and wanted it to
# differ from the EOS id -- confirm what id 0 maps to in this tokenizer.
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 0

# Keep the model config in step with the tokenizer's padding id and set the
# default maximum length stored on the config.
model.config.pad_token_id = 0
model.config.max_length = 1024

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Run PEFT's int8-training preparation on the loaded model.
model = prepare_model_for_int8_training(model)

# LoRA adapter settings: rank 8, alpha 16, dropout 0.05, applied to the
# modules named "query_key_value"; bias parameters are not adapted.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters and report how many parameters will train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 3670016 || all params: 5888729088 || trainable%: 0.06232271760435925


In [88]:
# Load the local JSON file through the `datasets` JSON builder. The path is
# relative, so it is resolved against the kernel's working directory. The
# rows are accessed later as data["train"].
data = load_dataset("json", data_files="Ko_En_QA_dataset.json")

Downloading and preparing dataset json/default to /home/daniel/.cache/huggingface/datasets/json/default-e8bfeb01ba659f57/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/daniel/.cache/huggingface/datasets/json/default-e8bfeb01ba659f57/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [89]:
# Maximum number of tokens kept per example: `tokenize` passes this as
# `max_length` with truncation enabled and never appends EOS beyond it.
CUTOFF_LEN = 256

In [111]:
def generate_prompt(data_point):
    """Render one dataset record as the text prompt used for fine-tuning.

    Two layouts exist, selected by ``data_point['translation']``:

    * not ``None`` -- instruction, source sentence, the past question to the
      user, the user's past response, and the translation;
    * ``None`` -- instruction, source sentence, and the question to the user.

    Values are interpolated with default string formatting, so a ``None``
    field other than the selector is rendered as the literal text ``None``.

    Args:
        data_point: Mapping with the keys ``instruction``, ``src_sentence``,
            ``q_to_user`` and ``translation``; ``users_response`` is read
            only for the translation layout.

    Returns:
        The prompt string, ending with a newline.

    Raises:
        KeyError: If a key required by the selected layout is missing.
    """
    if data_point['translation'] is None:
        question_layout = (
            "{instruction}\n"
            "\n"
            "### Source Sentence:\n"
            "{src_sentence}\n"
            "\n"
            "### Question to User:\n"
            "{q_to_user}\n"
        )
        return question_layout.format(
            instruction=data_point['instruction'],
            src_sentence=data_point['src_sentence'],
            q_to_user=data_point['q_to_user'],
        )

    translation_layout = (
        "{instruction}\n"
        "\n"
        "### Source Sentence:\n"
        "{src_sentence}\n"
        "\n"
        "### Past Question to User:\n"
        "{q_to_user}\n"
        "\n"
        "### Past User's Response:\n"
        "{users_response}\n"
        "\n"
        "### Translation:\n"
        "{translation}\n"
    )
    return translation_layout.format(
        instruction=data_point['instruction'],
        src_sentence=data_point['src_sentence'],
        q_to_user=data_point['q_to_user'],
        users_response=data_point['users_response'],
        translation=data_point['translation'],
    )

In [120]:
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` for causal-LM training and attach labels.

    Uses the module-level ``tokenizer`` and ``CUTOFF_LEN``. The prompt is
    truncated to ``CUTOFF_LEN`` tokens without padding. When
    ``add_eos_token`` is true, the sequence does not already end with the
    EOS id, and there is still room below ``CUTOFF_LEN``, one EOS id is
    appended (with a matching attention-mask entry of 1).

    A prompt that tokenizes to zero ids is handled explicitly: it no longer
    raises ``IndexError`` and receives the EOS id like any other sequence.

    Args:
        prompt: Text to tokenize.
        add_eos_token: Whether to append the EOS id when it is missing.

    Returns:
        The tokenizer's result with an added ``"labels"`` entry that is a
        copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # Guard the empty case before peeking at the last id; indexing [-1] on an
    # empty list is what used to crash here.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < CUTOFF_LEN:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels are an independent copy so later edits to one list do not leak
    # into the other.
    result["labels"] = input_ids.copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Build the text prompt for ``data_point`` and return its tokenized form.

    Chains ``generate_prompt`` and ``tokenize`` (with its default
    arguments); the return value is whatever ``tokenize`` returns.
    """
    return tokenize(generate_prompt(data_point))

In [121]:
# Hold out 10% of the rows (rounded down) for validation, with a fixed seed.
train_val = data["train"].train_test_split(
    test_size=int((data['train'].num_rows)*0.1), shuffle=True, seed=42
)

# Shuffle each split and tokenize every row. The shuffles are seeded too, so
# row order -- and therefore positional lookups such as train_data[0] -- is
# the same on every run instead of changing with each kernel restart.
train_data = (
    train_val["train"].shuffle(seed=42).map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].shuffle(seed=42).map(generate_and_tokenize_prompt)
)

Loading cached split indices for dataset at /home/daniel/.cache/huggingface/datasets/json/default-e8bfeb01ba659f57/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-014e4c15985fdd72.arrow and /home/daniel/.cache/huggingface/datasets/json/default-e8bfeb01ba659f57/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-eee494f5195d05e8.arrow


Map:   0%|          | 0/1691 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [11]:
# --- LoRA adapter hyperparameters ---
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
# Module names the adapters attach to. Aligned with the "query_key_value"
# target used when this same model was first wrapped earlier in the
# notebook; the previous ["q_proj", "v_proj"] value did not agree with it.
LORA_TARGET_MODULES = [
    "query_key_value",
]

# --- Optimisation schedule ---
# BATCH_SIZE is the effective batch; it is reached by accumulating gradients
# over BATCH_SIZE // MICRO_BATCH_SIZE micro-batches.
BATCH_SIZE = 32
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
# NOTE(review): 30000 optimizer steps at an effective batch of 32 is a very
# large number of passes if the training split is only a couple of thousand
# rows -- confirm this is intended.
TRAIN_STEPS = 30000
OUTPUT_DIR = "experiments"

In [12]:
# Attach LoRA adapters exactly once. A PEFT-wrapped model carries a
# `peft_config` attribute, so when an earlier cell has already wrapped
# `model` this cell leaves it alone instead of preparing and wrapping it a
# second time.
if not hasattr(model, "peft_config"):
    model = prepare_model_for_int8_training(model)
    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        # target_modules is deliberately left unset so PEFT chooses its
        # default for this architecture; the trainable-parameter count
        # printed below matches the explicit ["query_key_value"] setup.
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

# Report the trainable-parameter count in either case.
model.print_trainable_parameters()

trainable params: 3670016 || all params: 5888729088 || trainable%: 0.06232271760435925


In [131]:
training_arguments = transformers.TrainingArguments(
    # Where checkpoints go and where metrics are reported.
    output_dir=OUTPUT_DIR,
    report_to="tensorboard",
    # Batch geometry: micro-batch per device, accumulated up to the
    # effective batch size.
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    max_steps=TRAIN_STEPS,
    fp16=True,
    # Log every 10 steps; evaluate and checkpoint every 50 steps, keeping at
    # most 3 checkpoints and reloading the best one when training ends.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [132]:
# Batch collator: pads each batch dynamically, rounds padded lengths up to a
# multiple of 8, and returns PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [134]:
# Assemble the Trainer from the model, tokenized splits, arguments and
# collator defined in the cells above.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)

# Turn off the generation cache flag on the model config for training.
model.config.use_cache = False

# Replace `model.state_dict` with a bound function that returns only the
# PEFT adapter weights, computed from the original full state dict.
# NOTE(review): this changes what every later `model.state_dict()` call
# returns -- confirm the adapter file written by save_pretrained is
# non-empty with the installed PEFT version.
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(
        self, old_state_dict()
    )
).__get__(model, type(model))

# `model` is intentionally NOT passed through torch.compile here. The Trainer
# above already holds a reference to the uncompiled model, so compiling at
# this point never affected training; it only rebound `model` to a wrapper
# whose state-dict keys carry an extra prefix, which made later
# `model.load_state_dict(..., strict=False)` calls silently match nothing.

trainer.train()
model.save_pretrained(OUTPUT_DIR)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [16]:
# Authenticate against the Hugging Face Hub through the interactive notebook
# widget, so no token is written into the notebook source.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# Upload the model to the Hub repository "lizim/alpaca-ko-en-translation",
# using the stored login credentials (use_auth_token=True).
model.push_to_hub("lizim/alpaca-ko-en-translation", use_auth_token=True)

CommitInfo(commit_url='https://huggingface.co/lizim/alpaca-ko-en-translation/commit/ced4a9fe5db69eb997460b20e9e8db48632b5c04', commit_message='Upload model', commit_description='', oid='ced4a9fe5db69eb997460b20e9e8db48632b5c04', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
# Read a saved checkpoint file onto the CPU; the result is passed to
# `model.load_state_dict` in the next cell.
# SECURITY: torch.load unpickles the file and can execute arbitrary code --
# only point this at checkpoints you produced yourself.
# NOTE(review): the path is relative to the kernel's working directory.
lora = torch.load("../../checkpoint/pytorch_model.bin", map_location='cpu')

In [26]:
# If `model` was wrapped by torch.compile, load into the wrapped module: the
# wrapper itself expects "_orig_mod."-prefixed keys, so with strict=False it
# would accept the checkpoint and load nothing.
target_model = getattr(model, "_orig_mod", model)

# strict=False is needed because the checkpoint is expected to hold only a
# subset of the model's weights, so missing keys are normal. Unexpected
# keys, however, mean checkpoint entries found no matching parameter and
# were dropped -- surface that instead of discarding the result.
load_result = target_model.load_state_dict(lora, strict=False)
if load_result.unexpected_keys:
    raise RuntimeError(
        "Checkpoint contains keys that do not exist in the model and were "
        f"not loaded (showing up to 5): {load_result.unexpected_keys[:5]}"
    )

In [122]:
# Rebuild and re-tokenize the prompt for the first training row. The rows
# still carry their raw text fields because the earlier `map` call did not
# drop any columns.
prompt = generate_and_tokenize_prompt(train_data[0])

In [128]:
# Decode the prompt with its last 9 token ids removed.
# NOTE(review): 9 is specific to this example -- presumably it strips the
# reference question plus EOS so the text stops right after the
# "### Question to User:" heading; recompute it for any other row.
print(tokenizer.decode(prompt['input_ids'][:-9]))

Translate the Korean sentence in English. If there is a missing gender information, ask a question instead.
다음 한국어 문장을 영어로 번역하세요. 만약 성별 정보가 부족하다면, 대신 질문을 하세요.

### Source Sentence:
의사가 환자의 처방전을 쓰고 있어요.

### Question to User:



In [129]:
# Same truncated prompt as decoded above: drop the last 9 token ids.
prompt_ids = prompt['input_ids'][:-9]

# Batch of one sequence, moved to the GPU.
input_batch = torch.tensor([prompt_ids]).cuda()

# Sample up to 100 new tokens after the prompt.
output = model.generate(
    inputs=input_batch,
    max_new_tokens=100,
    do_sample=True,
)

# Decode the first (only) sequence, prompt included.
generated_ids = output.cpu().tolist()[0]
print(tokenizer.decode(generated_ids))

Translate the Korean sentence in English. If there is a missing gender information, ask a question instead.
다음 한국어 문장을 영어로 번역하세요. 만약 성별 정보가 부족하다면, 대신 질문을 하세요.

### Source Sentence:
의사가 환자의 처방전을 쓰고 있어요.

### Question to User:
사용자 이름과 국가와의 관련성이 없으므로, 사용자 이름을 대체하기 위해 다음 구를 대신 사용하세요.

Item: {
    discovery: {
    price: {
    $100}
}

### 응답: The doctor writes the prescription on the patient.<|endoftext|>


In [13]:
# Fetch the alpaca-lora repository, change into it, and pin the working tree
# to commit a48d947 so later steps run against a fixed revision.
!git clone https://github.com/tloen/alpaca-lora.git
%cd alpaca-lora
!git checkout a48d947

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Cloning into 'alpaca-lora'...
remote: Enumerating objects: 607, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 607 (delta 28), reused 33 (delta 19), pack-reused 556[K
Receiving objects: 100% (607/607), 27.78 MiB | 7.47 MiB/s, done.
Resolving deltas: 100% (360/360), done.
/home/daniel/AI620_2023spring/alpaca-lora
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: switching to 'a48d947'.

You are in 

In [17]:
# Copy generate.py from the parent directory into the current directory,
# replacing any file of the same name that is already there.
!cp ../generate.py .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Run generate.py with 8-bit loading, the KoAlpaca base model and the LoRA
# weights pushed to the Hub above; --share_gradio is passed through to the
# script.
!python generate.py \
    --load_8bit \
    --base_model 'beomi/KoAlpaca-Polyglot' \
    --lora_weights 'lizim/alpaca-ko-en-translation' \
    --share_gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/daniel/anaconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)
CUDA SETUP: CUDA runtime path found: /home/daniel/anaconda3/envs/llama/lib/libcudart.so
CUDA SETUP: Highest compute ca