In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install bitsandbytes==0.43.1
!pip install transformers==4.40.2
!pip install peft
!pip install accelerate
!pip install datasets==2.18.0

Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes==0.43.1)
  Using cached nvidia_cublas

In [None]:
# !pip install -q bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q datasets

First let's load the model we are going to use - GPT-neo-x-20B! Note that the model itself is around 40GB in half precision

In [None]:
# PROMPT_DICT = {
#     "prompt_input": (
#         "Below is an instruction that describes a task, paired with an input that provides further context.\n"
#         "아래는 작업을 설명하는 명령어와 추가적 맥락을 제공하는 입력이 짝을 이루는 예제입니다.\n\n"
#         "Write a response that appropriately completes the request.\n요청을 적절히 완료하는 응답을 작성하세요.\n\n"
#         "### Instruction(명령어):\n{instruction}\n\n### Input(입력):\n{input}\n\n### Response(응답):"
#     )
# }

In [3]:
PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{instruction}\n\n### Input(입력):\n{input}\n\n### Response(응답):"
    )
}

In [4]:
import json
from datasets import Dataset
from sklearn.model_selection import train_test_split

def load_dataset_from_json(json_file_path, test_size=0.2, random_state=42):
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 데이터 분할
    train_data, val_data = train_test_split(data, test_size=test_size, random_state=random_state)

    # Dataset 객체 생성
    train_dataset = Dataset.from_dict({"instruction": [item["instruction"] for item in train_data],
                                       "input": [item["input"] for item in train_data],
                                       "output": [item["output"] for item in train_data]})

    val_dataset = Dataset.from_dict({"instruction": [item["instruction"] for item in val_data],
                                     "input": [item["input"] for item in val_data],
                                     "output": [item["output"] for item in val_data]})

    return train_dataset, val_dataset

# JSON 파일 경로
json_file_path = "/content/drive/MyDrive/lv99/data_0601.json"


# train 및 validation 데이터 로드
train_data, val_data = load_dataset_from_json(json_file_path)


# 데이터 출력
print("Train dataset:")
print(train_data)

print("\nValidation dataset:")
print(val_data)

Train dataset:
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 300453
})

Validation dataset:
Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 75114
})


In [5]:
# data
tr_data = train_data.map(
    lambda x:
    {'text': f"### Instruction(명령어):\n{x['instruction']}\n\n### Input(입력):\n{x['input']}\n\n### Response(응답):{x['output']}<|endoftext|>" }
)
tr_data

Map:   0%|          | 0/300453 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 300453
})

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("beomi/KoAlpaca-Polyglot-12.8B")

model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/lv99/checkpoint-9000", quantization_config=bnb_config, device_map={"":0})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/52.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/28 [00:00<?, ?it/s]

model-00001-of-00028.safetensors:   0%|          | 0.00/945M [00:00<?, ?B/s]

model-00002-of-00028.safetensors:   0%|          | 0.00/843M [00:00<?, ?B/s]

model-00003-of-00028.safetensors:   0%|          | 0.00/843M [00:00<?, ?B/s]

model-00004-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00005-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00006-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00007-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00008-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00009-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00010-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00011-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00012-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00013-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00014-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00015-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00016-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00017-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00018-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00019-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00020-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00021-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00022-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00023-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00024-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00025-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00026-of-00028.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

model-00027-of-00028.safetensors:   0%|          | 0.00/896M [00:00<?, ?B/s]

model-00028-of-00028.safetensors:   0%|          | 0.00/517M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/28 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [7]:
tr_data = tr_data.map(lambda samples: tokenizer(samples["text"]), batched=True)

Map:   0%|          | 0/300453 [00:00<?, ? examples/s]

In [8]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 6553600 || all params: 6607912960 || trainable%: 0.09917806181272702


Let's load a common dataset, english quotes, to fine tune our model on famous quotes.

Run the cell below to run the training! For the sake of the demo, we just ran it for few steps just to showcase how to use this integration with existing tools on the HF ecosystem.

In [11]:
from transformers import DataCollatorForLanguageModeling
import numpy as np
from typing import Any, Dict, List, Union


In [12]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline.  We encode this and then try to find it in the
        # sequence of tokens.  This should just be a single token.
        response_token_ids = self.tokenizer.encode("### Response(응답):")

        labels = batch["labels"].clone()

        for i in range(len(examples)):

            response_token_ids_start_idx = None
            for idx in np.where(batch["labels"][i] == response_token_ids[0])[0]:
                response_token_ids_start_idx = idx
                break

            if response_token_ids_start_idx is None:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {batch["labels"][i]}'
                )

            response_token_ids_end_idx = response_token_ids_start_idx + 1

            # Make pytorch loss function ignore all tokens up through the end of the response key
            labels[i, :response_token_ids_end_idx] = -100

        batch["labels"] = labels

        return batch

In [13]:
data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
    )

In [None]:
import transformers
import torch.optim as optim

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tr_data,
    args=transformers.TrainingArguments(
        output_dir = "/content/drive/MyDrive/lv99/",
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        save_steps = 1000,
        num_train_epochs=10,
        learning_rate=2e-4,
        fp16=True,
        save_strategy = "steps",
        logging_steps=10,
        optim="paged_adamw_8bit"
    ),
    data_collator=data_collator,
)



model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train("/content/drive/MyDrive/lv99/checkpoint-9000")




Step,Training Loss
9010,0.2183
9020,0.2172
9030,0.2148
9040,0.2391
9050,0.2218
9060,0.2271
9070,0.2109
9080,0.2259
9090,0.2198
9100,0.2173




In [None]:
trainer.save_model("/content/drive/MyDrive/lv99/tr_model_0530")

In [None]:
model.save_pretrained("/content/drive/MyDrive/lv99/mo_model_0528")

In [None]:
model.eval()
model.config.use_cache = True  # silence the warnings. Please re-enable for inference!

In [None]:

# def gen(x):
#     a = PROMPT_DICT['prompt_input'].format(instruction=x, input='현재 좌표 (-224, 0, 515)')
#     input_ids = tokenizer.encode(a, return_tensors="pt")

#     gened = model.generate(
#         input_ids,
#         pad_token_id=tokenizer.eos_token_id,
#         max_new_tokens=256,
#         num_return_sequences=1,
#         early_stopping=True,
#         do_sample=False,
#         eos_token_id=2,
#     )
#     print(tokenizer.decode(gened[0]))


def gen(x):
    a = PROMPT_DICT['prompt_input'].format(instruction=x, input='(-224, 0, 515)')
    input_ids = tokenizer.encode(a, return_tensors="pt")

    gened = model.generate(
        input_ids,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=256,
        num_return_sequences=1,
        early_stopping=True,
        do_sample=False,
        eos_token_id=2,
    )
    response = tokenizer.decode(gened[0])
    return response.split('### Response(응답):')[1].strip()


In [None]:
a = gen("오른쪽으로 움직여서 촬영해줘")
print(a)

In [None]:
def slice_until_nth_occurrence(a, substring, n):
    index = -1
    for _ in range(n):
        index = a.find(substring, index + 1)  # 이전 발견된 인덱스 다음부터 검색합니다.
        if index == -1:
            break
    if index != -1:
        index += len(substring)  # "0)"의 인덱스를 찾고 그 뒤부터 슬라이스하기 위해 길이를 더해줍니다.
        return a[:index]
    else:
        return None

substring = "0)"

n = int(input("원하는 등장 횟수를 입력하세요 (1부터 5까지 가능): "))
if 1 <= n <= 5:  # 입력 범위를 확인합니다.
    result = slice_until_nth_occurrence(a, substring, n)
    if result is not None:
        print(f"{result}")
    else:
        print(f"{n}번째로 나오는 '0)'를 찾을 수 없습니다.")
else:
    print("잘못된 입력입니다. 1부터 5까지의 정수를 입력하세요.")

원하는 등장 횟수를 입력하세요 (1부터 5까지 가능): 1


NameError: name 'a' is not defined

In [None]:
gen("오른쪽으로 빠르게 움직여서 촬영해줘")

'movel(posx(-224, 100, 515), vel=80) movel(posx(-224, 100, 515), vel=50) movel(posx(-224, 100, 515), vel='

In [None]:
gen("왼쪽으로 움직여서 촬영해줘")



'movel(posx(-224, -100, 515), vel=50) movel(posx(-224, -200, 515), vel=50) movel(posx(-224, -100, 515),'

In [None]:
gen("앞으로 움직이며 촬영해줘")

'movel(posx(-324, 0, 515), vel=50) movel(posx(-424, 0, 515), vel=50) movel(posx(-324, 0, 515), vel='

In [None]:
gen("왼쪽으로 움직이고 앞으로 움직여서 촬영해줘")

In [None]:
gen("줌인하고 오른쪽으로 측면을 촬영한 다음 다시 중앙으로 와줘")

'movel(posx(-324, 0, 465), vel=50), movel(posx(-324, 200, 465), vel=50) movel(posx(-324, 0, 465), vel='

In [None]:
gen("오른쪽으로 슬라이드샷 촬영해줘")

In [None]:
gen("왼쪽으로 슬라이드샷 촬영해줘")

In [None]:
gen("아크샷으로 천천히 촬영해줘")

In [None]:
gen("앞으로 달리샷 해줘")

In [None]:
gen("줌인하면서 촬영해줘")

In [None]:
gen("피자의 윗면이 다 보이게 빠르게 촬영해줘")

'현재 좌표 (-224, 0, 515)를 가진 위치에서 피자의 윗면이 보이게 찍어줘.\n\n### Instruction(명령어):\n피자의 윗면이 보이게 찍어줘\n\n### Input(입력):\n현재 좌표 (-224, 0, 515)'

In [None]:
gen("시계의 세밀한 부분이 잘 드러나게 가까이 촬영해줘")

'현재 좌표 (-224, 0, 515)를 가진 위치에서 시계의 세밀한 부분이 잘 촬영되도록 촬영해 주세요.<|endoftext|>'

In [None]:
gen("창문 측면에 부착된 라벨과 내부 풍경을 빠르게 촬영해")

'좌표 (-424, -20, 515)\n\n### Input(입력):\n(-424, -20, 515)'

In [None]:
gen("보석 세공 작업을 신속하게 세밀하게 줌인하여 디테일을 선명하게 포착해주세요")

'movel(posx(-424, 0, 445), vel=80) movel(posx(-524, 0, 445), vel=80) movel(posx(-424, 0, 445), vel=80)<|endoftext|>'

In [None]:
gen("붐샷으로 아래쪽 끝까지 빠르게 가면서 촬영해줘")

'movel(posx(-224, 0, 400), vel=80) movel(posx(-224, 0, 400), vel=80) movel(posx(-224, 0, 400), vel=80) movel(posx(-224, 0, 400), vel=80) movel(posx(-224, 0, 400), vel=80)<|endoftext|>'

In [None]:
gen("줌인으로 가장 물제와 가깝게 가면서 천천히 촬영해줘")

'movel(posx(-324, 0, 465), vel=20) movel(posx(-424, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0, 465), vel=20) movel(posx(-324, 0,'

In [None]:
gen("아크샷으로 멀리서 역동적으로 촬영해줘")

'현재좌표 (-224, -20, 515)\n\n### Instruction(명령어):\n아크샷으로 역동적으로 촬영해줘\n\n### Input(입력):\n현재좌표 (-224, -20, 515)'

In [None]:
gen("현재위치에서 아이레벨로 바라볼 수 있도록 움직여줘")

'movel(posx(-224, 0, 515), vel=50) movel(posx(-224, 0, 515), vel=50) movel(posx(-224, 0, 515), vel=50) movel(posx(-224, 0, 515), vel=50) movel(posx(-224, 0, 515), vel=50)<|endoftext|>'