# Requirements

In [1]:
# Mount Google Drive inside the Colab runtime (force_remount=True asks for a fresh
# mount even when one already exists), then switch the working directory to the
# project folder on the Drive.
# NOTE(review): presumably the later `import utils` relies on this working
# directory to locate utils.ipynb - confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/MyDrive/jinwoo

Mounted at /content/drive
/content/drive/MyDrive/jinwoo


In [2]:
# Install the notebook's dependencies: transformers (built from the GitHub
# repository), import_ipynb, openai, sentencepiece, accelerate and bitsandbytes.
# NOTE(review): no versions are pinned here, so a rerun may resolve to different
# releases than the ones this notebook was written against.
!pip install git+https://github.com/huggingface/transformers
!pip install import_ipynb
!pip install openai
!pip install sentencepiece
!pip install accelerate
!pip install bitsandbytes

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-zthj_f0q
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-zthj_f0q
  Resolved https://github.com/huggingface/transformers to commit 888c4a2ae02fb4e6c4b53a4e6eb0b973a771362f
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-

# Import Libraries & Declare Constants

In [3]:
# Drive directory that holds the downloaded pre-trained models: /content/drive/MyDrive/jinwoo/huggingface/...
import transformers
from transformers import Trainer, GenerationConfig

import torch
from torch.utils.data import Dataset

import copy
import logging
import os

from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

import import_ipynb
import utils

import warnings
warnings.filterwarnings(action='ignore')

# Project root on the mounted Drive; every other location hangs off it.
ROOT_PATH = "/content/drive/MyDrive/jinwoo"
# Directory passed as the cache directory for model/tokenizer downloads.
MODEL_PATH = f"{ROOT_PATH}/model"
# JSON file with the instruction-tuning examples.
DATA_PATH = f"{ROOT_PATH}/dataset/ko_alpaca_data.json"
# Directory passed to the trainer as its output directory.
OUTPUT_PATH = f"{ROOT_PATH}/result"

# Identifier of the pre-trained checkpoint to load.
MODEL_NAME = "decapoda-research/llama-7b-hf"
OUTPUT_DIR = ROOT_PATH

# Label value used to mask positions out of the loss; -100 is the default
# ignore_index of torch.nn.CrossEntropyLoss.
IGNORE_INDEX = -100
# Padding token that is added to the tokenizer when it does not define one.
DEFAULT_PAD_TOKEN = "[PAD]"
# Special tokens of the LLaMA SentencePiece vocabulary. BOS and UNK must be
# "<s>" and "<unk>": using "</s>" for all three makes the tokenizer map the
# beginning-of-sequence and unknown tokens onto the end-of-sequence id.
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Prompt templates used to turn a dataset example into the text fed to the model.
# Placeholders are filled with str.format_map / str.format.
PROMPT_DICT = dict(
    # Template for examples that come with additional input data.
    prompt_input="".join(
        [
            "Below is an instruction that describes a task, paired with an input that provides further context. ",
            "Write a response that appropriately completes the request.\n\n",
            "### Instruction:\n{instruction}\n\n",
            "### Input:\n{input}\n\n",
            "### Response:",
        ]
    ),
    # Template for examples that have no additional input data.
    prompt_no_input="".join(
        [
            "Below is an instruction that describes a task. ",
            "Write a response that appropriately completes the request.\n\n",
            "### Instruction:\n{instruction}\n\n",
            "### Response:",
        ]
    ),
)

importing Jupyter notebook from utils.ipynb


# Argument Classes

In [4]:
# Parser arguments: which pre-trained model to load.
@dataclass
class ModelArguments:
    """Model selection options.

    Attributes:
        model_name_or_path: Name or path of the pre-trained checkpoint.
    """

    model_name_or_path: Optional[str] = "facebook/opt-125m"

# Parser arguments: where the training data lives.
@dataclass
class DataArguments:
    """Dataset options.

    Attributes:
        data_path: Path to the training data file. Defaults to ``None``, so the
            annotation is ``Optional[str]`` rather than plain ``str``.
    """

    data_path: Optional[str] = field(default=None, metadata={"help": "Path to the training data."})

# Parser arguments: options consumed during training.
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """Training options: ``transformers.TrainingArguments`` with notebook-specific defaults.

    Every field below carries the default this notebook wants; ``cache_dir`` is
    later handed to ``from_pretrained`` and ``model_max_length`` to the tokenizer.
    """

    optim: str = "adamw_torch"
    num_train_epochs: float = 1
    save_strategy: str = "epoch"
    dataloader_num_workers: int = 8
    cache_dir: Optional[str] = None
    auto_find_batch_size: bool = True
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    

# Tokenizer Functions

In [5]:
# Save - model's states
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collect the model's state dict and write it to ``output_dir``.

    Args:
        trainer: Trainer whose ``model`` weights are saved.
        output_dir: Destination directory handed to ``trainer._save``.

    The state dict is always collected, but it is only written when
    ``trainer.args.should_save`` is true.
    """
    # Grab the weights as a name -> tensor mapping.
    weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    # Copy every tensor to the CPU before saving.
    cpu_weights = {}
    for name, tensor in weights.items():
        cpu_weights[name] = tensor.cpu()
    del weights

    # Persist the CPU copy of the weights.
    trainer._save(output_dir, state_dict=cpu_weights)  # noqa

In [6]:
# Resize - Tokenizer & Embedding
def smart_tokenizer_and_embedding_resize(special_tokens_dict: Dict, tokenizer: transformers.PreTrainedTokenizer,
                                         model: transformers.PreTrainedModel):
    """Add special tokens to the tokenizer and grow the model's embeddings to match.

    Rows created for newly added tokens are initialised with the mean of the
    pre-existing rows, for both the input and the output embedding matrices.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.

    Args:
        special_tokens_dict: Mapping passed to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer that receives the special tokens.
        model: Model whose token embeddings are resized to ``len(tokenizer)``.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Nothing was added, so there are no fresh rows to initialise.
    if added <= 0:
        return

    embedding_tables = (
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    )
    # Average over the rows that existed before the new tokens were appended.
    mean_rows = [table[:-added].mean(dim=0, keepdim=True) for table in embedding_tables]

    for table, mean_row in zip(embedding_tables, mean_rows):
        table[-added:] = mean_row

In [7]:
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string on its own and report ids plus non-pad lengths.

    Args:
        strings: Texts to tokenize, one tokenizer call per text.
        tokenizer: Tokenizer to apply; texts are truncated to ``tokenizer.model_max_length``.

    Returns:
        Dict with ``input_ids``/``labels`` (the same list of 1-D id tensors) and
        ``input_ids_lens``/``labels_lens`` (the same list of non-pad token counts).
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    # Each encoding holds a single sequence; take its first row.
    token_ids = [encoding.input_ids[0] for encoding in encodings]
    # Count the tokens that are not padding.
    lengths = [encoding.input_ids.ne(tokenizer.pad_token_id).sum().item() for encoding in encodings]

    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": lengths,
        "labels_lens": lengths,
    }

In [8]:
def preprocess(sources: Sequence[str], targets: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize prompt+target pairs and mask the prompt part of the labels.

    Args:
        sources: Prompt texts.
        targets: Target texts, aligned with ``sources``.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        Dict with ``input_ids`` (token ids of prompt + target) and ``labels``
        (a deep copy in which the leading prompt tokens are set to ``IGNORE_INDEX``).
    """
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]

    full_encoded = _tokenize_fn(full_texts, tokenizer)
    prompt_encoded = _tokenize_fn(sources, tokenizer)

    input_ids = full_encoded["input_ids"]
    labels = copy.deepcopy(input_ids)

    # Blank out the first `prompt_len` positions of each label tensor.
    prompt_lengths = prompt_encoded["input_ids_lens"]
    for label_row, prompt_len in zip(labels, prompt_lengths):
        label_row[:prompt_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}

# Dataset Classes & Functions

In [9]:
class SupervisedDataset(Dataset):
    """Dataset of tokenized (prompt + response) examples for supervised fine-tuning."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        """Load the examples at ``data_path``, render prompts and tokenize everything.

        Args:
            data_path: File read through ``utils.jload``.
            tokenizer: Tokenizer used for the examples; its ``eos_token`` is
                appended to every target.
        """
        super().__init__()

        # Read the raw examples from the data file.
        logging.warning("Loading data...")
        records = utils.jload(data_path)

        # Render one prompt per example.
        logging.warning("Formatting inputs...")
        with_input = PROMPT_DICT["prompt_input"]
        without_input = PROMPT_DICT["prompt_no_input"]

        sources = []
        for record in records:
            # Examples with a non-empty "input" use the template that has an input section.
            template = with_input if record.get("input", "") != "" else without_input
            sources.append(template.format_map(record))
        targets = [f"{record['output']}{tokenizer.eos_token}" for record in records]

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` and ``labels`` of example ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}

In [10]:
@dataclass
class DataCollatorForSupervisedDataset:
    """Batch examples for supervised fine-tuning by padding them to a common length."""

    # Tokenizer that supplies the padding token id.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad ``input_ids`` and ``labels`` and build the matching attention mask.

        Args:
            instances: Examples, each with ``input_ids`` and ``labels`` tensors.

        Returns:
            Dict with batch-first ``input_ids`` (padded with the pad token id),
            ``labels`` (padded with ``IGNORE_INDEX``) and ``attention_mask``
            (true wherever ``input_ids`` differs from the pad token id).
        """
        pad_sequence = torch.nn.utils.rnn.pad_sequence

        id_rows = [instance["input_ids"] for instance in instances]
        label_rows = [instance["labels"] for instance in instances]

        padded_ids = pad_sequence(id_rows, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        padded_labels = pad_sequence(label_rows, batch_first=True, padding_value=IGNORE_INDEX)

        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }

In [11]:
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Build the training dataset and collator for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer shared by the dataset and the collator.
        data_args: Object whose ``data_path`` attribute names the training file.

    Returns:
        Dict with ``train_dataset``, ``eval_dataset`` (always ``None``) and
        ``data_collator``.
    """
    dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }

# Loading Model & Tokenizer

In [None]:
# Build the argument parser for the three argument dataclasses defined above.
parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
# Parse an explicit argument list (instead of the process command line) into
# the (model, data, training) argument objects.
model_args, data_args, training_args = parser.parse_args_into_dataclasses(['--model_name_or_path', MODEL_NAME,
                                                                               '--output_dir', OUTPUT_PATH, 
                                                                               '--data_path', DATA_PATH,
                                                                               '--cache_dir', MODEL_PATH])
    
# Load the pre-trained causal language model, requesting float16 tensors,
# 8-bit loading and automatic device placement.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map="auto",
)

# Load the matching tokenizer (slow implementation, right-side padding,
# maximum length taken from the training arguments).
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=False,
)

# If the tokenizer has no padding token, add DEFAULT_PAD_TOKEN and resize the
# model's embeddings accordingly.
if tokenizer.pad_token is None:
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
        tokenizer=tokenizer,
        model=model,
    )

# For LLaMA checkpoints, set the EOS/BOS/UNK tokens explicitly from the
# DEFAULT_* constants.
if "llama" in model_args.model_name_or_path:
    tokenizer.add_special_tokens(
        {
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
        }
    )

# Generation settings used by evaluate().
# NOTE(review): temperature and top_p normally only take effect when sampling
# is enabled (do_sample=True), which is not set here - confirm the intent.
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    num_beams=4,
)

print(f'DONE: loading Model and Tokenizer - Llama')


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

# Train & Test Functions


In [None]:
def train():
    """Fine-tune the global ``model`` on the configured dataset and save the result.

    Uses the notebook-level ``tokenizer``, ``data_args``, ``training_args`` and
    ``model``. After training, the trainer state is saved and the model weights
    are written to ``training_args.output_dir``.
    """
    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=data_module["train_dataset"],
        eval_dataset=data_module["eval_dataset"],
        data_collator=data_module["data_collator"],
    )

    trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)

def generate_prompt(instruction, input=None):
    """Render the inference prompt for an instruction and optional input.

    The prompt is built from the same ``PROMPT_DICT`` templates that
    ``SupervisedDataset`` uses for training, so the text seen at inference
    matches the training format exactly. The previous indented triple-quoted
    f-strings injected leading spaces and different line breaks into the prompt.

    Args:
        instruction: Task description inserted into the template.
        input: Optional extra context; when falsy (``None`` or ``""``) the
            template without an input section is used.

    Returns:
        The formatted prompt, ending with ``"### Response:"``.
    """
    if input:
        return PROMPT_DICT["prompt_input"].format(instruction=instruction, input=input)
    return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)

def evaluate(instruction, input=None):
    """Generate and print the model's response to an instruction.

    Uses the notebook-level ``tokenizer``, ``model`` and ``generation_config``.

    Args:
        instruction: Task description passed to ``generate_prompt``.
        input: Optional extra context passed to ``generate_prompt``.

    Prints one ``Response: ...`` line per generated sequence.
    """
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")

    # Send the tensors to wherever the model lives instead of assuming CUDA.
    device = model.device
    input_ids = inputs["input_ids"].to(device)
    # Pass the attention mask explicitly so generate() does not have to infer it.
    attention_mask = inputs["attention_mask"].to(device)

    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )

    marker = "### Response:"
    for s in generation_output.sequences:
        # Drop special tokens (BOS/EOS/PAD) so they do not leak into the printed text.
        output = tokenizer.decode(s, skip_special_tokens=True)
        parts = output.split(marker)
        # Fall back to the whole text if the marker is missing (e.g. truncated prompt).
        response = parts[1] if len(parts) > 1 else output
        print("Response:", response.strip())

# main

In [None]:

if __name__ == "__main__":
    # Training is currently disabled; uncomment the next line to fine-tune first.
    #train()
    # Read one instruction from standard input and print the model's response.
    evaluate(input("Instruction: "))