# Drive Mount & Dependencies

In [1]:
# Mount Google Drive at /content/drive (forcing a remount if it is already
# mounted), then change the working directory to the project folder on Drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

%cd /content/drive/MyDrive/jinwoo

Mounted at /content/drive
/content/drive/MyDrive/jinwoo


In [2]:
# Install the libraries this notebook depends on.
# transformers and peft are installed straight from their GitHub repositories
# with no revision given, and none of the PyPI packages below is version-pinned.
# NOTE(review): consider pinning versions so a re-run installs the same code.
!pip install git+https://github.com/huggingface/transformers
!pip install git+https://github.com/huggingface/peft.git
!pip install sentencepiece
!pip install accelerate
!pip install bitsandbytes
!pip install langchain
!pip install appdirs
!pip install gradio
!pip install datasets
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-tsni9567
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-tsni9567
  Resolved https://github.com/huggingface/transformers to commit f67dac97bdc63874f2288546b3fa87e69d2ea1c8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-rxd4zo2l
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-rxd4

# wandb

In [3]:
import wandb

# Start the Weights & Biases run and attach the hyperparameters to it.
# The run name and config are passed to wandb.init() so they are recorded on
# the run itself; assigning a dict to `wandb.config` would only rebind the
# module attribute to a plain dict.
# gradient_accumulation_steps is 8 here because that is the value handed to
# TrainingArguments further down; the logged config should describe the run
# that actually happens.
wandb.init(
    project='odego-llama7b',
    name='run-1',
    config={
        "learning_rate": 2e-5,
        "num_train_epochs": 3,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 8,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33modego-lab[0m ([33mkmou-odego[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Imports & Constants

In [4]:
from transformers import Trainer, GenerationConfig
from transformers import LlamaTokenizer, LlamaForCausalLM

from typing import Optional, Dict, Sequence
from torch.utils.data import Dataset

import transformers
import torch
import os
import sys
import pandas as pd
from datasets import load_dataset
import json
import copy

import warnings
warnings.filterwarnings(action='ignore')

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Value written into label positions that must not contribute to the loss
# (prompt tokens and label padding). -100 is the default `ignore_index` of
# torch.nn.CrossEntropyLoss.
IGNORE_INDEX = -100

# Special tokens registered on the tokenizer before training.
# BOS, EOS and UNK must be three distinct tokens: LLaMA's sentencepiece
# vocabulary uses "<s>" for BOS, "</s>" for EOS and "<unk>" for UNK.
# Mapping all three to "</s>" would make every sequence start with the
# end-of-sequence token.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Bilingual (English / Korean) prompt templates.
#   prompt_input    - has {instruction} and {input} placeholders
#   prompt_no_input - has only the {instruction} placeholder
# Both end with the "### Response(응답):" header.
PROMPT_DICT = dict(
    prompt_input="".join([
        "Below is an instruction that describes a task, paired with an input that provides further context.\n",
        "아래는 작업을 설명하는 명령어와 추가적 맥락을 제공하는 입력이 짝을 이루는 예제입니다.\n\n",
        "Write a response that appropriately completes the request.\n",
        "요청을 적절히 완료하는 응답을 작성하세요.\n\n",
        "### Instruction(명령어):\n{instruction}\n\n",
        "### Input(입력):\n{input}\n\n",
        "### Response(응답):",
    ]),
    prompt_no_input="".join([
        "Below is an instruction that describes a task.\n",
        "아래는 작업을 설명하는 명령어입니다.\n\n",
        "Write a response that appropriately completes the request.\n",
        "명령어에 따른 요청을 적절히 완료하는 응답을 작성하세요.\n\n",
        "### Instruction(명령어):\n{instruction}\n\n",
        "### Response(응답):",
    ]),
)

In [12]:
# Project root on the mounted Drive. Every other path below is derived from it
# so the project can be relocated by editing a single line.
ROOT_PATH = "/content/drive/MyDrive/jinwoo"

# Hugging Face Hub id of the base checkpoint to fine-tune.
BASE_MODEL_NAME = "decapoda-research/llama-7b-hf"

DATA_PATH = os.path.join(ROOT_PATH, "dataset", "ko_alpaca_data.json")  # training data (JSON)
CACHE_MODEL_PATH = os.path.join(ROOT_PATH, "model")                    # download cache for the base model
OUTPUT_PATH = os.path.join(ROOT_PATH, "odego")                         # checkpoints and final weights

# Hyperparameters handed to the Hugging Face Trainer.
# Each device processes 4 sequences per step and accumulates 8 steps, i.e.
# 32 sequences per optimizer update per device.
# NOTE(review): bf16=True is requested here while the model cell loads the
# weights with torch_dtype=torch.float16 and load_in_8bit=True - confirm the
# target GPU supports bf16 and that this combination is intended.
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=True,
    logging_steps=1,
    optim="adamw_torch",
    evaluation_strategy="no",   # no eval dataset is supplied to the Trainer
    save_strategy="steps",
    save_steps=500,
    weight_decay=0.,
    warmup_ratio=0.03,
    output_dir=OUTPUT_PATH,
    save_total_limit=2,
    report_to="wandb"
)

# Functions

In [6]:
import io

def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Write ``obj`` to ``f`` as JSON (dict / list) or as raw text (str).

    Args:
        obj: A dict or list to serialise with ``json.dump``, or a str that is
            written verbatim.
        f: A path or an open file object. The file is closed before returning,
            including when a file object was passed in.
        mode: Mode used when ``f`` is a path.
        indent: Indentation passed to ``json.dump``.
        default: Fallback serialiser passed to ``json.dump``.

    Raises:
        ValueError: If ``obj`` is not a dict, list or str. The check runs
            before the destination is opened, so an invalid call neither
            creates nor truncates the target file.
    """
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")

    # The context manager closes the handle even if serialisation fails.
    with _make_w_io_base(f, mode) as handle:
        if isinstance(obj, str):
            handle.write(obj)
        else:
            json.dump(obj, handle, indent=indent, default=default)

def jload(f, mode="r"):
    """Parse JSON from ``f`` and return the decoded object.

    Args:
        f: A path or an open file object. The file is closed before returning,
            including when a file object was passed in.
        mode: Mode used when ``f`` is a path.

    Returns:
        Whatever ``json.load`` decodes - a dict, a list, or another JSON value.
    """
    # The context manager closes the handle even if parsing fails.
    with _make_r_io_base(f, mode) as handle:
        return json.load(handle)

In [7]:
# save model states
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collect the model's state dict and write it to ``output_dir``.

    The state dict is always collected from ``trainer.model``. It is copied to
    the CPU and handed to ``trainer._save`` only when
    ``trainer.args.should_save`` is true.
    """
    # Collected unconditionally, before the should_save check.
    weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    # Move every tensor to the CPU before saving.
    host_weights = {}
    for name, tensor in weights.items():
        host_weights[name] = tensor.cpu()
    del weights

    trainer._save(output_dir, state_dict=host_weights)

# Resize - Tokenizer & Embedding
def smart_tokenizer_and_embedding_resize(special_tokens_dict: Dict,
                                         tokenizer: transformers.PreTrainedTokenizer,
                                         model: transformers.PreTrainedModel):
    """Add special tokens to the tokenizer and resize the model to match.

    After ``tokenizer.add_special_tokens`` the model's token embeddings are
    resized to ``len(tokenizer)``. If any tokens were added, their rows in
    both the input and the output embedding matrices are set to the mean of
    the rows that precede them.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    # Adding tokens changes the vocabulary size, so resize to the new length.
    model.resize_token_embeddings(len(tokenizer))

    if not (added > 0):
        return

    in_weights = model.get_input_embeddings().weight.data
    out_weights = model.get_output_embeddings().weight.data

    # Mean over all rows except the trailing `added` ones.
    in_mean = in_weights[:-added].mean(dim=0, keepdim=True)
    out_mean = out_weights[:-added].mean(dim=0, keepdim=True)

    in_weights[-added:] = in_mean
    out_weights[-added:] = out_mean

def _tokenize_fn(strings: Sequence[str],
                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string on its own and report the non-padding lengths.

    Every string is encoded in a separate tokenizer call, truncated to
    ``tokenizer.model_max_length``.

    Returns:
        A dict with ``input_ids`` and ``labels`` (the same list object: the
        first row of each encoding) and ``input_ids_lens`` and ``labels_lens``
        (the same list object: per string, the count of ids that differ from
        ``tokenizer.pad_token_id``).
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    token_ids = [enc.input_ids[0] for enc in encodings]
    lengths = [
        enc.input_ids.ne(tokenizer.pad_token_id).sum().item() for enc in encodings
    ]

    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": lengths,
        "labels_lens": lengths,
    }

def preprocess(sources: Sequence[str],
               targets: Sequence[str],
               tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize prompt+target pairs and mask the prompt part of the labels.

    Returns:
        A dict with ``input_ids`` (the tokenized ``source + target`` strings)
        and ``labels`` (a deep copy of ``input_ids`` whose leading positions,
        as many as the tokenized source is long, are set to ``IGNORE_INDEX``).
    """
    # Concatenate each source (prompt) with its target.
    merged = [src + tgt for src, tgt in zip(sources, targets)]

    # Tokenize the full examples first, then the prompts alone.
    examples_tokenized = _tokenize_fn(merged, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    prompt_lengths = sources_tokenized["input_ids_lens"]
    for row, prompt_len in zip(labels, prompt_lengths):
        row[:prompt_len] = IGNORE_INDEX

    return {"input_ids": input_ids, "labels": labels}

In [20]:
class SupervisedDataset(Dataset):
    """Dataset of tokenized prompt/response pairs loaded from a JSON file."""

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        """Load ``data_path``, build the prompts and tokenize everything up front.

        Each record is formatted with the "prompt_input" template when its
        "input" field is present and non-empty, otherwise with
        "prompt_no_input". The target is the record's "output" followed by
        the tokenizer's EOS token.
        """
        super().__init__()
        # Read the records from the .json file at data_path.
        records = jload(data_path)

        with_input_template = PROMPT_DICT["prompt_input"]
        no_input_template = PROMPT_DICT["prompt_no_input"]

        # Build one prompt per record, choosing the template by whether the
        # record carries an input.
        sources = []
        for example in records:
            if example.get("input", "") != "":
                template = with_input_template
            else:
                template = no_input_template
            sources.append(template.format_map(example))

        # The targets (labels): expected output plus the EOS token.
        targets = [f"{example['output']}{tokenizer.eos_token}" for example in records]

        # Tokenize and mask.
        tokenized = preprocess(sources, targets, tokenizer)

        # input_ids: model inputs, labels: training targets.
        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` and ``labels`` stored at index ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}

from dataclasses import dataclass      
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Pad a list of examples into one batch.

    ``input_ids`` are padded with the tokenizer's pad token id and ``labels``
    with ``IGNORE_INDEX``. The attention mask is true wherever the padded
    ``input_ids`` differ from the pad token id.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        pad_id = self.tokenizer.pad_token_id
        id_seqs = [item["input_ids"] for item in instances]
        label_seqs = [item["labels"] for item in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(
            id_seqs, batch_first=True, padding_value=pad_id)
        padded_labels = torch.nn.utils.rnn.pad_sequence(
            label_seqs, batch_first=True, padding_value=IGNORE_INDEX)

        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(pad_id),
        }

def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_path=DATA_PATH) -> Dict:
    """Build the dataset and collator keyword arguments for ``Trainer``.

    Returns:
        A dict with ``train_dataset`` (a ``SupervisedDataset`` built from
        ``data_path``), ``eval_dataset`` (always ``None``) and
        ``data_collator``.
    """
    dataset = SupervisedDataset(data_path=data_path, tokenizer=tokenizer)
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }

# Train

In [None]:
# Load the base model, caching the download under CACHE_MODEL_PATH.
# NOTE(review): the weights are loaded in 8-bit (load_in_8bit=True) with
# torch_dtype=torch.float16, and this notebook attaches no PEFT adapter before
# handing the model to Trainer - confirm that full fine-tuning of an 8-bit
# model is supported and intended here.
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    cache_dir=CACHE_MODEL_PATH,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    device_map="auto",
)

# Load the matching slow (non-fast) tokenizer: sequences are capped at 512
# tokens and padding is applied on the right.
tokenizer = LlamaTokenizer.from_pretrained(
    BASE_MODEL_NAME,
    cache_dir=CACHE_MODEL_PATH,
    model_max_length=512,
    padding_side="right",
    use_fast=False,
)

In [23]:
def train(model, tokenizer):
    """Fine-tune ``model`` on the dataset at ``DATA_PATH`` and save the result.

    Steps:
      1. If the tokenizer has no pad token, add ``DEFAULT_PAD_TOKEN`` and
         resize the model's embeddings.
      2. Register the default EOS / BOS / UNK tokens on the tokenizer.
      3. Build the dataset and collator, then run ``Trainer.train``.
      4. Save the trainer state and write the model weights to ``OUTPUT_PATH``.
    """
    if tokenizer.pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict={"pad_token": DEFAULT_PAD_TOKEN},
            tokenizer=tokenizer,
            model=model,
        )

    core_special_tokens = {
        "eos_token": DEFAULT_EOS_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
    tokenizer.add_special_tokens(core_special_tokens)

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_path=DATA_PATH)
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_arguments,
        **data_module,
    )
    trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=OUTPUT_PATH)

# main

In [None]:
# Run fine-tuning with the model and tokenizer loaded in the cell above.
train(model, tokenizer)

# Upload to Hugging Face Hub

In [None]:
# Create the repository & upload the model.
# The access token is read from the first line of hf_token.txt. The file is
# closed as soon as the token is read, and surrounding whitespace is stripped:
# readline() keeps the trailing newline, which must not be sent as part of the
# token.
with open(ROOT_PATH+'/hf_token.txt', 'r') as token_file:
    AUTH_TOKEN = token_file.readline().strip()
REPO_NAME = "odego-llama-7b"

# Upload to Huggingface Hub
# NOTE(review): newer transformers releases take `token=` instead of
# `use_auth_token=` - confirm against the installed version.
model.push_to_hub(
    REPO_NAME,
    use_temp_dir=True,
    use_auth_token=AUTH_TOKEN
)