In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import TextStreamer
from trl import SFTTrainer

from torch.utils.data import Dataset, DataLoader
from accelerate import Accelerator
import argparse, logging, os
from tqdm import tqdm

# LoRA modules
from peft.mapping import get_peft_model
from peft.tuners.lora import LoraConfig
from peft.utils.peft_types import TaskType

In [5]:
# logger initialization (TODO: this cell is still empty; configure logging here)

In [6]:
# dataset class
class MyDataset(Dataset):
    """Placeholder dataset bound to a dataset directory.

    The actual loading logic has not been written yet. Until it is, the
    container methods fail loudly instead of returning ``None`` (which made
    ``len(dataset)`` and ``dataset[i]`` die with unrelated ``TypeError``s).

    Args:
        dataset_dir: Location of the data this dataset should read from.
    """

    def __init__(self, dataset_dir):
        # Remember the location so the loader, once implemented, can use it.
        self.dataset_dir = dataset_dir

    def __len__(self):
        """Return the number of examples (not implemented yet)."""
        raise NotImplementedError(
            "MyDataset.__len__ is not implemented: load the examples from "
            f"{self.dataset_dir!r} first."
        )

    def __getitem__(self, index):
        """Return the example at ``index`` (not implemented yet).

        The ``index`` parameter is required by the indexing protocol; without
        it ``dataset[i]`` cannot even be dispatched to this method.
        """
        raise NotImplementedError(
            "MyDataset.__getitem__ is not implemented: load the examples from "
            f"{self.dataset_dir!r} first."
        )


class FormattingFunction:
    """Render question/answer pairs into chat-formatted training text.

    The template uses the Llama-3 style header tokens
    (``<|start_header_id|>role<|end_header_id|>`` followed by a blank line,
    each turn closed with ``<|eot_id|>``). It has two positional ``{}`` slots:
    the user question and the assistant answer.
    """

    def __init__(self):
        # Built from adjacent literals instead of an indented triple-quoted
        # string: the latter embedded a leading newline plus the source-code
        # indentation in front of every line of the prompt.
        # NOTE(review): "어시트턴트" looks like a typo for "어시스턴트"; the
        # wording is left untouched so it keeps matching any prompt used at
        # inference time. Confirm before changing it.
        # NOTE(review): some tokenizers prepend <|begin_of_text|> themselves;
        # verify the tokenized text does not start with two of them.
        self.instruction = (
            "<|begin_of_text|>"
            "<|start_header_id|>system<|end_header_id|>\n\n"
            "당신의 역할은 한국어로 답변하는 **한국어 AI 어시트턴트**입니다. 주어진 질문에 대해 한국어로 답변해주세요.<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            "아래 질문을 한국어로 정확하게 답변해주세요. **질문**: {}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n{}<|eot_id|>"
            "<|end_of_text|>"
        )

    def __call__(self, examples):
        """Format one example or a batch of examples.

        Args:
            examples: Mapping with ``"input"`` and ``"output"`` entries. Each
                entry is either a sequence of strings (a batch) or a single
                string (one example).

        Returns:
            A list of formatted strings for a batch, or a single formatted
            string when the entries are plain strings.

        Raises:
            ValueError: If the batch has a different number of inputs and
                outputs.
        """
        questions = examples["input"]
        answers = examples["output"]

        # A single (un-batched) example: iterating over it would walk the
        # string character by character, so format it directly.
        if isinstance(questions, str):
            return self.instruction.format(questions, answers)

        if len(questions) != len(answers):
            raise ValueError(
                f"Mismatched batch: {len(questions)} inputs vs {len(answers)} outputs."
            )

        # No progress bar here: this callable is invoked repeatedly by the
        # trainer's preprocessing, so a bar per call only floods the output.
        return [
            self.instruction.format(question, answer)
            for question, answer in zip(questions, answers)
        ]

In [7]:
# model class
class ModelInitiator:
    """Load a causal language model and its tokenizer, with padding set up.

    Args:
        model_checkpoint: Name or path handed to
            ``AutoModelForCausalLM.from_pretrained``.
        tokenizer_checkpoint: Name or path handed to
            ``AutoTokenizer.from_pretrained``. When omitted (``None`` or an
            empty string) the model checkpoint is used for the tokenizer too.
    """

    def __init__(self, model_checkpoint, tokenizer_checkpoint=None):
        self.model_checkpoint = model_checkpoint
        self.tokenizer_checkpoint = tokenizer_checkpoint or model_checkpoint

    def __call__(self):
        """Instantiate the model and tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to
            have a padding token and the model's embedding matrix covers the
            tokenizer's vocabulary.
        """
        # NOTE(review): trust_remote_code=True executes Python shipped with the
        # checkpoint; only use checkpoints from sources you trust.
        # NOTE(review): flash_attention_2 is documented for fp16/bf16 weights,
        # but no dtype is requested here. Confirm the load dtype is what you want.
        model = AutoModelForCausalLM.from_pretrained(
            self.model_checkpoint,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation="flash_attention_2",
        )
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_checkpoint)

        # Only introduce a dedicated pad token when the tokenizer has none;
        # an existing pad token (and its trained embedding) is kept as-is.
        # (The former `tokenizer.pad_token = tokenizer.pad_token` was a no-op.)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({"pad_token": "<pad>"})

        # Grow the embedding matrix only if the vocabulary outgrew it.
        embedding_rows = model.get_input_embeddings().weight.shape[0]
        if len(tokenizer) > embedding_rows:
            model.resize_token_embeddings(len(tokenizer))

        # Keep the model config in sync so padding is recognised downstream.
        model.config.pad_token_id = tokenizer.pad_token_id

        return model, tokenizer

In [1]:
def main(args):
    """Fine-tune a causal language model with LoRA adapters using SFTTrainer.

    Args:
        args: Namespace-like object providing ``model``, ``tokenizer``
            (optional; falls back to ``model`` when ``None`` or empty),
            ``output_dir``, ``train_batch_size``, ``eval_batch_size``,
            ``epochs``, ``train_dataset_dir`` and ``eval_dataset_dir``.
            Batch sizes and epochs may be numbers or numeric strings.
    """
    # Reuse the model checkpoint for the tokenizer when none was given.
    tokenizer_checkpoint = args.tokenizer if args.tokenizer else args.model
    model, tokenizer = ModelInitiator(args.model, tokenizer_checkpoint)()

    # Wrap the base model with trainable LoRA adapters.
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = get_peft_model(model, peft_config=peft_config)
    model.print_trainable_parameters()

    # Prepare datasets.
    train_dataset = MyDataset(args.train_dataset_dir)
    eval_dataset = MyDataset(args.eval_dataset_dir)

    # Training configuration. Numeric settings are coerced because they may
    # arrive as strings from the command line or from notebook placeholders.
    trainingarguments = TrainingArguments(
        output_dir=args.output_dir,
        save_strategy="epoch",
        eval_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=int(args.train_batch_size),
        per_device_eval_batch_size=int(args.eval_batch_size),
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=float(args.epochs),
        fp16=True,
        logging_steps=1,
        metric_for_best_model="train_loss",
        load_best_model_at_end=False,
        seed=42,
        lr_scheduler_type="linear",
    )

    # eval_dataset is handed over so a manual trainer.evaluate() has data;
    # with eval_strategy="no" no evaluation runs automatically in training.
    # NOTE(review): `max_seq_length` and `tokenizer` are the SFTTrainer keyword
    # names of the TRL releases that accept them on the trainer; newer releases
    # move them to SFTConfig / `processing_class`. Confirm the installed version.
    trainer = SFTTrainer(
        model=model,
        args=trainingarguments,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        formatting_func=FormattingFunction(),
        max_seq_length=2048,
        tokenizer=tokenizer,
    )

    trainer.train()

In [42]:
# How the run is configured:
#   "argumentparser" -> read the settings from command-line flags
#   "jupyter_inline" -> use the values assigned in the cell below
types = "jupyter_inline"


def build_arg_parser():
    """Create the command-line parser for the fine-tuning run.

    Batch sizes are parsed as ``int`` and epochs as ``float`` so they reach
    the training configuration as numbers rather than strings.

    Returns:
        argparse.ArgumentParser: Parser exposing ``--model``, ``--tokenizer``,
        ``--output_dir``, ``--train_batch_size``, ``--eval_batch_size``,
        ``--epochs``, ``--train_dataset_dir`` and ``--eval_dataset_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True)
    # Optional: the model checkpoint is reused when this is left out.
    parser.add_argument("--tokenizer", type=str, default=None, required=False)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--train_batch_size", type=int, required=True)
    parser.add_argument("--eval_batch_size", type=int, required=True)
    parser.add_argument("--epochs", type=float, required=True)
    parser.add_argument("--train_dataset_dir", type=str, required=True)
    parser.add_argument("--eval_dataset_dir", type=str, required=True)
    return parser


if __name__ == "__main__":

    if types == "argumentparser":
        args = build_arg_parser().parse_args()

    elif types == "jupyter_inline":
        # Fill these in before running the cell.
        model_checkpoint = ""
        tokenizer_checkpoint = None  # None -> same checkpoint as the model
        output_dir_value = ""
        train_batch_size = None  # int
        eval_batch_size = None  # int
        epochs = None  # int or float
        train_dataset_dir = ""
        eval_dataset_dir = ""

        args = argparse.Namespace(
            model=model_checkpoint,
            tokenizer=tokenizer_checkpoint,
            output_dir=output_dir_value,
            train_batch_size=train_batch_size,
            eval_batch_size=eval_batch_size,
            epochs=epochs,
            train_dataset_dir=train_dataset_dir,
            eval_dataset_dir=eval_dataset_dir,
        )

        # Fail early with a readable message instead of deep inside model
        # loading when a placeholder was left untouched.
        unset = [
            name
            for name, value in vars(args).items()
            if name != "tokenizer" and value in ("", None)
        ]
        if unset:
            raise ValueError(
                "Fill in the following settings before running: " + ", ".join(unset)
            )

    if types in ["argumentparser", "jupyter_inline"]:
        main(args)


In [2]:
# Scratch calculation: square root of 2e-5 * 0.01. The two literals equal the
# learning_rate and weight_decay values passed to TrainingArguments in main().
# NOTE(review): presumably exploratory; confirm whether this cell is still needed.
import math

math.sqrt(2e-5 * 0.01)

0.00044721359549995795

In [3]:
# Scratch calculation: square root of 2e-5 alone (the same literal as the
# learning_rate in main()). Uses the `math` module imported in an earlier cell.
math.sqrt(2e-5)

0.00447213595499958