In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    hf_argparser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm
2023-08-16 07:59:11.544960: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
dataset_name = "mlabonne/guanaco-llama2-1k"
new_model = "llama-2-7b-miniguanaco"

# QLoRA 매개 변수

## LoRA 어텐션 차원
lora_r = 64

## LoRA 스케일링을 위한 Alpha 매개 변수
lora_alpha = 16

## LoRA 층에 대하여 드롭 아웃 확률
lora_dropout = 0.1


# bitsandbytes 매개 변수

## 4 비트 정밀도 활성화
use_4bit = True

## 4 비트 기반 모델을 위해 dtype 계산
bnb_4bit_compute_dtype = "float16"

## 양자화 타입
bnb_4bit_quant_type = "nf4"

## 이중 양자화 활성화
use_nested_quant = False


# 학습 매개 변수

## 모델 예측값과 체크 포인트를 저장할 출력 디렉토리 지정
output_dir = "./results"

## 학습 에포크 횟수
num_train_epochs = 1

## fp16/bf16 학습 허용
fp16 = False
bf16 = False

## 학습할 때 GPU 1대당 배치 크기
per_device_train_batch_size = 4

## 평가할 때 GPU 1대당 배치 크기
per_device_eval_batch_size = 4

## 경사를 누적하는 업데이트 스텝의 횟수
gradient_accumulation_steps  =1

## 경사 체크 포인트 허용
gradient_checkpointing = True

## 최대 경사
max_grad_norm = 0.3

## 초기 학습률
learning_rate = 2e-4

## 편향/정규화 층 가중치를 제외한 모든 층에 적용할 가중치 감쇠
weight_decay = 0.001

## 사용할 최적화기
optim = "paged_adamw_32bit"

## 학습률 스케줄러(상수가 코사인보다 조금 낫습니다)
lr_scheduler_type = "constant"

## 학습 스텝 횟수
max_steps = -1

## 선형 웜업을 위한 스텝별 학습률 증가 비율(0부터 시작하여 지정한 학습률까지 설정값을 참고하여 학습률을 올립니다)
warmup_ratio = 0.03

## 시퀀스의 길이가 같은 것끼리 모읍니다
## 메모리를 절약할 수 있고 학습 속도를 상당히 증속할 수 있습니다
group_by_length = True

## 지정한 스텝마다 체크 포인트 저장
save_steps = 25

## 지정한 스텝마다 로그
logging_step = 25


# SFT 매개 변수

## 사용할 시퀀스의 최대 길이
max_seq_length = None

## 짧은 예시를 하느의 입력 시퀀스에 집어 넣어 효율성을 높입니다
packing = False

## GPU 0에서 모델 전체를 불러오도록 합니다
device_map = {"": 0}

In [3]:
cache_dir="/datadrive/wd/huggingface_cache"

In [4]:
# 데이터 세트 불러오기
dataset = load_dataset(
    dataset_name,
    split="train",
    cache_dir=os.path.join(cache_dir, "datasets")
)

In [5]:
# 설정한 QLoRA로 토크나이저와 모델 불러오기
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_8bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant
)

In [6]:
# GPU가 bfloat16와 호환되는지 확인
if compute_dtype == torch.float16 and use_4bit:
    major, _= torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [7]:
# 기본 모델 불러오기
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    cache_dir=os.path.join(cache_dir, "hub")
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.48s/it]


In [10]:
# LLaMA 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=os.path.join(cache_dir, "hub")
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading (…)okenizer_config.json: 100%|██████████| 746/746 [00:00<00:00, 6.51MB/s]
Downloading tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 682MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 35.2MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 21.0/21.0 [00:00<00:00, 209kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 3.90MB/s]




In [11]:
# LoRA 환경 설정 불러오기
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

In [12]:
# 학습 매개 변수 설정
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_step,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [13]:
# 지도 미세 조정 매개 변수 설정
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing
)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 6894.89 examples/s]


In [14]:
# 모델 학습
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.3323
50,1.6041
75,1.1907
100,1.4146
125,1.1584
150,1.3424
175,1.156
200,1.4406
225,1.1391
250,1.5084




TrainOutput(global_step=250, training_loss=1.3286535873413086, metrics={'train_runtime': 635.1795, 'train_samples_per_second': 1.574, 'train_steps_per_second': 0.394, 'total_flos': 1.7109131586207744e+16, 'train_loss': 1.3286535873413086, 'epoch': 1.0})

In [15]:
# 학습된 모델 저장
trainer.model.save_pretrained(new_model)

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/runs

In [32]:
# 경고 무시
logging.set_verbosity(logging.CRITICAL)

In [33]:
# 학습된 모델을 사용하여 텍스트를 생성하는 파이프라인 구축
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200
)

In [34]:
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])



<s>[INST] What is a large language model? [/INST] A large language model is a type of artificial intelligence that is trained on a large dataset of text to generate human-like language. This can be used for a variety of applications, such as generating text for websites, chatbots, or even entire books.

The most well-known example of a large language model is BERT, which stands for Bidirectional Encoder Representations from Transformers. BERT is a type of transformer model that is trained on a large dataset of text and is able to generate human-like language. BERT has been used for a variety of applications, such as generating text for websites, chatbots, or even entire books.

Other examples of large language models include the Transformer model, which is a type of neural network that is trained on a large dataset of text and is able to generate human-like language. The Transformer model has


In [None]:
# FP16의 모델을 다시 불러온 뒤 LoRA 가중치와 병합
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem,usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map
)
model = PeftModel.from_pretrained(base_mode, new_model)
model = model.merge_and_unload()

In [None]:
# 토크나이저를 다시 불러 오고 저장
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)