In [None]:
!pip install transformers==4.40.1 datasets==2.19.0 accelerate==0.30.0 peft==0.10.0 bitsandbytes==0.43.1 -qqq

# GPU에 올라가는 데이터 살펴보기

## GPU 메모리 분해하기

**메모리 사용량 측정을 위한 함수 구현**  
- `torch.cuda.memory_allocated()`

In [11]:
import torch

In [12]:
def print_gpu_utilization():
    """Print the currently allocated CUDA memory in GB.

    The figure is ``torch.cuda.memory_allocated()`` divided by 1024**3 and
    formatted with three decimals. When CUDA is not available, a hint to
    switch the runtime to GPU is printed instead.
    """
    if not torch.cuda.is_available():
        print("Change Runtime type to GPU.")
        return

    bytes_per_gb = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / bytes_per_gb
    print(f"GPU memory usage: {allocated_gb:.3f} GB")

In [3]:
# Report the currently allocated GPU memory before a model is loaded in the next section.
print_gpu_utilization()

GPU memory usage: 0.000 GB


**모델을 불러오고 GPU 메모리와 데이터 타입 확인**

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model_and_tokenizer(model_id, peft=None):
    """Load a causal language model onto GPU 0 together with its tokenizer.

    Args:
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        peft: Only ``None`` (plain model, no adapter) is supported by this
            definition.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``peft`` is anything other than ``None``.
    """
    # Fail fast: an unsupported value used to leave `model` unassigned and
    # only surfaced as an UnboundLocalError at the return statement.
    if peft is not None:
        raise ValueError(f"Unsupported peft option: {peft!r}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"": 0})
    # Show how much GPU memory is allocated once the weights are loaded.
    print_gpu_utilization()
    return model, tokenizer

In [5]:
# Model identifier passed to the from_pretrained loaders.
model_id = "EleutherAI/polyglot-ko-1.3b"

# Load the model (the helper prints allocated GPU memory), then show the
# parameter dtype that torch_dtype="auto" selected.
model, tokenizer = load_model_and_tokenizer(model_id)
print("Model data type: ", model.dtype)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



GPU memory usage: 2.599 GB
Model data type:  torch.float16


**그레이디언트와 옵티마이저 상태의 메모리 사용량을 계산하는 함수**

In [10]:
from transformers import AdamW
from torch.utils.data import DataLoader

# Check gradient memory usage
def estimate_memory_of_gradients(model):
    """Return the total size in bytes of all populated parameter gradients.

    Parameters whose ``.grad`` is ``None`` contribute nothing; the result is
    0 when no parameter has a gradient.
    """
    grads = (p.grad for p in model.parameters())
    return sum(g.nelement() * g.element_size() for g in grads if g is not None)

# Check optimizer state memory usage
def estimate_memory_of_optimizer(optimizer):
    """Return the total size in bytes of all tensors stored in ``optimizer.state``.

    Non-tensor state entries (e.g. plain Python numbers) are ignored.
    """
    total_bytes = 0
    for param_state in optimizer.state.values():
        total_bytes += sum(
            entry.nelement() * entry.element_size()
            for entry in param_state.values()
            if torch.is_tensor(entry)
        )
    return total_bytes

**모델의 학습 과정에서 메모리 사용량을 확인하는 train_model 정의**

In [9]:
def train_model(model, dataset, training_args):
    """Run one pass over ``dataset`` and report GPU, optimizer and gradient memory.

    Args:
        model: Model whose forward call accepts a batch as keyword arguments
            and returns an object with a ``loss`` attribute.
        dataset: Dataset wrapped in a ``DataLoader``; each batch must be a
            dict of tensors.
        training_args: Object providing ``gradient_checkpointing``,
            ``per_device_train_batch_size`` and ``gradient_accumulation_steps``.

    Prints the allocated GPU memory once, right after the first optimizer
    step (before gradients are cleared), and finally the optimizer-state and
    gradient sizes measured at the last optimizer step. Both sizes are
    reported as 0 when no optimizer step ran.
    """
    if training_args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    train_dataloader = DataLoader(dataset, batch_size=training_args.per_device_train_batch_size)
    optimizer = AdamW(model.parameters())
    # from_pretrained hands back a model in eval mode; switch to training mode
    # so the measurement reflects a real training step (the later redefinition
    # of this function in the notebook does the same).
    model.train()
    gpu_utilization_printed = False
    # Defaults keep the final report valid even if the loop never reaches an
    # optimizer step (fewer batches than gradient_accumulation_steps).
    gradient_memory = 0
    optimizer_memory = 0

    for step, batch in enumerate(train_dataloader, start=1):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)

        # Scale the loss so the accumulated gradient matches one large batch.
        loss = outputs.loss
        loss = loss / training_args.gradient_accumulation_steps
        loss.backward()

        if step % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            # Measure before zero_grad(), while gradients are still populated.
            gradient_memory = estimate_memory_of_gradients(model)
            optimizer_memory = estimate_memory_of_optimizer(optimizer)
            if not gpu_utilization_printed:
                print_gpu_utilization()
                gpu_utilization_printed = True
            optimizer.zero_grad()

    print(f"Optimizer state memory usage: {optimizer_memory / (1024 ** 3):.3f} GB")
    print(f"Gradient memory usage: {gradient_memory / (1024 ** 3):.3f} GB")

**랜덤 데이터셋을 생성하는 make_dummy_dataset 정의**

In [7]:
import numpy as np
from datasets import Dataset

In [8]:
def make_dummy_dataset():
    """Create a dataset of random token ids for memory experiments.

    Produces 64 rows of length 256 with ``input_ids`` and ``labels`` columns,
    each drawn independently from ``np.random.randint(100, 30000)``, and sets
    the dataset format to ``"pt"``.
    """
    seq_len = 256
    dataset_size = 64
    shape = (dataset_size, seq_len)

    # Draw input_ids first, then labels, to keep the random stream order.
    input_ids = np.random.randint(100, 30000, shape)
    labels = np.random.randint(100, 30000, shape)

    dataset = Dataset.from_dict({"input_ids": input_ids, "labels": labels})
    dataset.set_format("pt")
    return dataset

**더 이상 사용하지 않는 GPU 메모리를 반환하는 cleanup 함수**

In [5]:
import gc

In [6]:
def cleanup():
    """Drop the global ``model`` and ``dataset`` names, then free cached GPU memory.

    Names that are not defined are skipped. Afterwards a garbage collection
    is run and ``torch.cuda.empty_cache()`` is called.
    """
    namespace = globals()
    for name in ('model', 'dataset'):
        if name in namespace:
            del namespace[name]
    gc.collect()
    torch.cuda.empty_cache()

**GPU 사용량을 확인하는 gpu_memory_experiment 함수 정의**

In [4]:
from transformers import TrainingArguments, Trainer

def gpu_memory_experiment(batch_size,
                          gradient_accumulation_steps=1,
                          gradient_checkpointing=False,
                          model_id="EleutherAI/polyglot-ko-1.3b",
                          peft=None):
    """Load a model, train on dummy data once, and print GPU memory figures.

    Args:
        batch_size: Per-device training batch size.
        gradient_accumulation_steps: Number of batches accumulated per
            optimizer step.
        gradient_checkpointing: Whether to enable gradient checkpointing.
        model_id: Model identifier passed to ``load_model_and_tokenizer``.
        peft: PEFT mode forwarded to ``load_model_and_tokenizer``.

    A CUDA out-of-memory ``RuntimeError`` is printed instead of propagated;
    any other ``RuntimeError`` is re-raised. In all cases the model and
    dataset are released and the remaining GPU memory is printed.
    """
    print(f"Batch size: {batch_size}")
    # The tokenizer is not needed for the experiment, so it is discarded.
    model, _ = load_model_and_tokenizer(model_id, peft=peft)
    if gradient_checkpointing or peft == 'qlora':
        model.config.use_cache = False

    dataset = make_dummy_dataset()

    training_args = TrainingArguments(
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        gradient_checkpointing=gradient_checkpointing,
        output_dir='./result',
        num_train_epochs=1
    )

    try:
        train_model(model, dataset, training_args)
    except RuntimeError as e:
        if "CUDA out of memory" not in str(e):
            # Bare raise keeps the original traceback intact.
            raise
        print(e)
    finally:
        # Release references before emptying the cache so the memory can
        # actually be returned.
        del model, dataset
        gc.collect()
        torch.cuda.empty_cache()
        print_gpu_utilization()

**배치 크기를 변경하며 메모리 사용량 측정**

In [13]:
# Drop leftover globals and show the starting GPU memory.
cleanup()
print_gpu_utilization()

# Repeat the experiment with increasing batch sizes.
for batch_size in [4, 8, 16]:
    gpu_memory_experiment(batch_size)

    torch.cuda.empty_cache()

GPU memory usage: 0.000 GB
Batch size: 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU memory usage: 2.599 GB




GPU memory usage: 10.586 GB
Optimizer state memory usage: 4.961 GB
Gradient memory usage: 2.481 GB
GPU memory usage: 0.016 GB
Batch size: 8


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU memory usage: 2.615 GB
GPU memory usage: 11.113 GB




Optimizer state memory usage: 4.961 GB
Gradient memory usage: 2.481 GB
GPU memory usage: 0.016 GB
Batch size: 16


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU memory usage: 2.615 GB




GPU memory usage: 12.164 GB
Optimizer state memory usage: 4.961 GB
Gradient memory usage: 2.481 GB
GPU memory usage: 0.016 GB


# 단일 GPU 효율적으로 사용하기

## 그레이디언트 누적

**그레이디언트 누적 관련 부분** 

In [3]:
def train_model(model, dataset, training_args):
    """Run one pass over ``dataset`` with gradient accumulation and report memory.

    Args:
        model: Model whose forward call accepts a batch as keyword arguments
            and returns an object with a ``loss`` attribute.
        dataset: Dataset wrapped in a ``DataLoader``; each batch must be a
            dict of tensors.
        training_args: Object providing ``gradient_checkpointing``,
            ``per_device_train_batch_size`` and ``gradient_accumulation_steps``.

    Prints the allocated GPU memory once, right after the first optimizer
    step (before gradients are cleared), and finally the optimizer-state and
    gradient sizes measured at the last optimizer step. Both sizes are
    reported as 0 when no optimizer step ran.
    """
    if training_args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    train_dataloader = DataLoader(dataset, batch_size=training_args.per_device_train_batch_size)
    optimizer = AdamW(model.parameters())
    model.train()
    gpu_utilization_printed = False
    # Defaults keep the final report valid even if the loop never reaches an
    # optimizer step (fewer batches than gradient_accumulation_steps).
    gradients_memory = 0
    optimizer_memory = 0

    for step, batch in enumerate(train_dataloader, start=1):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # With 4 accumulation steps, the loss is divided by 4 before backpropagation.
        loss = loss / training_args.gradient_accumulation_steps
        loss.backward()

        # Stepping every N batches has the same effect as an N-times larger batch.
        if step % training_args.gradient_accumulation_steps == 0:
            optimizer.step()
            # Measure before zero_grad(), while gradients are still populated.
            gradients_memory = estimate_memory_of_gradients(model)
            optimizer_memory = estimate_memory_of_optimizer(optimizer)
            if not gpu_utilization_printed:
                print_gpu_utilization()
                gpu_utilization_printed = True
            optimizer.zero_grad()

    print(f"Optimizer state memory usage: {optimizer_memory / (1024 ** 3):.3f} GB")
    print(f"Gradient memory usage: {gradients_memory / (1024 ** 3):.3f} GB")

**그레이디언트 누적을 적용했을 때의 메모리 사용량**

In [16]:
# Drop leftover globals and show the starting GPU memory.
cleanup()
print_gpu_utilization()

# Batch size 4 with 4 accumulation steps per optimizer step.
gpu_memory_experiment(batch_size=4, gradient_accumulation_steps=4)

torch.cuda.empty_cache()

GPU memory usage: 8.104 GB
Batch size: 4


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU memory usage: 10.703 GB




GPU memory usage: 18.674 GB
Optimizer state memory usage: 4.961 GB
Gradient memory usage: 2.481 GB
GPU memory usage: 8.104 GB


## 그레이디언트 체크포인팅

**그레이디언트 체크포인팅 사용 시 메모리 사용량**

In [12]:
# Drop leftover globals and show the GPU memory before the checkpointing run.
cleanup()
print_gpu_utilization()

GPU memory usage: 0.000 GB


In [17]:
# Batch size 16 with gradient checkpointing enabled.
gpu_memory_experiment(batch_size=16, gradient_checkpointing=True)

torch.cuda.empty_cache()

Batch size: 16


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GPU memory usage: 5.198 GB




GPU memory usage: 12.888 GB
Optimizer state memory usage: 4.961 GB
Gradient memory usage: 2.481 GB
GPU memory usage: 2.615 GB


# 효율적인 학습 방법(PEFT): LoRA

## LoRA 학습 사용하기

**모델을 불러오면서 LoRA 적용하기**

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

def load_model_and_tokenizer(model_id, peft=None):
    """Load a causal language model onto GPU 0, optionally wrapped with LoRA.

    Args:
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        peft: ``None`` for the plain model or ``'lora'`` to attach LoRA
            adapters to the ``query_key_value`` modules.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``peft`` is neither ``None`` nor ``'lora'``.
    """
    # Fail fast: an unsupported value used to leave `model` unassigned and
    # only surfaced as an UnboundLocalError at the return statement.
    if peft not in (None, 'lora'):
        raise ValueError(f"Unsupported peft option: {peft!r}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Both modes start from the same base model.
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"": 0})

    if peft == 'lora':
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["query_key_value"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    print_gpu_utilization()

    return model, tokenizer

In [22]:
# Drop leftover globals and show the starting GPU memory.
cleanup()
print_gpu_utilization()

# Batch size 16 with LoRA adapters.
gpu_memory_experiment(batch_size=16, peft='lora')

torch.cuda.empty_cache()

GPU memory usage: 0.000 GB
Batch size: 16


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 1,572,864 || all params: 1,333,383,168 || trainable%: 0.11796039111242178
GPU memory usage: 2.602 GB




GPU memory usage: 4.732 GB
Optimizer state memory usage: 0.006 GB
Gradient memory usage: 0.003 GB
GPU memory usage: 0.016 GB


**4비트 양자화 모델 불러오기**

In [14]:
from transformers import BitsAndBytesConfig

In [23]:
# 4-bit loading with the NF4 quant type, double quantization enabled,
# and the compute dtype set to bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
# Display the module structure of the model loaded with the 4-bit config.
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(30080, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear4bit(in_features=2048, out_features=6144, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear4bit(in_features=8192, out_features=204

**QLoRA 모델 불러오기 추가**

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

2024-11-15 00:32:29.189149: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-15 00:32:29.283063: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-15 00:32:29.287242: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-11-15 00:32:29.287251: I tensorflow/stream_executor/cuda

In [2]:
def load_model_and_tokenizer(model_id, peft=None):
    """Load a causal language model onto GPU 0 as a plain, LoRA or QLoRA model.

    Args:
        model_id: Identifier passed to the ``from_pretrained`` loaders.
        peft: ``None`` for the plain model, ``'lora'`` to attach LoRA
            adapters, or ``'qlora'`` to load the base model in 4-bit NF4 and
            then attach LoRA adapters.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``peft`` is not one of ``None``, ``'lora'``, ``'qlora'``.
    """
    # Fail fast: an unsupported value used to leave `model` unassigned and
    # only surfaced as an UnboundLocalError at the return statement.
    if peft not in (None, 'lora', 'qlora'):
        raise ValueError(f"Unsupported peft option: {peft!r}")

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    if peft == 'qlora':
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0})
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map={"": 0})

    if peft is not None:
        # LoRA and QLoRA share the same adapter configuration.
        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=["query_key_value"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    print_gpu_utilization()
    return model, tokenizer

**QLoRA를 적용했을 때의 GPU 메모리 사용량 확인**

In [15]:
# Drop leftover globals and show the starting GPU memory.
cleanup()
print_gpu_utilization()

# Batch size 16 with QLoRA (4-bit base model plus LoRA adapters).
gpu_memory_experiment(batch_size=16, peft='qlora')

torch.cuda.empty_cache()

GPU memory usage: 0.000 GB
Batch size: 16


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 1,572,864 || all params: 1,333,383,168 || trainable%: 0.11796039111242178
GPU memory usage: 1.167 GB




GPU memory usage: 6.972 GB
Optimizer state memory usage: 0.012 GB
Gradient memory usage: 0.006 GB
GPU memory usage: 0.016 GB
