<a href="https://colab.research.google.com/github/JaehoHoya/llama3_tuning/blob/main/FITIZEN_FINETUNING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup: %%capture suppresses the pip output of this cell.
# First line installs Unsloth from its GitHub repository with the "colab-new" extra.
# Second line installs the remaining training packages with --no-deps, so pip does
# not resolve or replace their dependencies.
# NOTE(review): a later cell uninstalls and reinstalls xformers, so the "<0.0.27"
# bound below is not the xformers version that ends up in use — confirm whether
# this bound is still wanted.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [5]:
!pip uninstall -y xformers
!pip install xformers


Found existing installation: xformers 0.0.28.post3
Uninstalling xformers-0.0.28.post3:
  Successfully uninstalled xformers-0.0.28.post3
Collecting xformers
  Using cached xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
Installing collected packages: xformers
Successfully installed xformers-0.0.28.post3


In [9]:
from unsloth import FastLanguageModel
import torch

# ---- Loading options (read by later cells as well) ----
# Maximum sequence length; any value may be chosen (Unsloth handles RoPE scaling internally).
max_seq_length = 2048
# None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere and newer.
dtype = None
# 4-bit quantization reduces memory usage; may be set to False.
load_in_4bit = True

# Reference list of pre-quantized 4-bit checkpoints published by Unsloth
# (faster to download, fewer OOMs). It is informational only: the call below
# passes its own model name and does not read this list.
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1 (15 trillion tokens)
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo 12b
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v3
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi-3
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

# Load the base model and its tokenizer with the options above.
# For gated repositories a `token="hf_..."` argument can be supplied here;
# read it from a secret or environment variable rather than writing it inline.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [10]:
# Wrap the loaded base model with LoRA adapters.
# NOTE(review): this rebinds `model`; re-running the cell would pass the
# already-wrapped model back in — confirm that is safe before re-executing.
model = FastLanguageModel.get_peft_model(
    model,
    # -- adapter shape --
    r=16,             # LoRA rank: any value > 0 (suggested 8, 16, 32, 64, 128)
    lora_alpha=16,
    lora_dropout=0,   # any value is supported, but 0 is the optimized setting
    bias="none",      # any value is supported, but "none" is the optimized setting
    # -- which projection layers receive adapters --
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # -- optional variants, both disabled --
    use_rslora=False,    # rank-stabilized LoRA
    loftq_config=None,   # LoftQ
    # -- memory / reproducibility --
    # True or "unsloth"; per Unsloth, "unsloth" uses less VRAM and suits very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [11]:
# Korean Alpaca-style training template. The preamble and the instruction text
# are fixed; only {input} and {response} are filled in per example.
# NOTE(review): the inference cell later in this notebook rebinds `alpaca_prompt`
# to a template with an {instruction} slot and a different instruction text, so
# the prompt seen at inference differs from the one trained on — confirm intended.
alpaca_prompt = """아래는 작업을 설명하는 지시사항입니다. 입력된 내용을 바탕으로 적절한 응답을 작성하세요.
### 지시사항:
아래 입력에 대한 적절한 응답을 제공하세요.
### 입력:
{input}
### 응답:
{response}
"""

EOS_TOKEN = tokenizer.eos_token  # end-of-sequence token; it must be appended to every training example

def formatting_prompts_func(examples):
    """Render a batch of examples into training strings.

    Args:
        examples: Batched mapping with parallel sequences under the keys
            "instruction" and "output".

    Returns:
        A dict ``{"text": [...]}`` holding one string per example: the
        ``alpaca_prompt`` template filled with the example's "instruction"
        value (in the ``{input}`` slot) and "output" value (in the
        ``{response}`` slot), followed by ``EOS_TOKEN``.
    """
    questions = examples["instruction"]
    answers = examples["output"]
    # The EOS token is appended to every rendered example.
    rendered = [
        alpaca_prompt.format(input=question, response=answer) + EOS_TOKEN
        for question, answer in zip(questions, answers)
    ]
    return {"text": rendered}

from datasets import load_dataset

# Fetch the training split, then render every row through
# formatting_prompts_func in batches. All original columns are dropped,
# leaving only the generated "text" column.
dataset = load_dataset("jeongjaeho/FITIZEN", split="train")
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names,
)

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # -- data --
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # Packing can make training 5x faster for short sequences.
    # -- optimisation --
    args=TrainingArguments(
        output_dir="outputs",
        seed=3407,
        # 15 passes over the dataset (set to 1 for a single full training run).
        num_train_epochs=15,
        # 2 sequences per device x 4 accumulation steps per optimizer update.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=5,
        optim="adamw_8bit",
        weight_decay=0.01,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
    ),
)

Map (num_proc=2):   0%|          | 0/131 [00:00<?, ? examples/s]

In [13]:
#@title Show current memory stats
# Report the GPU's name and total memory, plus the peak memory reserved by
# PyTorch so far. Byte counts are converted to GiB (2**30 bytes) and rounded
# to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.168 GB.
19.281 GB of memory reserved.


In [3]:
pip install --upgrade xformers


Collecting xformers
  Using cached xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.28.post3-cp310-cp310-manylinux_2_28_x86_64.whl (16.7 MB)
Installing collected packages: xformers
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.26.post1
    Uninstalling xformers-0.0.26.post1:
      Successfully uninstalled xformers-0.0.26.post1
Successfully installed xformers-0.0.28.post3


In [9]:
import torch

# Sanity check: print the installed PyTorch version, then whether CUDA is
# available, each on its own line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")


2.5.1+cu124
True


In [15]:
# Run training with the `trainer` built earlier; the value returned by
# train() is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 131 | Num Epochs = 15
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 240
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.5606
2,0.4589
3,0.5618
4,0.4239
5,0.4751
6,0.5243
7,0.525
8,0.6009
9,0.4617
10,0.4338


In [17]:
# Switch the model to Unsloth's inference mode (native 2x faster inference).
FastLanguageModel.for_inference(model)

# Inference-time prompt template: the instruction is a slot here and the
# template stops right after the response header so the model continues from it.
# NOTE(review): this rebinds the global `alpaca_prompt`. The template now needs
# {instruction} and has no {response}, so re-running the dataset formatting cell
# after this one would call .format(input=..., response=...) on it and fail with
# a KeyError — consider a separate name for the inference template.
alpaca_prompt = """아래는 작업을 설명하는 지시사항입니다. 입력된 내용을 바탕으로 적절한 응답을 작성하세요.
### 지시사항:
{instruction}
### 입력:
{input}
### 응답:
"""

# Prompt contents for this query.
instruction = "당신은 FITIZEN의 정보와 운동 정보,운동루틴 운동 식단에 대해 잘알고 있는 전문가입니다. "
input_text = "FITIZEN웹사이트 만든 개발자 누구야 "

# Tokenize a batch of one rendered prompt and move the tensors to the GPU.
inputs = tokenizer(
    [alpaca_prompt.format(instruction=instruction, input=input_text)],
    return_tensors="pt",
).to("cuda")

# Generate up to 64 new tokens.
outputs = model.generate(**inputs, use_cache=True, max_new_tokens=64)

# Decode the first (only) sequence without special tokens and show it.
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(generated_text)

아래는 작업을 설명하는 지시사항입니다. 입력된 내용을 바탕으로 적절한 응답을 작성하세요.
### 지시사항:
당신은 FITIZEN의 정보와 운동 정보,운동루틴 운동 식단에 대해 잘알고 있는 전문가입니다. 
### 입력:
FITIZEN웹사이트 만든 개발자 누구야 
### 응답:
FITIZEN 웹사이트는 연주승, 정재호, 박성재가 개발한 3분할 팀이 맡아 만들었습니다.



In [18]:
import getpass
import os

# SECURITY: never write a Hugging Face token into the notebook. The token is
# read from the HF_TOKEN environment variable, falling back to an interactive
# prompt whose input is not echoed or stored in the cell source.
# NOTE(review): the token that was previously hard-coded in this cell has been
# exposed through the notebook and should be revoked at
# https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face write token: ")

# Each variant below is toggled by flipping its `if False` / `if True` guard.
# Change the repository id to your own username before pushing.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho", tokenizer, token = hf_token)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho", tokenizer, quantization_method = "q4_k_m", token = hf_token)

# Active export: convert to q8_0 GGUF and push it to the Hub repository.
if True:
    model.push_to_hub_gguf(
        "jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho", # Change to your own username!
        tokenizer,
        quantization_method = "q8_0",
        token = hf_token,
    )

Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 36.2 out of 52.96 RAM for saving.


100%|██████████| 32/32 [00:01<00:00, 16.19it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho into q8_0 GGUF format.
The output location will be /content/jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho/unsloth.Q8_0.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama3-fitizen-Ko-8b-Instruct-jaeho
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
I

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/jeongjaeho/Llama3-fitizen-Ko-8b-Instruct-jaeho


In [7]:
import os
import torch
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
import getpass
import os

from huggingface_hub import login

# SECURITY: the token is no longer embedded in the notebook. It is taken from
# the HF_TOKEN environment variable, or typed at a non-echoing prompt.
# NOTE(review): the token previously hard-coded here has been exposed and
# should be revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: "))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [4]:
# Base model to start from (presumably a Hugging Face repo id — confirm where it is loaded).
# Alternative kept for reference: base_model = "beomi/Llama-3-Open-Ko-8B"
base_model = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
# Name for the fine-tuned model; its use is not visible in this part of the notebook.
new_model = "Llama3-fitizen-Ko-8b-jaeho-meta"

In [5]:
# Dataset repository to train on.
# (Alternative kept for reference: dataset_namehk = "hyokwan/localllama")
dataset_name = "jeongjaeho/FITIZEN"

# Only the "train" split is loaded.
dataset = load_dataset(path=dataset_name, split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/343 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/131 [00:00<?, ? examples/s]

In [6]:
# Spot-check one raw record (index 35) to see the available fields.
dataset[35]

{'instruction': 'FITIZEN의 개발자는 누구인가요?',
 'output': 'FITIZEN은 연주승, 정재호, 박성재가 개발한 2024년 초보자를 위한 운동 웹사이트입니다.',
 'input': ''}

In [7]:
def create_text_column(example):
    """Add a combined prompt/response string to one example.

    Args:
        example: Mapping that has "instruction" and "output" entries.

    Returns:
        The same ``example`` object, modified in place, with a new "text"
        entry: the instruction under a "### Instruction:" header, a blank
        line, then the output under a "### Response:" header.
    """
    template = "### Instruction:\n{}\n\n### Response:\n{}"
    example["text"] = template.format(example["instruction"], example["output"])
    return example

# Apply create_text_column to every row to add the 'text' column; `dataset`
# is rebound to the mapped result.
dataset = dataset.map(create_text_column)

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

In [8]:
# Display the dataset summary (features and row count) after the 'text' column was added.
dataset

Dataset({
    features: ['instruction', 'output', 'input', 'text'],
    num_rows: 131
})

In [9]:
# Spot-check one record (index 34), including its generated 'text' field.
dataset[34]

{'instruction': 'FITIZEN의 운동 추천 기능은 어떻게 동작하나요?',
 'output': 'FITIZEN의 운동 추천 기능은 사용자의 현재 신체 상태와 목표에 맞추어 적합한 운동을 자동으로 추천합니다.',
 'input': '',
 'text': '### Instruction:\nFITIZEN의 운동 추천 기능은 어떻게 동작하나요?\n\n### Response:\nFITIZEN의 운동 추천 기능은 사용자의 현재 신체 상태와 목표에 맞추어 적합한 운동을 자동으로 추천합니다.'}

In [2]:
# 현재 사용 중인 GPU의 주요 아키텍처 버전을 반환 8버전 이상 시 bfloat16 활용
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

# BitsAndBytesConfig 객체활용 양자화 설정
# 모델을 4비트 양자화하여 로드
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False,
)

NameError: name 'torch' is not defined