# 1. Import thư viện sẽ dùng

In [1]:
import torch
import nltk
from pprint import pprint
from tqdm import tqdm
import pretty_errors
from IPython.display import display, Markdown

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from datasets import load_dataset
import transformers
from transformers import (
    GemmaForCausalLM, # Model loader
    GemmaTokenizer, # tokenizer loader
    GemmaConfig, # Model config loader
    BitsAndBytesConfig, # BitsAndBytes model loader
    TrainingArguments, # Tham số huấn luyện
    logging, # Kiểm soát log
)
from peft import LoraConfig, get_peft_model # Parameter Efficient Fine-Tuning
from trl import SFTTrainer # Hàm hỗ trợ quá trình huấn luyện (Supervised Fine-Tuning)

logging.get_logger().setLevel(logging.ERROR)
warnings.filterwarnings('ignore')

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
!nvidia-smi

Sat Apr  6 10:13:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.76                 Driver Version: 551.76         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090      WDDM  |   00000000:05:00.0  On |                  N/A |
| 99%   27C    P5             57W /  390W |     153MiB /  24576MiB |     30%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# II. Load dữ liệu

In [5]:
dataset = load_dataset("microsoft/orca-math-word-problems-200k", split = "train")
print(dataset)

for i in range(5):
    print(dataset[i])

Dataset({
    features: ['question', 'answer'],
    num_rows: 200035
})
{'question': 'Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook.', 'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line faster than him.'}
{'question': 'A number divided by 10 is 6. Yoongi got the result by subtracting 15 from a certain number. What is the result he got?', 'answer': 'Let\'s call the certain number "x". According to the information given:\n\nA number divided by 10 is 6:\nx / 10 = 6\n\nYoongi got the result by subtracting 15 from x:\nResult = x - 15\n\nFirst, we need to find the value of x. We can do this by solving the first equation:\n\nx / 10 = 6\nx = 6 * 10\nx = 60\n\nNow that we know x is 60, we can find the result Yoongi got by subtracting 15 from x:\n\nResult = x - 15\nResult = 60 - 15\nResult = 45\n\nSo, the result Yoongi got is 45.'}
{'question': 'Dongju selects a piece of paper with a number written on it, and want

In [6]:
pprint(dataset.features)

{'answer': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None)}


In [7]:
pprint(dataset.column_names)

['question', 'answer']


In [8]:
pprint(dataset[0])

{'answer': 'If Jungkook is in 5th place, then 4 people crossed the finish line '
           'faster than him.',
 'question': 'Jungkook is the 5th place. Find the number of people who crossed '
             'the finish line faster than Jungkook.'}


In [9]:
def formatting_function(examples):
    instruction_template =  """
                                **Instruction:**
                                Please answer the following math problem.

                                **Input:**
                                {input}

                                **Response:**
                                {response}
                            """

    # instruction = examples['instruction']
    input_text = examples['question']
    response_text = examples['answer']

    formatted_data = instruction_template.format(
        # instruction = instruction,
        input = input_text,
        response = response_text
    )

    return {"prompt": formatted_data}

In [10]:
formatted_dataset = dataset.map(formatting_function)

In [11]:
formatted_dataset = formatted_dataset.train_test_split(test_size = 0.1, seed = 42)

In [12]:
formatted_dataset['train'][0]

{'question': 'Jinho spent half of his money and 300 won at the first store, then half of his money and 400 won at the second store, and he had no money left. Find how much money Jinho had at the beginning.',
 'answer': "Let's denote the amount of money Jinho had at the beginning as \\( M \\).\n\nAt the first store, he spent half of his money and an additional 300 won. So the amount he spent at the first store is \\( \\frac{M}{2} + 300 \\).\n\nAfter spending at the first store, he is left with \\( M - (\\frac{M}{2} + 300) = \\frac{M}{2} - 300 \\).\n\nAt the second store, he spent half of the remaining money and an additional 400 won. So the amount he spent at the second store is \\( \\frac{(\\frac{M}{2} - 300)}{2} + 400 \\).\n\nAfter spending at the second store, he has no money left, so the amount he spent at the second store is equal to the amount he had after the first store, which is \\( \\frac{M}{2} - 300 \\).\n\nTherefore, we have the equation:\n\n\\[ \\frac{(\\frac{M}{2} - 300)}{

# III. Khởi tạo thông số fine-tune

In [13]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [14]:
model_id = "google/gemma-2b" # Mô hình

lora_config = LoraConfig(
    r = 32, # Số lượng bit cho Quantization
    lora_alpha = 12, # Số lượng bit cho Activation
    lora_dropout = 0.05, # Dropout
    fan_in_fan_out = True,
    bias = "none",
    task_type = "CAUSAL_LM"
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

base_model = GemmaForCausalLM.from_pretrained(model_id, quantization_config = bnb_config, device_map = "auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
tokenizer = GemmaTokenizer.from_pretrained(model_id)

In [16]:
special_tokens = {
    "pad_token": "<pad>",
    "eos_token": "<eos>",
    "bos_token": "<bos>",
    "unk_token": "<unk>",
    "sep_token": "<sep>",
    "cls_token": "<cls>"
}

tokenizer.add_tokens(list(special_tokens.values()))

2

In [17]:
print_trainable_parameters(get_peft_model(base_model, lora_config))

trainable params: 3686400 || all params: 1518954496 || trainable%: 0.24269324786935553


In [18]:
pprint(base_model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=32, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=32, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=256, bias=False)

In [19]:
trainer = SFTTrainer(
    base_model,
    train_dataset = formatted_dataset['train'],
    eval_dataset = formatted_dataset['test'],
    args = TrainingArguments(
        num_train_epochs = 3,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 20,
        max_grad_norm = 0.3,
        learning_rate = 2e-4,
        warmup_steps = 25,
        save_steps = 25,
        eval_steps = 25,
        max_steps = -1,
        warmup_ratio = 0.03,
        weight_decay = 0.001,
        fp16 = True,
        output_dir = "./Gemma-2B-finetuned-GVI",
        optim = "paged_adamw_8bit",
        logging_steps = 25,
    ),
    peft_config = lora_config,
    dataset_text_field = 'prompt',
)

Map:   0%|          | 0/20004 [00:00<?, ? examples/s]

In [20]:
%%time

tqdm(trainer.train())

{'loss': 1.1403, 'grad_norm': 0.1390424370765686, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 0.8804, 'grad_norm': 0.12615631520748138, 'learning_rate': 0.00019981466380013344, 'epoch': 0.01}
{'loss': 0.6878, 'grad_norm': 0.09305422753095627, 'learning_rate': 0.0001996293276002669, 'epoch': 0.01}
{'loss': 0.6532, 'grad_norm': 0.08976873010396957, 'learning_rate': 0.00019944399140040033, 'epoch': 0.01}
{'loss': 0.636, 'grad_norm': 0.12817758321762085, 'learning_rate': 0.00019925865520053378, 'epoch': 0.01}
{'loss': 0.6295, 'grad_norm': 0.09994305670261383, 'learning_rate': 0.00019907331900066721, 'epoch': 0.02}
{'loss': 0.6243, 'grad_norm': 0.0967579260468483, 'learning_rate': 0.00019888798280080064, 'epoch': 0.02}
{'loss': 0.6141, 'grad_norm': 0.1250607967376709, 'learning_rate': 0.0001987026466009341, 'epoch': 0.02}
{'loss': 0.6095, 'grad_norm': 0.11771272867918015, 'learning_rate': 0.00019851731040106753, 'epoch': 0.02}
{'loss': 0.6263, 'grad_norm': 0.11927448213100433, 'learning

PermissionError: [WinError 5] Access is denied: './Gemma-2B-finetuned-GVI\\tmp-checkpoint-250' -> './Gemma-2B-finetuned-GVI\\checkpoint-250'

# IV. Commit lên HuggingFace

In [None]:
# from huggingface_hub import notebook_login

# notebook_login() # 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

In [None]:
# base_model.push_to_hub("KasaiDanto/GVI")

In [None]:
# tokenizer.push_to_hub("KasaiDanto/GVI")

In [None]:
# trainer.push_to_hub("KasaiDanto/GVI")

# V. Test

In [None]:
# lora_config = LoraConfig.from_pretrained("KasaiDanto/GVI")
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit = True,
#     bnb_4bit_use_double_quant = True,
#     bnb_4bit_quant_type = "nf4",
#     bnb_4bit_compute_dtype = torch.bfloat16
# )

# tokenizer = GemmaTokenizer.from_pretrained("KasaiDanto/GVI")
# model = GemmaForCausalLM.from_pretrained(
#     lora_config.base_model_name_or_path,
#     quantization_config = bnb_config,
#     device_map= {"":0})

In [None]:
# from IPython.display import display, Markdown

# def make_inference(instruction, context = None):
#     if context:
#         prompt = f"Below is an instruction that describes a task, paired with an input that provides further context.\n\n### Instruction: \n{instruction}\n\n### Input: \n{context}\n\n### Response: \n"
#     else:
#         prompt = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: \n{instruction}\n\n### Response: \n"
#     inputs = tokenizer(prompt, return_tensors = "pt", return_token_type_ids = False).to("cuda:0")
#     outputs = model.generate(**inputs, max_new_tokens = 50)
#     display(Markdown((tokenizer.decode(outputs[0], skip_special_tokens = True))))
#     outputs = base_model.generate(**inputs, max_new_tokens = 50)
#     print("---- NON-INSTRUCT-TUNED-MODEL ----")
#     display(Markdown((tokenizer.decode(outputs[0], skip_special_tokens = True))))

In [None]:
# make_inference('Tiền đề: Một người đàn ông đang trượt patin trước một chiếc ghế gỗ. Hãy dịch câu này sang tiếng Anh.')