In [None]:
  # # python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# !pip install ninja packaging
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [None]:
!pip install accelerate -U
!pip install transformers -U
!pip install peft datasets trl bitsandbytes wandb

# SETUP config, model

In [None]:
# setup libs
import torch
from transformers import AutoTokenizer, \
    AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
# from transformers.utils import logging
import os
from peft import prepare_model_for_kbit_training, \
    LoraConfig, get_peft_config, get_peft_model_state_dict, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from functools import *

# Setup functions
1. QLoRA config
2. PEFT config
3. PEFT model
4. pre-trained model (model, tokenizer)
5. training params
6. SFTTrainer
7. Alternative Trainer (transformers)
8. Dataset

In [None]:
# set up QLoRA config
def setup_4_bit_quant_config(params):
    """
    Build the 4-bit bitsandbytes quantization config used for QLoRA.

    :param params: mapping that provides ``load_in_4bit``, ``bnb_4bit_quant_type``
                   and ``bnb_4bit_use_double_quant``; it is only read, never modified
    :return: ``BitsAndBytesConfig`` whose compute dtype is ``torch.float16``
    """
    # The compute dtype is pinned to float16 no matter what ``params`` contains.
    # It lives in a local so the caller's config dict is not altered as a side effect.
    compute_dtype = torch.float16
    config = BitsAndBytesConfig(
        load_in_4bit=params['load_in_4bit'],
        bnb_4bit_quant_type=params['bnb_4bit_quant_type'],
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=params['bnb_4bit_use_double_quant']
    )
    return config

In [None]:
# peft config
def setup_peft_config(params):
    """
    Create the LoRA adapter configuration from the notebook parameters.

    :param params: mapping that provides ``alpha``, ``lora_dropout``, ``peft_r``,
                   ``peft_bias`` and ``task_type``
    :return: ``LoraConfig`` with ``inference_mode`` switched off
    """
    lora_kwargs = {
        'lora_alpha': params['alpha'],
        'lora_dropout': params['lora_dropout'],
        'r': params['peft_r'],
        'bias': params['peft_bias'],
        'task_type': params['task_type'],
    }
    # the config is built for training, so inference mode stays disabled
    return LoraConfig(inference_mode=False, **lora_kwargs)

In [None]:
# PEFT model
def setup_peft_model(model, peft_config):
    """
    Wrap a pre-trained model with the adapters described by a PEFT config.

    :param model: pre-trained model to wrap
    :param peft_config: PEFT configuration to apply
    :return: the object returned by ``get_peft_model``
    """
    # call new_model.print_trainable_parameters() on the result to inspect trainable params
    return get_peft_model(model, peft_config)

In [None]:
def setup_pretrained_model(model_name, bnb_config):
    """
    Load the tokenizer and the quantized base model and prepare the model for k-bit training.

    :param model_name: model id or path handed to ``from_pretrained``
    :param bnb_config: quantization config forwarded as ``quantization_config``
    :return: tuple ``(model, tokenizer)``
    """
    # A tokenizer holds no weights, so no torch dtype is passed to it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # if tokenizer.pad_token is None:
        # tokenizer.add_special_token({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token  # replace pad with eos token
    # tokenizer.add_eos_token = True

    # Quantization is described entirely by ``bnb_config``. The load_in_4bit /
    # load_in_8bit keyword arguments are deliberately NOT passed next to it:
    # recent transformers releases raise a ValueError when both forms are given.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=False,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )

    model.config.use_cache = False  # keep the generation cache off on the loaded config as well
    model.gradient_checkpointing_enable()  # enable gradient checkpointing on the model

    # Getting the model ready for k-bit training (per the original author's notes):
    #   - casts all the non k-bit modules to full precision (fp32) for stability
    #   - adds a forward hook to the input embedding layer to calculate the
    #     gradients of the input hidden states
    #   - enables gradient checkpointing for more memory-efficient training
    model = prepare_model_for_kbit_training(model)
    return model, tokenizer

In [None]:
def setup_training_params(params):
    """
    Build the ``TrainingArguments`` for the fine-tuning run.

    :param params: mapping that provides ``output_dir``, ``epochs``,
                   ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
                   ``optim``, ``save_steps``, ``logging_steps``, ``fp16``, ``bf16``,
                   ``max_grad_norm``, ``max_steps``, ``warmup_ratio``,
                   ``group_by_length`` and ``lr_scheduler_type``; it is only read,
                   never modified
    :return: Training arguments (transformers)
    """
    # The learning rate is fixed at 2e-4 regardless of any value in ``params``.
    # It is kept in a local so the caller's config dict is not altered.
    learning_rate = 2e-4
    train_params = TrainingArguments(
        output_dir=params["output_dir"],
        num_train_epochs=params["epochs"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        optim=params["optim"],
        save_steps=params["save_steps"],
        logging_steps=params["logging_steps"],
        learning_rate=learning_rate,
        fp16=params['fp16'],
        bf16=params['bf16'],
        max_grad_norm=params["max_grad_norm"],
        max_steps=params["max_steps"],
        warmup_ratio=params["warmup_ratio"],
        group_by_length=params["group_by_length"],
        lr_scheduler_type=params["lr_scheduler_type"],
        # report_to="wandb" if params["use_wandb"] else None,
        # run_name=params["wandb_run_name"] if params["use_wandb"] else None,
    )
    return train_params

In [None]:
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, peft_config, max_len, train_args):
    """
    Assemble a TRL ``SFTTrainer`` with packing enabled.

    :param model: LLM to fine-tune
    :param tokenizer: tokenizer belonging to the LLM
    :param train_dataset: dataset used for training
    :param eval_dataset: dataset used for evaluation
    :param peft_config: PEFT configuration forwarded to the trainer
    :param max_len: value passed as ``max_seq_length``
    :param train_args: training arguments passed as ``args``
    :return: SFT trainer
    """
    sft_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=max_len,
        args=train_args,
    )
    # fixed trainer options used by this notebook
    sft_kwargs.update(dataset_batch_size=32, packing=True)
    return SFTTrainer(**sft_kwargs)

In [None]:
# Transformers Trainer
def setup_transformers_trainer(model, train_data, args, collator, eval_data=None):
    """
    Build a plain transformers ``Trainer``.

    :param model: PEFT model
    :param train_data: train set
    :param args: training args
    :param collator: data collator
    :param eval_data: optional dev set; when omitted the trainer gets no evaluation data
    :return: transformers Trainer instance
    """
    trainer = Trainer(model=model, train_dataset=train_data, eval_dataset=eval_data,
                      args=args, data_collator=collator)
    return trainer

In [None]:
def training_dataset(dataset_url: str = None):
    """
    Load a JSON dataset through ``load_dataset``.

    :param dataset_url: location of the json file, passed as ``data_files``
    :return: whatever ``load_dataset("json", ...)`` returns for that file
    """
    return load_dataset("json", data_files=dataset_url)

# Design prompt template

In [None]:
def generate_prompt(point):
  """
  Render one dataset record as the Vietnamese instruction prompt.

  :param point: data point exposing the ``instruction``, ``input`` and ``output`` fields
  :return: the prompt text with the three fields filled in
  """
  prompt = f"""
  Bạn là trợ lý AI hữu ích. Hãy trả lời câu hỏi của người dùng một cách có logic nhất
  dưới đây ### Instruction: {point['instruction']} là sự hướng dẫn hoặc cũng có thể là input của người dùng
  hãy dựa vào đây để trả lời câu hỏi
  ### Input: {point['input']} cũng có thể là input của người dùng (ở đây có thể có hoặc không)
  ### Output: {point['output']} sẽ là kết quả của câu hỏi
  """
  return prompt

# Generate prompt from data point and tokenize them

In [None]:
# generate and tokenize prompt
def gen_tokenize(point, tokenizer: "AutoTokenizer", max_length=None):
  """
  Build the prompt for one data point and tokenize it.

  :param point: data point handed to ``generate_prompt``
  :param tokenizer: callable tokenizer (the annotation is quoted so it is not
                    evaluated when the function is defined)
  :param max_length: optional cap on the number of tokens; with the default ``None``
                     the tokenizer falls back to its own maximum length, and when it
                     has none it warns and does not truncate at all
  :return: the tokenizer output for the prompt
  """
  prompt = generate_prompt(point) # generate prompt based on data point

  # truncation=True only takes effect when a maximum length is known, so the
  # caller-supplied limit is forwarded explicitly
  tokenized_prompt = tokenizer(prompt, padding=True, truncation=True,
                               max_length=max_length)

  return tokenized_prompt

# Setup configs
Including quantization, PEFT, training arguments, and dataset

In [None]:
# setup params
import yaml

# Read the run configuration from config.yml; the context manager makes sure the
# file handle is closed once the YAML has been parsed.
with open('config.yml', 'r', encoding='utf8') as config_file:
    params = yaml.safe_load(config_file)

In [None]:
# URL of the JSON data file that is passed to training_dataset() as `dataset_url`.
links='https://raw.githubusercontent.com/VietnamAIHub/Vietnamese_LLMs/main/Generate_and_Translate_Dataset/Vietnamese_Instructions_datasets/Translation/Alpaca_52k/GPT_35_results/alpaca_translate_GPT_35_10_20k.json'

In [None]:
# setup
# Objects built in this cell, in order:
#   1. quantization config
#   2. PEFT config
#   3. training arguments
#   4. dataset
# (the pre-trained model itself is loaded in a later cell)
quant_configs = setup_4_bit_quant_config(params)
peft_config = setup_peft_config(params)
train_args = setup_training_params(params)
dataset = training_dataset(dataset_url=links)

Downloading data:   0%|          | 0.00/2.49M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the columns and row count of the loaded "train" split.
dataset['train']

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 9941
})

In [None]:
# setup model
# Load the checkpoint named by params['base_model'] together with its tokenizer,
# using quant_configs as the quantization config.
model, tokenizer = setup_pretrained_model(model_name=params['base_model'],
                                          bnb_config=quant_configs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [None]:
model.config.use_cache = False
model.config.pretraining_tp = 1
# The tokenizer attribute that controls where padding is added is `padding_side`;
# assigning to any other name (e.g. `padding_size`) only creates an unused attribute.
tokenizer.padding_side = 'right'

In [None]:
# Sum the element counts of every parameter tensor of the loaded model.
print("Number of parameters %d" % sum(weight.nelement() for weight in model.parameters()))

Number of parameters 3500412928


In [None]:
from peft import get_peft_model, get_peft_model_state_dict
# Wrap the base model through the notebook's own helper, which calls
# get_peft_model(model, peft_config).
new_model = setup_peft_model(model, peft_config)

In [None]:
# Print the trainable vs. total parameter counts of the PEFT-wrapped model.
new_model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035


# Split dataset eval and train

In [None]:
# Split the loaded records into train and test parts (test_size=0.3); the fixed
# seed makes the split repeatable.
# NOTE(review): the name `set` shadows Python's built-in `set` type for the rest of
# the session. Later cells read this variable, so renaming it needs a coordinated change.
set = dataset['train'].train_test_split(test_size=0.3, seed=42)

In [None]:
# Display the resulting train/test split.
set

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 6958
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 2983
    })
})

# Train and eval dataset

In [None]:
# Build the prompt for every training record and tokenize it with gen_tokenize.
# NOTE(review): the warning printed by this cell reports that no maximum length is
# available, so the requested truncation does not take effect.
set_train = set['train'].map(lambda sample: gen_tokenize(point=sample, tokenizer=tokenizer))

Map:   0%|          | 0/6958 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Same prompt + tokenization step for the held-out test split.
set_val = set['test'].map(lambda x: gen_tokenize(point=x, tokenizer=tokenizer))

Map:   0%|          | 0/2983 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Display the tokenized validation set (columns and row count).
set_val

Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 2983
})

In [None]:
# Display the tokenized training set (columns and row count).
set_train

Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 6958
})

# Setup trainer with Trainer(transformers)

In [None]:
from transformers import DataCollatorForLanguageModeling
# Transformer Trainer
# mlm=False: the collator is configured without masked-language-modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Only the tokenized training set is handed to the trainer here; set_val is not passed.
trainer = setup_transformers_trainer(model=new_model, train_data=set_train,
                                     args=train_args,
                                     collator=data_collator)

# Setup trainer with TRL

In [None]:
# self_trainer = setup_trainer(model=new_model, tokenizer=tokenizer, train_dataset=set_train,
#                              eval_dataset=set_val, peft_config=peft_config, max_len=500, train_args=train_args)

# Train

In [None]:
# Total element count over all parameters of the PEFT-wrapped model.
print(f"Number of parameters in the modified model: {sum(p.numel() for p in new_model.parameters())}")

Number of parameters in the modified model: 3533967360


In [None]:
# import torch
# model = torch.compile(model)
# NOTE(security): never keep API keys or tokens in notebook source or comments.
# A key-like 40-character hex string that was stored in a comment on the next line
# has been removed; treat it as leaked and revoke it.
trainer.train()
print('done')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss
10,1.0541
20,0.8728
30,0.6742
40,0.5148
50,0.4018
60,0.6278
70,0.6221
80,0.5434
90,0.4437
100,0.3166




done


In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!huggingface-cli whoami

nguyenanh2803


In [None]:
# Upload the adapter to the Hub. `token=True` uses the locally stored login
# credential; it replaces the deprecated `use_auth_token` argument.
new_model.push_to_hub('nguyenanh2803/llama2-finetuned-qlora', token=True)



adapter_model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nguyenanh2803/llama2-finetuned-qlora/commit/72f8ecb18a2f246649f7c3d14dc20c440e657df8', commit_message='Upload model', commit_description='', oid='72f8ecb18a2f246649f7c3d14dc20c440e657df8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Reload the published adapter on top of its base model for inference.
# Note that `model` and `tokenizer` are rebound here, replacing the objects used
# during training.
peft_model_id = 'nguyenanh2803/llama2-finetuned-qlora'
config = PeftConfig.from_pretrained(peft_model_id)
# NOTE(review): only load_in_4bit=True is given here, not the `quant_configs` object
# used for training - confirm the library's default 4-bit settings are intended.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_4bit=True, device_map='auto')
# tokenizer of base model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_model = PeftModel.from_pretrained(model, peft_model_id) # peft saved model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# The prompt is stored as `prompt_text` so Python's built-in input() is not shadowed.
prompt_text = "xin chào, dịch sang Tiếng Việt đi"
text = tokenizer(prompt_text, return_tensors='pt')
text

{'input_ids': tensor([[    1,   921,   262,   521, 30001, 29877, 29892,   270, 30740,   305,
         13625, 18439, 30717,   865, 10630, 30529, 29873, 29871, 30128, 29875]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# The tokenizer returned CPU tensors while the model was loaded with
# device_map='auto', so the encoded prompt is moved to the model's device first.
output_tokens = peft_model.generate(**text.to(peft_model.device), max_new_tokens=50)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 xin chào, dịch sang Tiếng Việt điện tử
 Unterscheidung zwischen "xin chào" und "chào bạn" ist wichtig, da es je nach Kontext und Situation unterschiedliche Ausdrücke für die Begrüßung gibt
