In [None]:
# # python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# !pip install ninja packaging
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [None]:
!pip install accelerate -U
!pip install transformers -U
!pip install peft datasets trl bitsandbytes wandb



# SETUP config, model

In [None]:
# setup libs
import torch
from transformers import AutoTokenizer, \
    AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
# from transformers.utils import logging
import os
from peft import prepare_model_for_kbit_training, \
    LoraConfig, get_peft_config, get_peft_model_state_dict, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from functools import *

# Setup functions
1. QLoRA config
2. PEFT config
3. PEFT model
4. pre-trained model (model, tokenizer)
5. training params
6. SFTTrainer
7. Alternative Trainer (transformers)
8. Dataset

In [None]:
# set up QLoRA config
def setup_4_bit_quant_config(params, compute_dtype=None):
    """
    Build the bitsandbytes 4-bit quantization config used for QLoRA.

    :param params: mapping providing 'load_in_4bit', 'bnb_4bit_quant_type' and
                'bnb_4bit_use_double_quant'. It is only read; the caller's
                mapping is never modified.
    :param compute_dtype: dtype used for computation on the quantized weights.
                Defaults to torch.float16, the value this function always used.
    :return: BitsAndBytesConfig
    :raises KeyError: if one of the required keys is missing from params
    """
    # Resolve the default lazily so the signature does not depend on torch
    # being importable at definition time.
    if compute_dtype is None:
        compute_dtype = torch.float16

    # The dtype is passed straight to the config instead of being written back
    # into `params`, so the shared settings dict stays exactly as it was loaded.
    return BitsAndBytesConfig(
        load_in_4bit=params['load_in_4bit'],
        bnb_4bit_quant_type=params['bnb_4bit_quant_type'],
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=params['bnb_4bit_use_double_quant'],
    )

In [None]:
# peft config
def setup_peft_config(params):
    """
    Create the LoRA adapter configuration from the notebook settings.

    :param params: mapping providing 'alpha', 'lora_dropout', 'peft_r',
                'peft_bias' and 'task_type'
    :return: LoraConfig with inference mode switched off
    """
    lora_kwargs = {
        'lora_alpha': params['alpha'],
        'lora_dropout': params['lora_dropout'],
        'r': params['peft_r'],
        'bias': params['peft_bias'],
        'task_type': params['task_type'],
        # the config is built for training, not for inference
        'inference_mode': False,
    }
    return LoraConfig(**lora_kwargs)

In [None]:
# PEFT model
def setup_peft_model(model, peft_config):
    """
    Wrap a pre-trained model with the adapters described by a PEFT config.

    :param model: pre-trained model to wrap
    :param peft_config: PEFT config to apply
    :return: the object returned by get_peft_model
    """
    peft_model = get_peft_model(model, peft_config)
    # peft_model.print_trainable_parameters() can be called on the result
    # to inspect how many parameters are trainable.
    return peft_model

In [None]:
def setup_pretrained_model(model_name, bnb_config):
    """
    Load the tokenizer and the 4-bit quantized base model, prepared for k-bit training.

    :param model_name: model id or path forwarded to ``from_pretrained``
    :param bnb_config: quantization config forwarded as ``quantization_config``;
                when None, a plain 4-bit BitsAndBytesConfig is used so the model
                is still loaded in 4-bit as before
    :return: tuple (model, tokenizer)
    """
    # torch_dtype is a model-loading option, so it is not passed to the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse the EOS token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    # Quantization is described by exactly one object. Passing load_in_4bit /
    # load_in_8bit next to quantization_config is redundant, and recent
    # transformers releases reject that combination with a ValueError.
    if bnb_config is None:
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=False,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )

    # Turn the generation cache off on the config and enable gradient
    # checkpointing before handing the model to PEFT's k-bit preparation step.
    model.config.use_cache = False
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    return model, tokenizer

In [None]:
def setup_training_params(params, learning_rate=2e-4):
    """
    Build the transformers TrainingArguments from the notebook settings.

    :param params: mapping providing 'output_dir', 'epochs',
                'per_device_train_batch_size', 'gradient_accumulation_steps',
                'optim', 'save_steps', 'logging_steps', 'fp16', 'bf16',
                'max_grad_norm', 'max_steps', 'warmup_ratio', 'group_by_length'
                and 'lr_scheduler_type'. It is only read; the caller's mapping
                is never modified.
    :param learning_rate: optimizer learning rate. Defaults to 2e-4, the value
                this function always used regardless of the settings mapping.
    :return: TrainingArguments
    :raises KeyError: if one of the required keys is missing from params
    """
    # The learning rate is passed as an argument instead of being written into
    # `params`, so building the arguments has no side effect on shared settings.
    return TrainingArguments(
        output_dir=params["output_dir"],
        num_train_epochs=params["epochs"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        optim=params["optim"],
        save_steps=params["save_steps"],
        logging_steps=params["logging_steps"],
        learning_rate=learning_rate,
        fp16=params['fp16'],
        bf16=params['bf16'],
        max_grad_norm=params["max_grad_norm"],
        max_steps=params["max_steps"],
        warmup_ratio=params["warmup_ratio"],
        group_by_length=params["group_by_length"],
        lr_scheduler_type=params["lr_scheduler_type"],
        # Experiment tracking is currently disabled:
        # report_to="wandb" if params["use_wandb"] else None,
        # run_name=params["wandb_run_name"] if params["use_wandb"] else None,
    )

In [None]:
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, peft_config, max_len, train_args):
    """
    Assemble a TRL SFTTrainer for supervised fine-tuning.

    :param model: model to fine-tune
    :param tokenizer: tokenizer paired with the model
    :param train_dataset: dataset used for training
    :param eval_dataset: dataset used for evaluation
    :param peft_config: PEFT config handed to the trainer
    :param max_len: value forwarded as max_seq_length
    :param train_args: training arguments forwarded as args
    :return: SFTTrainer
    """
    sft_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=max_len,
        args=train_args,
        # fixed batch size forwarded as dataset_batch_size
        dataset_batch_size=32,
    )
    return SFTTrainer(**sft_kwargs)

In [None]:
# Transformers Trainer
def setup_transformers_trainer(model, train_data, args, collator):
    """
    Assemble a plain transformers Trainer (alternative to the SFT trainer).

    :param model: model to train (the PEFT-wrapped model in this notebook)
    :param train_data: training set
    :param args: training arguments
    :param collator: data collator used to build batches
    :return: Trainer
    """
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        data_collator=collator,
    )

In [None]:
def training_dataset(dataset_url: str = None):
    """
    Load a JSON dataset through ``datasets.load_dataset``.

    :param dataset_url: location of the JSON file, forwarded as data_files
    :return: whatever load_dataset returns for the "json" builder
    """
    return load_dataset("json", data_files=dataset_url)

# Design prompt template

In [None]:
def generate_prompt(point):
  """
  Render one dataset record into the Vietnamese instruction prompt.

  :param point: record with the keys 'instruction', 'input' and 'output'
  :return: the prompt text; the template's own leading newline and
           two-space line indentation are part of the returned string
  """
  prompt = f"""
  Bạn là trợ lý AI hữu ích. Hãy trả lời câu hỏi của người dùng một cách có logic nhất
  dưới đây ### Instruction: {point['instruction']} là sự hướng dẫn hoặc cũng có thể là input của người dùng
  hãy dựa vào đây để trả lời câu hỏi
  ### Input: {point['input']} cũng có thể là input của người dùng (ở đây có thể có hoặc không)
  ### Output: {point['output']} sẽ là kết quả của câu hỏi
  """
  return prompt

# Generate a prompt from each data point and tokenize it

In [None]:
# generate and tokenize prompt
def gen_tokenize(point, tokenizer: AutoTokenizer, max_length: int = None):
  """
  Build the prompt for one data point and tokenize it.

  :param point: record passed to generate_prompt
  :param tokenizer: tokenizer to call on the prompt
  :param max_length: optional cap forwarded to the tokenizer. truncation=True
         needs a maximum length to act on; when this is None the tokenizer
         falls back to its own default, which is the previous behavior.
  :return: the tokenizer output for the prompt
  """
  prompt = generate_prompt(point)  # prompt text for this data point

  # padding=True is kept from the original call.
  # NOTE(review): with one prompt per call there is presumably nothing to pad
  # against, so batch padding is left to the data collator - confirm.
  return tokenizer(prompt, padding=True, truncation=True, max_length=max_length)

# Setup configs
Including quantization, PEFT, training arguments, and the dataset

In [None]:
# setup params
import yaml

# Read the run settings from config.yml. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
with open('config.yml', 'r', encoding='utf8') as config_file:
    params = yaml.safe_load(config_file)

In [None]:
# Location of the JSON file that is loaded as the training dataset.
links = (
    'https://raw.githubusercontent.com/VietnamAIHub/Vietnamese_LLMs/main/'
    'Generate_and_Translate_Dataset/Vietnamese_Instructions_datasets/'
    'Translation/Alpaca_52k/GPT_35_results/alpaca_translate_GPT_35_10_20k.json'
)

In [None]:
# setup
# Build the configuration objects from `params`, then load the dataset:
#   - quantization config
#   - PEFT config
#   - training arguments
#   - dataset read from `links`
quant_configs = setup_4_bit_quant_config(params)
peft_config = setup_peft_config(params)
train_args = setup_training_params(params)
dataset = training_dataset(links)

In [None]:
# Show the 'train' split of the loaded dataset (features and row count).
dataset['train']

Dataset({
    features: ['input', 'instruction', 'output'],
    num_rows: 9941
})

In [None]:
# setup model
# Load the base model named in params['base_model'] together with its
# tokenizer, using the quantization config built above.
model, tokenizer = setup_pretrained_model(model_name=params['base_model'],
                                          bnb_config=quant_configs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
# Keep the cache disabled on the model config and set pretraining_tp to 1.
model.config.use_cache = False
model.config.pretraining_tp = 1
# `padding_side` is the tokenizer attribute that selects where padding is
# added. The earlier `padding_size` spelling only created an unused attribute,
# so the intended right-padding was never actually configured.
tokenizer.padding_side = 'right'

In [None]:
# Total number of elements across the loaded model's parameter tensors.
print("Number of parameters %d" % sum(weight.nelement() for weight in model.parameters()))

Number of parameters 3500412928


In [None]:
from peft import get_peft_model, get_peft_model_state_dict
# Wrap the base model with the PEFT adapters through the notebook's own
# setup_peft_model helper, which calls get_peft_model.
new_model = setup_peft_model(model, peft_config)

In [None]:
# Print the trainable / total parameter summary of the PEFT-wrapped model.
new_model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035


# Split dataset into train and eval sets

In [None]:
# Hold out 30% of the rows as a 'test' split; the fixed seed keeps the split repeatable.
# NOTE(review): the name `set` shadows the Python builtin for the rest of the
# notebook. Later cells read this name, so rename it everywhere in one change.
set = dataset['train'].train_test_split(test_size=0.3, seed=42)

In [None]:
# Show the resulting train/test splits.
set

DatasetDict({
    train: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 6958
    })
    test: Dataset({
        features: ['input', 'instruction', 'output'],
        num_rows: 2983
    })
})

# Train and eval dataset

In [None]:
def _tokenize_sample(sample):
    """Build the prompt for one training record and tokenize it with the notebook tokenizer."""
    return gen_tokenize(point=sample, tokenizer=tokenizer)


# Tokenize the training split record by record; the 'test' split is left untouched here.
set_train = set['train'].map(_tokenize_sample)

Map:   0%|          | 0/6958 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Show the tokenized training split.
set_train

# Setup trainer with Trainer(transformers)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Transformer Trainer
# Collator created with masked-language-modeling switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train the PEFT-wrapped model on the tokenized training split.
trainer = setup_transformers_trainer(
    new_model,
    set_train,
    train_args,
    data_collator,
)

# Train

In [None]:
# Total number of elements across the parameter tensors of the PEFT-wrapped model.
# NOTE(review): this differs from the "all params" figure reported by
# print_trainable_parameters(); presumably the quantized weights are stored
# packed - confirm before quoting either number.
print(f"Number of parameters in the modified model: {sum(p.numel() for p in new_model.parameters())}")

Number of parameters in the modified model: 3533967360


In [None]:
# Optional, currently disabled: compile the model before training.
# import torch
# model = torch.compile(model)
# Run fine-tuning with the Trainer built above; 'done' is printed once train() returns.
# NOTE(review): the recorded output warns that checkpoint directories under
# ../saved_models already existed - consider a fresh output_dir per run.
trainer.train()
print('done')



Step,Training Loss
10,1.0533
20,0.8715
30,0.6747
40,0.5157
50,0.4022
60,0.6275
70,0.6218
80,0.5436
90,0.4442
100,0.3172


Checkpoint destination directory ../saved_models/checkpoint-10 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-30 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-40 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-50 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-60 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ../saved_models/checkpoint-70 already exists and is non-empty.Saving will

done
