In [1]:
# # python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
# !pip install ninja packaging
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [2]:
!pip install accelerate -U
!pip install transformers -U
!pip install peft datasets trl bitsandbytes wandb



# SETUP config, model

In [3]:
# setup libs
import torch
from transformers import AutoTokenizer, \
    AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
# from transformers.utils import logging
import os
from peft import prepare_model_for_kbit_training, \
    LoraConfig, get_peft_config, get_peft_model_state_dict, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from functools import *

# Setup functions
1. QLoRA config
2. PEFT config
3. PEFT model
4. pre-trained model (model, tokenizer)
5. training params
6. SFTTrainer
7. Alternative Trainer (transformers)
8. Dataset

In [4]:
# set up QLoRA config
def setup_4_bit_quant_config(params):
    """Build the bitsandbytes quantization config used to load the base model.

    :param params: mapping providing 'load_in_4bit', 'bnb_4bit_quant_type' and
        'bnb_4bit_use_double_quant'. The mapping is only read, never modified.
    :return: a ``BitsAndBytesConfig`` instance
    """
    # The compute dtype is pinned to float16 regardless of any
    # 'bnb_4bit_compute_dtype' entry in ``params``. It lives in a local
    # variable so the caller's dict is not mutated as a hidden side effect.
    compute_dtype = torch.float16
    return BitsAndBytesConfig(
        load_in_4bit=params['load_in_4bit'],
        bnb_4bit_quant_type=params['bnb_4bit_quant_type'],
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=params['bnb_4bit_use_double_quant'],
    )

In [5]:
# peft config
def setup_peft_config(params):
    """Create the LoRA adapter configuration from the loaded settings.

    :param params: mapping with the keys 'alpha', 'lora_dropout', 'peft_r',
        'peft_bias' and 'task_type'
    :return: a ``LoraConfig`` with ``inference_mode`` switched off
    """
    lora_kwargs = {
        'lora_alpha': params['alpha'],
        'lora_dropout': params['lora_dropout'],
        'r': params['peft_r'],
        'bias': params['peft_bias'],
        'task_type': params['task_type'],
        # inference mode is explicitly disabled for this config
        'inference_mode': False,
    }
    return LoraConfig(**lora_kwargs)

In [6]:
# PEFT model
def setup_peft_model(model, peft_config):
    """
    Wrap a pre-trained model with PEFT adapters.

    :param model: the pre-trained model to wrap
    :param peft_config: the PEFT config to apply
    :return: the result of ``get_peft_model(model, peft_config)``
    """
    peft_model = get_peft_model(model, peft_config)
    # peft_model.print_trainable_parameters()  # uncomment to report trainable params
    return peft_model

In [7]:
def setup_pretrained_model(model_name, bnb_config):
    """Load the tokenizer and the quantized base model, prepared for k-bit training.

    :param model_name: model id or path handed to ``from_pretrained``
    :param bnb_config: quantization config forwarded as ``quantization_config``
    :return: ``(model, tokenizer)`` tuple
    """
    # ``torch_dtype`` is a model-loading option, so it is not passed to the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # if tokenizer.pad_token is None:
        # tokenizer.add_special_token({'pad_token': '[PAD]'})
    tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token for padding
    # tokenizer.add_eos_token = True

    # Quantization is driven solely by ``bnb_config``. Recent transformers
    # releases raise a ValueError when ``load_in_4bit``/``load_in_8bit`` are
    # passed together with ``quantization_config``, so those kwargs are omitted.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_cache=False,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
    # logging.info("model loaded in type", getattr(model, "is_loaded_in_4bit")) # logging info

    model.config.use_cache = False  # keep the KV cache disabled on the config as well
    model.gradient_checkpointing_enable()  # enable gradient checkpointing on the model

    # prepare_model_for_kbit_training (per the original author's notes):
    #   - casts the non k-bit modules to full precision (fp32) for stability
    #   - adds a forward hook on the input embeddings so gradients of the input
    #     hidden states are computed
    #   - enables gradient checkpointing for more memory-efficient training
    model = prepare_model_for_kbit_training(model)
    return model, tokenizer

In [8]:
def setup_training_params(params):
    """
    Build the ``TrainingArguments`` used for fine-tuning.

    :param params: mapping with the training settings read below
        ('output_dir', 'epochs', 'per_device_train_batch_size', ...).
        The mapping is only read, never modified.
    :return: transformers ``TrainingArguments``
    """
    # The learning rate is pinned to 2e-4, overriding any 'learning_rate'
    # entry in ``params``. It is held in a local variable so the caller's
    # dict is not mutated as a hidden side effect.
    learning_rate = 2e-4
    train_params = TrainingArguments(
        output_dir=params["output_dir"],
        num_train_epochs=params["epochs"],
        per_device_train_batch_size=params["per_device_train_batch_size"],
        gradient_accumulation_steps=params["gradient_accumulation_steps"],
        optim=params["optim"],
        save_steps=params["save_steps"],
        logging_steps=params["logging_steps"],
        learning_rate=learning_rate,
        fp16=params['fp16'],
        bf16=params['bf16'],
        max_grad_norm=params["max_grad_norm"],
        max_steps=params["max_steps"],
        warmup_ratio=params["warmup_ratio"],
        group_by_length=params["group_by_length"],
        lr_scheduler_type=params["lr_scheduler_type"],
        # report_to="wandb" if params["use_wandb"] else None,
        # run_name=params["wandb_run_name"] if params["use_wandb"] else None,
    )
    return train_params

In [9]:
def setup_trainer(model, tokenizer, train_dataset, eval_dataset, peft_config, max_len, train_args):
    """
    Assemble an ``SFTTrainer`` from the prepared pieces.

    :param model: the language model to fine-tune
    :param tokenizer: tokenizer belonging to the model
    :param train_dataset: dataset used for training
    :param eval_dataset: dataset used for evaluation
    :param peft_config: PEFT config handed to the trainer
    :param max_len: value forwarded as ``max_seq_length``
    :param train_args: training arguments forwarded as ``args``
    :return: the constructed ``SFTTrainer``
    """
    sft_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        max_seq_length=max_len,
        args=train_args,
        dataset_batch_size=32,
    )
    return SFTTrainer(**sft_kwargs)

In [10]:
# Transformers Trainer
def setup_transformers_trainer(model, train_data, args, collator):
    """
    Construct a plain transformers ``Trainer``.

    :param model: model to train (the PEFT-wrapped model in this notebook)
    :param train_data: training set
    :param args: training arguments
    :param collator: data collator used to build batches
    :return: the constructed ``Trainer``
    """
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        data_collator=collator,
    )

In [11]:
def training_dataset(dataset_url: str = None):
    """
    Load a JSON dataset through ``load_dataset``.

    :param dataset_url: location of the JSON file, forwarded as ``data_files``
    :return: whatever ``load_dataset("json", data_files=dataset_url)`` returns
    """
    return load_dataset("json", data_files=dataset_url)

# Design prompt template

In [12]:
def generate_prompt(point):
  """
  Render one data point into the Vietnamese instruction prompt.

  :param point: mapping with the keys 'instruction', 'input' and 'output'
  :return: the formatted prompt string
  """
  instruction = point['instruction']
  user_input = point['input']
  answer = point['output']
  return f"""
  Bạn là trợ lý AI hữu ích. Hãy trả lời câu hỏi của người dùng một cách có logic nhất
  dưới đây ### Instruction: {instruction} là sự hướng dẫn hoặc cũng có thể là input của người dùng
  hãy dựa vào đây để trả lời câu hỏi
  ### Input: {user_input} cũng có thể là input của người dùng (ở đây có thể có hoặc không)
  ### Output: {answer} sẽ là kết quả của câu hỏi
  """

# Generate a prompt from each data point and tokenize it

In [13]:
# generate and tokenize prompt
def gen_tokenize(point, tokenizer):
  """
  Build the prompt for one data point and run it through the tokenizer.

  :param point: data point handed to ``generate_prompt``
  :param tokenizer: callable tokenizer, invoked with padding and truncation on
  :return: the tokenizer's output for the generated prompt
  """
  return tokenizer(generate_prompt(point), padding=True, truncation=True)

# Set up configs
Including quantization, PEFT, training arguments, and the dataset

In [14]:
# setup params
import yaml

# Read the run configuration; the context manager closes the file handle
# once parsing is done instead of leaving it open.
with open('config.yml', 'r', encoding='utf8') as config_file:
    params = yaml.safe_load(config_file)

In [15]:
# Location of the JSON dataset file; passed to training_dataset() as dataset_url.
links='https://raw.githubusercontent.com/VietnamAIHub/Vietnamese_LLMs/main/Generate_and_Translate_Dataset/Vietnamese_Instructions_datasets/Translation/Alpaca_52k/GPT_35_results/alpaca_translate_GPT_35_10_20k.json'

In [16]:
# setup: build the configuration objects and load the data
#   - quantization config
#   - PEFT config
#   - training arguments
#   - dataset
quant_configs = setup_4_bit_quant_config(params)
peft_config = setup_peft_config(params)
train_args = setup_training_params(params)
dataset = training_dataset(dataset_url=links)

In [17]:
# dataset['train'] = dataset['train'].shuffle(seed=42).select([i for i in range(1000)])
# len(dataset['train'])

In [18]:
dataset['train']

Dataset({
    features: ['instruction', 'output', 'input'],
    num_rows: 9941
})

In [19]:
# setup model
# Loads the base model named by params['base_model'] together with its tokenizer,
# using the quantization config built in the setup cell.
model, tokenizer = setup_pretrained_model(model_name=params['base_model'],
                                          bnb_config=quant_configs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [20]:
model.config.use_cache = False
model.config.pretraining_tp = 1
# The tokenizer attribute is `padding_side` (not `padding_size`); assigning to
# the misspelled name only creates an unused attribute and leaves padding unchanged.
tokenizer.padding_side = 'right'

In [21]:
# count every element across all parameter tensors of the loaded model
n_params = sum(p.nelement() for p in model.parameters())
print("Number of parameters %d" % n_params)

Number of parameters 3500412928


In [22]:
from peft import get_peft_model, get_peft_model_state_dict
# Go through the notebook's own helper so the adapter wrapping lives in one place.
new_model = setup_peft_model(model, peft_config)

In [23]:
new_model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035


# Split the dataset into train and eval sets

In [24]:
# 70/30 train/test split with a fixed seed.
# NOTE(review): the name `set` shadows the Python builtin; a later cell reads
# set['train'], so renaming it has to be done in both places at once.
set = dataset['train'].train_test_split(test_size=0.3, seed=42)

# Train and eval dataset

In [25]:
# keep shard 0 of 50 from the training split, then tokenize every sample in it
train_shard = set['train'].shard(num_shards=50, index=0)
set_train = train_shard.map(lambda sample: gen_tokenize(point=sample, tokenizer=tokenizer))
            # .filter(lambda sample: sample['instruction'] != '' and sample['input'] != '' and sample['output'] != '') \
            # .shuffle()

In [26]:
set_train

Dataset({
    features: ['instruction', 'output', 'input', 'input_ids', 'attention_mask'],
    num_rows: 140
})

# Set up the trainer with the transformers `Trainer`

In [27]:
# from trl import DataCollatorForCompletionOnlyLM, DataCollatorForLanguageModeling
# # instruction_template = "### Human:"
# # response_template = "### Assistant:"
# # collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

In [28]:
from transformers import DataCollatorForLanguageModeling
# Transformer Trainer
# The collator is created with masked-language-modeling turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): only the training shard is passed; setup_transformers_trainer takes
# no eval dataset, so the test split created earlier is not used here — confirm intended.
trainer = setup_transformers_trainer(model=new_model, train_data=set_train,
                                     args=train_args,
                                     collator=data_collator)

# Train

In [29]:
# import gc
# import torch
# gc.collect()
# torch.cuda.empty_cache()
# torch.cuda.memory_summary(device=None, abbreviated=False)

In [30]:
# total element count over all parameters of the PEFT-wrapped model
modified_param_count = sum(p.numel() for p in new_model.parameters())
print(f"Number of parameters in the modified model: {modified_param_count}")

Number of parameters in the modified model: 3533967360


In [31]:
# !pip install bitsandbytes

In [32]:
# import torch
# model = torch.compile(model)
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mcunho2803032003[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.2109
20,0.8609
30,0.6448
40,0.5483
50,0.5058
60,0.4763
70,0.4278
80,0.4408
90,0.4324
100,0.4471




TrainOutput(global_step=100, training_loss=0.5995104360580444, metrics={'train_runtime': 1274.5894, 'train_samples_per_second': 0.314, 'train_steps_per_second': 0.078, 'total_flos': 9621904209174528.0, 'train_loss': 0.5995104360580444, 'epoch': 2.86})