In [None]:
# Show the NVIDIA GPU(s) visible to this runtime before doing any work.
!nvidia-smi

# Fine-Tuning LLMs Lab

A quick lab for fine-tuning `Phi-3-mini-4k`. This notebook uses libraries from the Hugging Face (HF) ecosystem. Furthermore, I applied some optimizations to reduce the memory footprint of training.

I used a combination of unsloth + 4-bit loading + QLoRA to achieve the best performance possible.


In [None]:
# Install unsloth from its Git repository (with the `colab-new` extra), then the
# rest of the training stack.
# NOTE(review): apart from the xformers upper bound nothing here is version-pinned,
# so a fresh run may resolve different versions than the ones this notebook was
# written against. Also note that `--no-deps` applies to every package on that line.
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps "xformers<0.0.26" transformers datasets evaluate peft trl accelerate bitsandbytes

## Defining hyper-parameters


In [None]:
import torch
from transformers import TrainingArguments

# All training hyper-parameters live in this single TrainingArguments object,
# which is later handed to the trainer and reused for the log level and the
# output directory.
train_conf = TrainingArguments(
    # Evaluation is not run as part of the training loop.
    do_eval=False,

    # Batch sizes.
    # NOTE: This used to be 4 but it was too slow, perhaps I increased it too much?
    per_device_eval_batch_size=32,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,

    # Optimisation schedule.
    learning_rate=5.0e-06,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    # NOTE(review): the original author believed "adamw_8bit" is the default
    # optimizer; that is not my understanding, so it is kept explicit - confirm.
    optim="adamw_8bit",

    # Training length: a single epoch, with no step cap (-1).
    num_train_epochs=1,
    max_steps=-1,

    # Logging.
    log_level="info",
    logging_strategy="steps",
    logging_steps=20,

    # Checkpointing: save every 100 steps and keep only one checkpoint.
    save_steps=100,
    save_total_limit=1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,

    # Reproducibility.
    seed=1337,

    # Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),

    remove_unused_columns=True,

    # Gradient checkpointing, using the non-reentrant implementation.
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

## Setting up logging driver


In [None]:
import sys
import logging
import datasets
import transformers

# Send every log record to stdout so it shows up in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[stdout_handler],
)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

# Use the level configured in the training arguments for our logger and for
# both Hugging Face libraries, so all three agree on verbosity.
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
for hf_logging in (datasets.utils.logging, transformers.utils.logging):
    hf_logging.set_verbosity(log_level)

# Turn on transformers' default handler and explicit record format.
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

## Model loading


In [None]:
from unsloth import FastLanguageModel

# Maximum sequence length; reused for the model, the tokenizer and the trainer.
max_seq_length = 2048

# Load the base model and its tokenizer through unsloth, with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3-mini-4k-instruct",
    max_seq_length = max_seq_length,
    dtype = None, # NOTE: This would automatically be set to torch.bfloat16 if supported
    load_in_4bit = True  # 4-bit loading, to reduce the memory footprint
)

# Align the tokenizer with the sequence length chosen above.
tokenizer.model_max_length = max_seq_length
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
# Keep the pad token id in sync with the pad token that was just assigned.
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# NOTE(review): padding is 'right' here for training and is switched to 'left' in the
# evaluation cell; presumably because batched generation with decoder-only models
# wants left padding, while training is fine with right padding - confirm.
tokenizer.padding_side = 'right'

# Wrap the base model with LoRA adapters; from here on `model` is the PEFT-wrapped
# model that gets trained.
model = FastLanguageModel.get_peft_model(
    model,

    r = 16, # NOTE: Perhaps I can lower this to reduce the trainable params.
    # Attention and MLP projection layers that receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized

    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,

    # NOTE: Perhaps I can optimize further with these settings?
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Check out the model's trainable parameters.


In [None]:
# Report how many parameters are trainable now that the LoRA adapters are attached.
model.print_trainable_parameters()

## Data processing


In [None]:
from datasets import load_dataset

# Fetch the dataset's train split and carve out a reproducible 80/20
# train/test partition (fixed seed).
code_dataset = load_dataset(
    'flytech/python-codes-25k', split='train'
).train_test_split(test_size=0.2, seed=1337)

train_dataset, test_dataset = code_dataset["train"], code_dataset["test"]

# Columns dropped from the datasets when the chat template is applied below.
column_names_to_remove = ["text"]

We need to massage the dataset in order to match the desired input for the model.


In [None]:
%%time

def apply_chat_template(
    example,
    tokenizer,
):
    """Turn one dataset record into a chat-formatted training string.

    The assistant reply is the record's ``input`` followed by a newline and
    its ``output`` (or just ``input`` when the record has no ``output`` key).
    That reply is written back to ``example["output"]``, and the rendered
    conversation (empty system prompt, ``instruction`` as the user turn, the
    reply as the assistant turn) is stored under ``example["messages"]``.

    Args:
        example: Mapping with ``instruction`` and ``input`` keys and,
            optionally, ``output``. It is modified in place.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=...,
            add_generation_prompt=...)``.

    Returns:
        The same ``example`` mapping, with ``output`` and ``messages`` set.
    """
    if "output" in example:
        assistant_reply = example["input"] + "\n" + example["output"]
    else:
        assistant_reply = example["input"]

    example["output"] = assistant_reply

    conversation = [
        {"role": "system", "content": ""},
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": assistant_reply},
    ]

    # Render to plain text (no tokenization) and without a trailing generation
    # prompt, since the assistant turn is already part of the conversation.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
    )
    example["messages"] = rendered

    return example

# Both splits are processed identically; only the progress-bar description differs.
shared_map_kwargs = {
    "fn_kwargs": {"tokenizer": tokenizer},
    "num_proc": 10,
    "remove_columns": column_names_to_remove,
}

processed_train_dataset = train_dataset.map(
    apply_chat_template,
    desc="Applying chat template to train_dataset",
    **shared_map_kwargs,
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    desc="Applying chat template to test_dataset",
    **shared_map_kwargs,
)

And this is the final shape of our dataset. Now we're ready for training.


In [None]:
import json

# Pretty-print the first processed training record to inspect its final layout.
print(json.dumps(processed_train_dataset[0], indent=4))

## Evaluation metric


In [None]:
import evaluate

# Load the `code_eval` metric from the `evaluate` library.
# NOTE(review): `code_eval_metric` is only referenced from commented-out code in
# this notebook - confirm it is still needed.
code_eval_metric = evaluate.load("code_eval")

## Training


In [None]:
#@title Show current memory stats
import torch

# Conversion factor used to report byte counts in GB (1024^3 bytes).
BYTES_PER_GB = 1024 ** 3

# Snapshot of device 0 taken before training; `start_gpu_memory` and
# `max_memory` are reused by the final stats cell to compute the deltas.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
from trl import SFTTrainer

# NOTE: We only compute metrics before and after the model is trained.
# import numpy as np
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return code_eval_metric.compute(predictions=predictions, references=labels)

# Build the supervised fine-tuning trainer.
#
# `model` has already been wrapped with LoRA adapters by
# `FastLanguageModel.get_peft_model`, so no `peft_config` is passed here. The
# previous `peft_config=peft_conf` argument referenced a name that is never
# defined in this notebook and raised a NameError on a fresh run.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_conf,

    train_dataset=processed_train_dataset,
    # Register the held-out split so the argument-less `trainer.evaluate()` call
    # made after training has data to score. `do_eval=False` in `train_conf`
    # should still keep evaluation out of the training loop itself.
    eval_dataset=processed_test_dataset,
    # Column holding the chat-formatted text produced by `apply_chat_template`.
    dataset_text_field="messages",
    # compute_metrics=compute_metrics,

    max_seq_length=max_seq_length,
    dataset_num_proc=2,

    packing=False,
)

# Run training, then log and persist the training metrics and trainer state.
train_stats = trainer.train()
metrics = train_stats.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

## Training stats


In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory after training, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Portion of that peak attributable to training: peak minus the amount already
# reserved before training started (`start_gpu_memory`).
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the device's total memory (`max_memory`).
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# The training result is bound to `train_stats` in the training cell; the
# previous `trainer_stats` name was never defined and raised a NameError.
train_runtime = train_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Evaluation


In [None]:
# Switch the tokenizer to left padding for evaluation (it was 'right' for training).
# NOTE(review): presumably intended for batched generation with a decoder-only
# model - confirm it matters for the loss-based `evaluate()` call below.
tokenizer.padding_side = 'left'

# NOTE(review): `evaluate()` is called without arguments, so it depends on an
# evaluation dataset having been registered on the trainer - confirm one was provided.
# This also rebinds `metrics`, which previously held the training metrics.
metrics = trainer.evaluate()
# Record the size of the held-out split alongside the reported metrics.
metrics["eval_samples"] = len(processed_test_dataset)

# Log and persist the evaluation metrics under the "eval" split name.
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

## Save the model


In [None]:
# Save the trained model into the output directory configured in `train_conf`
# ("./checkpoint_dir").
trainer.save_model(train_conf.output_dir)