# Fine Tunning LLMs Lab

Just some quick lab for fine-tunning small models and estimating cost/quality. This lab assumes that you have "miniforge" installed and you're running a `ipykernel` from inside a `conda/mamba` environment.


In [1]:
%pip install accelerate transformers datasets evaluate peft trl flash-attention
%pip install -i https://pypi.org/simple/ bitsandbytes
%pip install flash-attn --no-build-isolation

Collecting flash-attention
  Using cached flash_attention-1.0.0-py3-none-any.whl.metadata (274 bytes)
Downloading flash_attention-1.0.0-py3-none-any.whl (31 kB)
Installing collected packages: flash-attention
Successfully installed flash-attention-1.0.0
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple/
Note: you may need to restart the kernel to use updated packages.
Collecting flash-attn
  Using cached flash_attn-2.5.8.tar.gz (2.5 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[24 lines of output][0m
  [31m   [0m No ROCm runtime is found, using ROCM_HOME='/usr'
  [31m   [0m fatal: not a git repository (or any parent up to mount point /)
  [31m   [0m Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
  [

## Defining hyper-parameters


In [1]:
from transformers import TrainingArguments
from peft import LoraConfig, TaskType

train_conf = TrainingArguments(**{
    "do_eval": True,

    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    
    "gradient_accumulation_steps": 1,
    "learning_rate": 5.0e-06,
        
    "bf16": True,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "warmup_ratio": 0.2,
})

peft_conf = LoraConfig(**{
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
    "target_modules": "all-linear",
    "modules_to_save": None,
})

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


## Setting up logging driver


In [2]:
import sys
import logging
import datasets
import transformers

logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

2024-05-03 10:54:37 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1

## Model loading


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model

max_seq_length = 2048

# checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"

model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    max_seq_length=max_seq_length,
    # device_map='auto'

    load_in_8bit=True,
    attn_implementation="flash_attention_2"
)

model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

tokenizer.model_max_length = max_seq_length
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

Checkout the model trainable parameters


In [5]:
base_model = get_peft_model(model, peft_conf)
base_model.print_trainable_parameters()

trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6542958532243159


## Data processing


In [28]:
from datasets import load_dataset

code_dataset = load_dataset('flytech/python-codes-25k', split='train')
code_dataset = code_dataset.train_test_split(
    test_size=0.2,
    seed=1337
)

train_dataset = code_dataset["train"]
test_dataset = code_dataset["test"]

column_names_to_remove = ["text"]

Overwrite dataset info from restored data version if exists.


2024-05-03 10:17:17 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.


Loading Dataset info from /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d


2024-05-03 10:17:17 - INFO - datasets.info - Loading Dataset info from /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d


Found cached dataset python-codes-25k (/home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d)


2024-05-03 10:17:17 - INFO - datasets.builder - Found cached dataset python-codes-25k (/home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d)


Loading Dataset info from /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d


2024-05-03 10:17:17 - INFO - datasets.info - Loading Dataset info from /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d


Loading cached split indices for dataset at /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d/cache-7e789adf0584c1fd.arrow and /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d/cache-3fe3dacd16827ea8.arrow


2024-05-03 10:17:17 - INFO - datasets.arrow_dataset - Loading cached split indices for dataset at /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d/cache-7e789adf0584c1fd.arrow and /home/monkey/.cache/huggingface/datasets/flytech___python-codes-25k/default/0.0.0/8101c0aff27202cd66472ea2234634218bcfac2d/cache-3fe3dacd16827ea8.arrow


In [7]:
%%time

def apply_chat_template(
    example,
    tokenizer,
):
    full_output = example["input"] + "\n" + example["output"] if "output" in example else example["input"]
    
    example["output"] = full_output

    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": example["instruction"]},
        {"role": "assistant", "content": full_output}
    ]

    example["messages"] = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    
    return example

processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names_to_remove,
    desc="Applying chat template to train_dataset",
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names_to_remove,
    desc="Applying chat template to test_dataset",
)

NameError: name 'train_dataset' is not defined

In [36]:
import json
from IPython.display import display_json

display_json(json.dumps(processed_train_dataset[0], indent=4), raw=True)

## Evaluation


In [45]:
import evaluate
from evaluate import evaluator

code_eval_metric = evaluate.load("code_eval")
task_evaluator = evaluator(
  task="text-generation",
)

initial_eval_results = task_evaluator.compute(
  model_or_pipeline=model,
  tokenizer=tokenizer,
  data=test_dataset,
  input_column="instruction",
  label_column="output",
  metric=code_eval_metric,
)

initial_eval_results



ValueError: Input length of input_ids is 48, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

## Training


In [7]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=train_conf,
    peft_config=peft_conf,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    max_seq_length=2048,
    dataset_text_field="messages",
    tokenizer=tokenizer,
    compute_metrics=task_evaluator,
    packing=True
)

train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

NameError: name 'processed_train_dataset' is not defined

## Training stats


In [40]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'start_gpu_memory' is not defined

## Evaluation


In [None]:
tokenizer.padding_side = 'left'
metrics = trainer.evaluate()
metrics["eval_samples"] = len(processed_test_dataset)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

## Save the model


In [None]:
trainer.save_model(train_conf.output_dir)