# Instruction fine-tuning a Llama-2 model on generating Python code

## Installing and loading the libraries

In [1]:
!pip install -q transformers datasets peft accelerate bitsandbytes trl safetensors ipywidgets huggingface_hub scipy -U

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Obtaining dependency information for peft from https://files.pythonhosted.org/packages/8b/1b/aee2a330d050c493642d59ba6af51f3910cb138ea48ede228c84c204a5af/peft-0.7.1-py3-none-any.whl.metadata
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Coll

In [3]:
from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer



## Setting Global Parameters

In [4]:
# Base model to fine-tune (Hugging Face Hub id)
model_id = "codellama/CodeLlama-7b-Instruct-hf"
# Hub ids of the two datasets: the raw CodeContests problems (source of the
# problem descriptions) and the prompt dataset that is actually trained on
codecontest_name = "deepmind/code_contests"
prompt_dataset_name = "HoangLe1312/codecontest-prompt"
# Dataset split loaded for both datasets
dataset_split = "train"
# Fine-tuned model name (also used as the local output directory below).
# NOTE(review): the merge cell later rebinds `new_model` to the reloaded PEFT
# model object, so this string is only reliable before that cell runs.
new_model = "llama-2-7b-int4-codeforces-20k"
# Hugging Face repository the merged model and tokenizer are pushed to
hf_model_repo = "HoangLe1312/" + new_model
# Device placement passed to from_pretrained. "auto" delegates placement to the
# library; pinning everything to GPU 0 would be {"": 0} instead.
device_map = "auto"

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype, resolved with getattr(torch, ...))
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_double_nested_quant = False

################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension (rank)
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.05

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = new_model
# Number of training epochs
num_train_epochs = 2
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False
# Effective batch size per optimizer step; only used to derive
# gradient_accumulation_steps from the per-device batch size below
batch_size = 128
per_device_train_batch_size = 1
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = batch_size // per_device_train_batch_size
# Enable gradient checkpointing
gradient_checkpointing = False
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer); 1e-5 was a previously tried value
learning_rate = 3e-4 #1e-5
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "constant"
# Number of training steps (would override num_train_epochs).
# Not passed to TrainingArguments: the kwarg is commented out in that cell.
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate).
# NOTE(review): with lr_scheduler_type "constant" this presumably has no effect
# ("constant_with_warmup" would be needed) - confirm.
warmup_ratio = 0.03
# Group sequences into batches with same length
# (saves memory and speeds up training considerably); currently disabled
group_by_length = False
# Save checkpoint every X updates steps.
# Not passed to TrainingArguments: the kwarg is commented out and
# save_strategy="epoch" is used instead.
save_steps = 0
# Log every X updates steps
logging_steps = 25
# Disable tqdm progress bars during training
disable_tqdm = True

################################################################################
# SFTTrainer parameters
################################################################################
# Maximum sequence length to use
max_seq_length = 512
# Pack multiple short examples in the same input sequence to increase efficiency
packing = True


## Connect to Huggingface Hub

You can log in to Hugging Face Hub interactively

Or you can provide .env file containing the Hugging Face token

In [5]:
import os

from huggingface_hub import login

# Log in to the Hugging Face Hub without embedding a credential in the notebook.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# `token=None` makes `login` ask for it interactively.
# SECURITY: the access token that used to be hard-coded in this cell is exposed
# in the notebook's history and must be revoked on the Hub.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load the dataset with the instruction set

In [6]:
# Fetch the raw CodeContests problems from the Hub.
# download_mode='force_redownload' bypasses the local cache, so every run
# downloads the full dataset again.
codecontest = load_dataset(
    codecontest_name,
    split=dataset_split,
    download_mode='force_redownload',
)

Downloading readme:   0%|          | 0.00/13.0k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/39 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/4.52k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/180M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/209M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/195M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/174M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/200M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/205M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/178M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/200M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/197M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/179M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/169M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/191M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/176M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/189M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/217M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/179M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/198M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/223M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/63.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/13328 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/165 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/117 [00:00<?, ? examples/s]

In [None]:
def filter_function(sample):
    """Decide whether a CodeContests sample is kept for building the description lookup.

    A sample is kept only when all of the following hold (checked in this order,
    stopping at the first failure):

    * ``source`` equals 2 (presumably the Codeforces entry of the dataset's
      source enum - confirm against the dataset card),
    * it has at least one public and at least one private test input,
    * ``cf_contest_id`` is positive and ``cf_index`` is non-empty,
    * ``input_file`` and ``output_file`` are both empty strings.

    Args:
        sample: mapping with the fields listed above.

    Returns:
        bool: True if the sample passes every check, False otherwise.
    """
    if not (sample['source'] == 2):
        return False
    if not (len(sample['public_tests']['input']) > 0):
        return False
    if not (len(sample['private_tests']['input']) > 0):
        return False
    if not (sample['cf_contest_id'] > 0):
        return False
    if not (sample['cf_index'] != ''):
        return False
    if not (sample['input_file'] == ''):
        return False
    return sample['output_file'] == ''

In [7]:
# Keep only the problems accepted by filter_function; `codecontest` is rebound to the filtered dataset
codecontest = codecontest.filter(filter_function)

Filter:   0%|          | 0/13328 [00:00<?, ? examples/s]

In [8]:
# Two-level lookup: descriptions[cf_contest_id][cf_index] -> problem description.
# format_instruction uses it to attach the problem statement to each prompt.
descriptions = {}
for sample in codecontest:
    contest, index, description = (
        sample['cf_contest_id'],
        sample['cf_index'],
        sample['description'],
    )
    # setdefault creates the per-contest dict the first time a contest is seen
    descriptions.setdefault(contest, {})[index] = description

Dataset({
    features: ['name', 'description', 'public_tests', 'private_tests', 'generated_tests', 'source', 'difficulty', 'solutions', 'incorrect_solutions', 'cf_contest_id', 'cf_index', 'cf_points', 'cf_rating', 'cf_tags', 'is_description_translated', 'untranslated_description', 'time_limit', 'memory_limit_bytes', 'input_file', 'output_file'],
    num_rows: 4135
})

To fine-tune our model, we need to convert our structured examples into a collection of tasks described via instructions. We define a formatting_function that takes a sample and returns a string with our instruction format.

In [29]:
def format_instruction(sample):
    """Render one prompt-dataset sample as the instruction text used for training.

    The problem statement is looked up in the module-level ``descriptions`` dict
    via ``sample['contest']`` and ``sample['index']`` (a missing entry raises
    ``KeyError``), and ``sample['prompt']`` is appended after it.

    Args:
        sample: mapping with the keys ``'contest'``, ``'index'`` and ``'prompt'``.

    Returns:
        str: the formatted instruction, ending with a newline.
    """
    problem_statement = descriptions[sample['contest']][sample['index']]
    task_prompt = sample['prompt']
    return (
        "### Instruction:\n"
        "You are a contestant in a programming contest. You have to solve the following problem in the contest.\n"
        "\n"
        "### Description:\n"
        f"{problem_statement}\n"
        "\n"
        f"{task_prompt}\n"
    )

In [None]:
import gc

# The raw problems are not referenced again after `descriptions` has been built,
# so drop the dataset and run a collection pass to release its memory.
del codecontest
gc.collect()

In [None]:
# Load the prompt dataset from the Hub (re-downloaded on every run because of force_redownload).
# NOTE(review): format_instruction reads sample['contest'], sample['index'] and
# sample['prompt'], so these columns are presumably present - confirm.
dataset = load_dataset(prompt_dataset_name, split=dataset_split, download_mode='force_redownload')
# Train on the first 1000 rows only
dataset = dataset.select(range(1000))

## Instruction fine-tune a Llama 2 model using trl and the SFTTrainer

We will use the method introduced in the paper "QLoRA: Efficient Finetuning of Quantized LLMs" by Tim Dettmers et al. QLoRA is a technique that reduces the memory footprint of large language models during fine-tuning without sacrificing performance. It works as follows:

1. Quantize the pre-trained model to 4 bits and freeze it.
2. Attach small, trainable adapter layers (LoRA).
3. Fine-tune only the adapter layers while using the frozen quantized model for context.

In [10]:
# Resolve the dtype name from the config cell (e.g. "float16") to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bitsandbytes quantization settings handed to from_pretrained when the base model is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_double_nested_quant,
)

In [11]:
# These imports are kept as they were so that any other cell relying on the
# names (`get_peft_model_state_dict`, `sys`) keeps working.
from transformers import LlamaForCausalLM, CodeLlamaTokenizer
from peft import (
    LoraConfig,
    get_peft_model_state_dict,
)
import sys

# Tokenizer: append EOS to every encoded example and pad on the left with token id 0.
# NOTE(review): id 0 is presumably the <unk> token of the Llama vocabulary - confirm.
tokenizer = CodeLlamaTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Load the base model quantized according to `bnb_config` (4-bit, see the
# BitsAndBytesConfig cell). The former `load_in_8bit=True` kwarg contradicted that
# config: the quantization mode must come from exactly one place, and
# transformers rejects (or silently overrides) the kwarg when a
# quantization_config is given.
model = LlamaForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map=device_map,
    trust_remote_code=True,
)
model.config.pretraining_tp = 1
# The KV cache is only useful for generation; keep it off for training
model.config.use_cache = False

# Two steps that used to follow here were removed on purpose:
#  * Overriding `model.state_dict` with `get_peft_model_state_dict` (a workaround
#    copied from older LoRA scripts). The override ignored the `destination`
#    argument that torch passes when collecting a parent module's state dict, so
#    checkpoints written through the trainer could end up without adapter
#    weights. PEFT already saves adapter-only weights by itself.
#  * Wrapping the model in `torch.compile` before SFTTrainer attaches the LoRA
#    adapters (guarded by a string comparison on torch.__version__, which is
#    lexicographic, not numeric). The trainer needs the plain transformers model
#    to wrap it with PEFT; if compilation is wanted, enable it with
#    `TrainingArguments(torch_compile=True)` instead.

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The SFTTrainer supports a native integration with peft, which makes it super easy to efficiently instruction tune LLMs. We only need to create our LoRAConfig and provide it to the trainer.

In [12]:
# LoRA adapter settings (rank, scaling and dropout come from the config cell).
# Adapters are attached to the q/k/v/o projection modules named in target_modules.
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Not necessary when using SFTTrainer
# prepare model for training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

Before we can start our training we need to define the hyperparameters (TrainingArguments) we want to use

In [13]:
# Hyperparameters for the trainer; every value except the literals below is
# defined in the config cell. `save_steps` and `max_steps` from the config cell
# are not passed: checkpoints are written once per epoch (save_strategy) and the
# run length is given by num_train_epochs.
args = TrainingArguments(
    # --- run bookkeeping ---
    output_dir=output_dir,
    seed=42,
    report_to="tensorboard",
    logging_steps=logging_steps,
    save_strategy="epoch",
    disable_tqdm=disable_tqdm,
    # --- batching / run length ---
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # --- optimisation ---
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # --- precision / memory ---
    fp16=fp16,
    bf16=bf16,
    gradient_checkpointing=gradient_checkpointing,
)

We now have every building block we need to create our SFTTrainer and start training our model.

In [30]:
# Assemble the supervised fine-tuning trainer from the pieces built above:
# the quantized base model and tokenizer, the LoRA config, the training
# arguments, and the prompt dataset rendered through format_instruction.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=format_instruction,
    max_seq_length=max_seq_length,
    packing=packing,
)

Start training our model by calling the train() method on our Trainer instance.

In [31]:
import gc
# Run a garbage-collection pass before the training loop starts
gc.collect()
# Launch fine-tuning. No progress bar is shown because disable_tqdm is True in
# the config cell; the recorded output shows this run was interrupted manually.
trainer.train()

KeyboardInterrupt: 

In [None]:
# Save the trained model locally through the trainer.
# No path is given, so it presumably goes to args.output_dir (the merge cell
# reloads from there) - confirm.
gc.collect()
trainer.save_model()

## Merge the model and the adapters and save it

When running in a T4 instance we have to clean the memory

In [None]:
import gc

# Drop every reference that keeps the training-time objects alive so their
# memory (including VRAM held by the model) can be reclaimed, then collect.
del model, trainer, dataset, bnb_config, peft_config, compute_dtype
gc.collect()

In [None]:
# Ask PyTorch to release the CUDA memory it is still caching after the objects above were deleted
torch.cuda.empty_cache()

In [None]:
# One more collection pass after the CUDA cache was emptied
gc.collect()

Reload the trained and saved model and merge the adapters into the base model; then we can save the whole model.

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the checkpoint stored in args.output_dir in float16.
# NOTE(review): this rebinds `new_model`, which the config cell defined as the
# fine-tuned model's *name* (a string); from here on it is the model object.
new_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)

gc.collect()
# Fold the LoRA adapter weights into the base model and drop the adapter wrappers
merged_model = new_model.merge_and_unload()


In [None]:
# Write the merged model to the local "merged_model" directory in safetensors format
gc.collect()
merged_model.save_pretrained("merged_model",safe_serialization=True)

In [None]:
# Store the tokenizer next to the merged weights so the directory is self-contained
gc.collect()
tokenizer.save_pretrained("merged_model")

In [None]:
# Push the merged model and its tokenizer to the Hugging Face Hub
import os

from huggingface_hub import login

# Pushing needs a token with write permission. It is read from the environment
# (HF_WRITE_TOKEN, falling back to HF_TOKEN) instead of being embedded in the
# notebook; when neither variable is set, `token=None` makes `login` ask for it
# interactively.
# SECURITY: the access token that used to be hard-coded in this cell is exposed
# in the notebook's history and must be revoked on the Hub.
login(token=os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN"))

gc.collect()
merged_model.push_to_hub(hf_model_repo)
gc.collect()
tokenizer.push_to_hub(hf_model_repo)
gc.collect()

## Test the merged model

It is time to check our model performance

## Load the model from the HF Hub and test it

Finally we download the created model from the hub and test it to make sure it works fine!