<a href="https://colab.research.google.com/github/JavierMedel/Advance-LLM-Fine-Tuning/blob/main/Advance-Fine-Tuning%20(PEFT%2C%20SFT%2C%20LoRa).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install huggingface-hub==0.26.2 transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4



In [2]:
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [3]:
# Quantization settings applied when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the supported linear layers in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization data type
    bnb_4bit_use_double_quant=True,         # additionally quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on the de-quantized weights
)

In [4]:
# Hub id of the base checkpoint; `repo_id` is reused further down to load the matching tokenizer.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# Load the causal-LM weights with the 4-bit quantization config, placed on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map='cuda:0',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [6]:
# Memory footprint of the quantized base model, in megabytes (bytes / 1e6).
print(model.get_memory_footprint() / 1e6)

2206.347264


In [8]:
# Display the module tree of the loaded model to see which layer names exist
# (these names are what LoRA's `target_modules` refers to).
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3SdpaAttention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): Phi3RMSNorm((3072

In [9]:
# Prepare the quantized model for training before adapters are attached
# (see the PEFT documentation for the exact changes this helper makes).
model = prepare_model_for_kbit_training(model)  # prepare the model for training

config = LoraConfig(
    r=8, # Rank of the low-rank adapter matrices (lora_A maps in_features -> r, lora_B maps r -> out_features).
    lora_alpha=16, # LoRA scaling factor; chosen here as 2*r.
    bias='none', # LoRA bias setting: 'none' (see the PEFT LoraConfig docs for the alternatives).
    lora_dropout=0.05, # Dropout probability used by the LoRA layers.
    task_type='CAUSAL_LM',
    # Names of the sub-modules that receive a LoRA adapter: the attention
    # projections (o_proj, qkv_proj) and the MLP projections (gate_up_proj, down_proj).
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj']
)

# Wrap the base model with the LoRA adapters described by `config`, then display the new module tree.
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magn

In [10]:
# Memory footprint of the model after preparation and LoRA wrapping, in megabytes (bytes / 1e6).
print(model.get_memory_footprint()/1e6)

2651.080704


In [12]:
# The PEFT model reports a (trainable, total) pair of parameter counts.
train_p, tot_p = model.get_nb_trainable_parameters()

# One print call, one line per figure (counts shown in millions).
print(
    f'Trainable parameters:        {train_p/1e6:.2f}M',
    f'Total parameters:            {tot_p/1e6:.2f}M',
    f'% of trainable parameters:   {100*train_p/tot_p:.2f}%',
    sep='\n',
)

Trainable parameters:        12.58M
Total parameters:            3833.66M
% of trainable parameters:   0.33%


In [13]:
# Disabled alternative dataset (its columns would need the input/output/instruction mapping instead):
#dataset = load_dataset("medalpaca/medical_meadow_wikidoc", split='train')
# Load the train split of the sentence-translation dataset used for this run, then display its schema.
dataset = load_dataset("dvgodoy/yoda_sentences", split='train')
dataset

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentences.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/720 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [14]:
# Reshape the columns into a prompt/completion pair:
#   sentence          -> prompt
#   translation_extra -> completion
#   translation       -> dropped
# (For the medical dataset the equivalent would be: input -> prompt,
#  output -> completion, drop instruction.)
# Re-running this cell without reloading the dataset fails, because the source columns no longer exist.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [16]:
# Look at the first row to check the prompt/completion pairing.
dataset[0]

{'prompt': 'The birch canoe slid on the smooth planks.',
 'completion': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [17]:
def format_dataset(examples):
  """Turn one prompt/completion row into a two-turn chat conversation.

  Args:
    examples: a single row (mapping) with the keys "prompt" and "completion".

  Returns:
    A dict with one key, 'messages', holding a list of two messages:
    the user turn (the prompt) followed by the assistant turn (the completion).
  """
  user_turn = {"role": "user", "content": examples["prompt"]}
  assistant_turn = {"role": "assistant", "content": examples["completion"]}
  return {'messages': [user_turn, assistant_turn]}

In [18]:
# Convert every row to the chat `messages` format, then drop the now-redundant source columns.
dataset = (
    dataset
    .map(format_dataset)
    .remove_columns(['prompt', 'completion'])
)

# Show the converted conversation of the first row.
dataset[0]['messages']

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

[{'content': 'The birch canoe slid on the smooth planks.', 'role': 'user'},
 {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
  'role': 'assistant'}]

In [19]:
# Disabled alternative: hold out part of the data with a train/test split.
#split = dataset.train_test_split(test_size=0.9)
# Train on the complete dataset; no evaluation split is kept.
train = dataset

In [20]:
# Load the tokenizer published with the same checkpoint as the model (same `repo_id`),
# then print its chat template, which defines how a list of messages is rendered into text.

tokenizer = AutoTokenizer.from_pretrained(repo_id)
print(tokenizer.chat_template)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'user' %}{{'<|user|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
' + message['content'] + '<|end|>
'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
' }}{% else %}{{ eos_token }}{% endif %}


In [21]:
# Render one training conversation through the chat template to inspect the exact text format.
# Index the row first: train['messages'][32] would materialize the entire column just to read one entry.
print(tokenizer.apply_chat_template(conversation=train[32]['messages'], tokenize=False))

<|user|>
Note closely the size of the gas tank.<|end|>
<|assistant|>
Closely, note the size of the gas tank, you must.<|end|>
<|endoftext|>


In [22]:
# Use the tokenizer's unknown token (and its id) as the padding token.
# NOTE(review): presumably the default pad token coincides with the end-of-text token, which would make
# padding indistinguishable from a genuine end of sequence during training — confirm against the tokenizer config.
tokenizer.pad_token = tokenizer.unk_token   # workaround; see the note above
tokenizer.pad_token_id = tokenizer.unk_token_id

In [24]:
# Training configuration for the supervised fine-tuning run.
# NOTE(review): no seed is set explicitly here, so the library default applies.
sft = SFTConfig(
    gradient_checkpointing=True, # Recompute activations in the backward pass instead of storing them: less memory, more compute.
    gradient_checkpointing_kwargs={"use_reentrant": False}, # Select the non-reentrant checkpointing variant, see https://github.com/huggingface/transformers/issues/28536
    # Batch size per device and number of batches accumulated before each optimizer step
    # (1 = the weights are updated after every batch).
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    auto_find_batch_size=True, # On an out-of-memory error, retry with a smaller batch size until one fits.

    #max_seq_length=768, # Disabled alternative: a longer context window.
    max_seq_length=64, # Maximum length, in tokens, of each training sequence.
    packing=True,  # Pack several short examples into each max_seq_length-token sequence instead of padding them individually.

    num_train_epochs=10,
    learning_rate=3e-4, # Initial learning rate of the optimizer.

    optim='paged_adamw_8bit', # Paged 8-bit AdamW optimizer.

    logging_steps=10, # Log the training loss every 10 steps.
    logging_dir='./logs',
    # NOTE(review): the directory name says "med", but the dataset used in this run is the Yoda sentences set.
    # The name is also used when pushing to the Hub, so renaming it changes the target repository — confirm before changing.
    output_dir='./phi3-mini-med-adapter',
    report_to='none' # Do not report metrics to any external tracking integration.
)

In [25]:
# Assemble the trainer: PEFT-wrapped model, SFT configuration, full training set,
# and the tokenizer used to turn the chat messages into token ids.
# (Disabled alternative for the data: train_dataset=split['train'].)
trainer = SFTTrainer(
    model=model,
    args=sft,
    train_dataset=train,
    processing_class=tokenizer,
)

Generating train split: 0 examples [00:00, ? examples/s]



In [26]:
# Pull a single batch from the trainer's own dataloader to inspect what the model will actually receive.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [28]:
# Token length of the first sequence in the batch, for the model inputs and for the training labels.
tuple(len(batch[key][0]) for key in ('input_ids', 'labels'))

(64, 64)

In [29]:
# Run the fine-tuning loop with the configuration given to the trainer; returns a TrainOutput summary.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,3.0405
20,1.7835
30,1.5033
40,1.4502
50,1.3425
60,1.2232
70,1.1598
80,0.9279
90,0.8496
100,0.6042


TrainOutput(global_step=220, training_loss=0.8140467784621499, metrics={'train_runtime': 1629.4388, 'train_samples_per_second': 2.154, 'train_steps_per_second': 0.135, 'total_flos': 5034400555991040.0, 'train_loss': 0.8140467784621499, 'epoch': 10.0})

In [30]:
def encode_prompt(tokenizer, sentence):
  """Wrap a plain sentence as a single user turn and render it with the chat template.

  Args:
    tokenizer: object exposing `apply_chat_template`.
    sentence: text of the user message.

  Returns:
    Whatever `apply_chat_template` returns for the one-message conversation when
    called with `tokenize=False` and `add_generation_prompt=True`.
  """
  conversation = [dict(role='user', content=sentence)]
  return tokenizer.apply_chat_template(
      conversation=conversation,
      tokenize=False,
      add_generation_prompt=True,
  )

In [31]:
# Disabled alternative prompt (a medical question):
#sentence = 'What are the historical background and symptoms of Candida-induced vulvovaginitis?'

# Build the chat-formatted prompt for one test sentence and print it to check the formatting.
sentence = 'The Force is strong in you!'
prompt = encode_prompt(tokenizer, sentence)
print(prompt)

<|user|>
The Force is strong in you!<|end|>
<|assistant|>



In [32]:
def inference(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
  """Generate a continuation for an already chat-formatted prompt and return it as text.

  Args:
    model: model exposing `device`, `eval()` and `generate(...)`.
    tokenizer: callable tokenizer exposing `eos_token_id` and `batch_decode(...)`.
    prompt: prompt text; tokenized with `add_special_tokens=False`, so it must
      already contain any special tokens it needs.
    max_new_tokens: upper bound on the number of generated tokens.
    skip_special_tokens: forwarded to `batch_decode`.

  Returns:
    The first decoded sequence returned by `batch_decode` for the generated ids.

  Side effects:
    Calls `model.eval()`; the model is not switched back afterwards.
  """
  # Tokenize as PyTorch tensors, then move them to the device the model lives on.
  encoded = tokenizer(prompt, add_special_tokens=False, return_tensors='pt')
  encoded = encoded.to(model.device)

  model.eval()

  # Generation stops at the tokenizer's end-of-sequence id or after max_new_tokens tokens.
  generated_ids = model.generate(
      **encoded,
      max_new_tokens=max_new_tokens,
      eos_token_id=tokenizer.eos_token_id,
  )

  # Convert the generated token ids back into text and return the only sequence.
  decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=skip_special_tokens)
  return decoded[0]

In [33]:
# Generate with the fine-tuned model; special tokens are kept in the output so the turn markers stay visible.
print(inference(model, tokenizer, prompt))

<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the Force is.<|end|><|endoftext|>


In [34]:
# Save the trainer's model to a local directory.
# NOTE(review): for a PEFT-wrapped model this presumably writes the adapter weights rather than the full base model — confirm.
trainer.save_model('local-phi3-mini-yoda-adapter')

In [35]:
!pip install huggingface_hub



In [36]:
# Authenticate with the Hugging Face Hub (interactive token prompt) so the upload below is authorized.
# `login` is imported in the notebook's import cell together with the other dependencies.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
# Upload the trained artifacts to the Hugging Face Hub; requires a prior successful login.
# NOTE(review): the target repository name appears to be derived from the trainer's `output_dir` — confirm it is the intended one.
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/50.4M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.97k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jmedel/phi3-mini-med-adapter/commit/7e39f3f91e983222891e4ab8f58da5f4f0b79813', commit_message='End of training', commit_description='', oid='7e39f3f91e983222891e4ab8f58da5f4f0b79813', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jmedel/phi3-mini-med-adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='jmedel/phi3-mini-med-adapter'), pr_revision=None, pr_num=None)