## 1. Install Necessary Packages


In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets scipy
!pip install -q trl accelerate

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


#2. Model Loading
###### We'll load the model using QLoRA quantization to reduce the usage of memory



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

###### Let's use the quantized model now

In [3]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

###### Run inference on the base model

In [4]:
def get_completion(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """

  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]



  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)


  generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return (decoded[0])

In [5]:
result = get_completion(query="What are the things that are required to construct a house?", model=model, tokenizer=tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> 
  
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  What are the things that are required to construct a house?
  [/INST]
  
  

  </s></s>


# 3. Load datasets for finetuning

In [6]:
from datasets import load_dataset

dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 121959
})

In [7]:
df = dataset.to_pandas()
df.head(5)

Unnamed: 0,instruction,input,output,text
0,Create a function to calculate the sum of a se...,"[1, 2, 3, 4, 5]",# Python code\ndef sum_sequence(sequence):\n ...,Below is an instruction that describes a task....
1,Develop a function that will add two strings,"str1 = ""Hello ""\nstr2 = ""world""","def add_strings(str1, str2):\n """"""This func...",Below is an instruction that describes a task....
2,Design a data structure in C++ to store inform...,,#include <map>\n#include <string>\n\nclass Gro...,Below is an instruction that describes a task....
3,Implement a sorting algorithm to sort a given ...,"[3, 1, 4, 5, 9, 0]",def bubble_sort(arr):\n n = len(arr)\n \n ...,Below is an instruction that describes a task....
4,Design a Swift application for tracking expens...,Not applicable,import UIKit\n\nclass ExpenseViewController: U...,Below is an instruction that describes a task....


###### Instruction Fintuning - Prepare the dataset under the format of "prompt" so the model can better understand :

- the function generate_prompt : take the instruction and output and generate a prompt
- shuffle the dataset
- tokenize the dataset

### Formatting the datasets
###### Now, let's format the dataset in the required Mistral-7B-Instruct-v0.1
https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 format.

In [8]:
def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'
    # Samples with additional context into.
    if data_point['input']:
        text = f"""[INST]{prefix_text} {data_point["instruction"]} here are the inputs {data_point["input"]} [/INST]{data_point["output"]}"""
    # Without
    else:
        text = f"""[INST]{prefix_text} {data_point["instruction"]} [/INST]{data_point["output"]} """
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

In [9]:
dataset

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'prompt'],
    num_rows: 121959
})

In [10]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [11]:
dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

In [12]:
print(test_data)

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 24392
})


#5. Apply LoRA

- Efficiency: It uses lower-rank matrices to significantly reduce trainable parameters and memory requirements. This leads to: <br>
    - Faster training times <br>
    - Lower computational costs
    - Ability to train on smaller hardware setups
- Effective adaptation: LoRA can be used to adapt pre-trained LLMs to specific tasks or domains without requiring complete retraining, which is a time-consuming and resource-intensive process.

LoRA is beneficial for:
- Making LLMs more accessible to users with limited computational resources.
- Rapidly adapting LLMs to new tasks or domains without significant retraining efforts.

In [13]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [14]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
   

In [15]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [16]:
modules = find_all_linear_names(model)
print(modules)

['q_proj', 'up_proj', 'down_proj', 'gate_proj', 'k_proj', 'v_proj', 'o_proj']


In [17]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [18]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


# 5. Run the Training

In [19]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Fine-Tuning with qLora and Supervised Fine-Tuning


In [20]:
import transformers

from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0.03,
        max_steps=100,
        learning_rate=2e-4,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/97567 [00:00<?, ? examples/s]

Map:   0%|          | 0/24392 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
# Lets start training process
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



Step,Training Loss
1,2.002
2,1.6774
3,1.4704
4,1.3575
5,1.0361
6,1.0259
7,0.7747
8,0.7707
9,0.6834
10,0.8613


TrainOutput(global_step=100, training_loss=0.6253986147046089, metrics={'train_runtime': 2352.6828, 'train_samples_per_second': 0.17, 'train_steps_per_second': 0.043, 'total_flos': 3742951274078208.0, 'train_loss': 0.6253986147046089, 'epoch': 0.0})

In [22]:
# Push new model to hub
new_model = "finetuning-mistralai-codeinstruct"
trainer.model.save_pretrained(new_model)


In [28]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float32,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model_both",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 0 has a total capacty of 14.75 GiB of which 65.06 MiB is free. Process 230265 has 14.62 GiB memory in use. Of the allocated memory 14.20 GiB is allocated by PyTorch, and 287.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [25]:
#push model and tokenizer
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

NameError: name 'merged_model' is not defined

#6. Evaluating the model

In [26]:
def get_completion_merged(query: str, model, tokenizer) -> str:
  device = "cuda:0"

  prompt_template = """

  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]



  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

  model_inputs = encodeds.to(device)

  generated_ids = merged_model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
  decoded = tokenizer.batch_decode(generated_ids)
  return (decoded[0])

In [27]:
result = get_completion_merged(query="code the fibonacci series in python using reccursion", model=model, tokenizer=tokenizer)
print(result)

NameError: name 'merged_model' is not defined

In [None]:
result = get_completion_merged(query="Help me to construct the house and lis the things required", model=model, tokenizer=tokenizer)
print(result)