<a href="https://colab.research.google.com/github/HAL22/loraTutorial/blob/main/Llama_finetune_instruction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Sep  6 19:42:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    24W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


# **Fine-tuning Llama model for instruction following. The resulting fine-tuned model will generate text conditioned on a provided prompt.**

In [4]:
%%capture
!pip install -q --upgrade bitsandbytes datasets accelerate loralib
!pip install -q --upgrade git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git
!pip install sentencepiece
!pip install trl

1. Imports

In [5]:
import pandas as pd
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
from datasets import Dataset
from transformers import LlamaTokenizer, LlamaForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from transformers import TrainingArguments

2. Prepping the data. Transforming the data from Huggingface to the *Alpaca format*.

In [6]:
#Load the dataset from the HuggingFace Hub
rd_ds = load_dataset("xiyuez/red-dot-design-award-product-description")

In [7]:
#Convert to pandas dataframe for convenient processing
rd_df = pd.DataFrame(rd_ds['train'])
rd_df.head()

Unnamed: 0,product,category,description,text
0,Biamp Rack Products,Digital Audio Processors,"“High recognition value, uniform aesthetics an...",Product Name: Biamp Rack Products;\n\nProduct ...
1,V33,Video Camera,The V33 livestreaming video camera ensures hig...,Product Name: V33;\n\nProduct Category: Video ...
2,HP LaserJet 5000-6000 and E700-E800 Series MFPs,Multi-Function Printers,The HP LaserJet 5000 to 6000 Series and E700 t...,Product Name: HP LaserJet 5000-6000 and E700-E...
3,Meaco Arete One 20L Dehumidifier,Heating and Air Conditioning Technology,The Meaco Arete One Dehumidifier is characteri...,Product Name: Meaco Arete One 20L Dehumidifier...
4,théATRE Glass Container for Loose Leaf Tea,Food Containers,The design and colouring of the théATRE Glass ...,Product Name: théATRE Glass Container for Loos...


In [8]:
#Combine the two attributes into an instruction string
rd_df['instruction'] = 'Create a detailed description for the following product: '+ rd_df['product']+', belonging to category: '+ rd_df['category']

In [9]:
rd_df = rd_df[['instruction', 'description']]
rd_df.head()

Unnamed: 0,instruction,description
0,Create a detailed description for the followin...,"“High recognition value, uniform aesthetics an..."
1,Create a detailed description for the followin...,The V33 livestreaming video camera ensures hig...
2,Create a detailed description for the followin...,The HP LaserJet 5000 to 6000 Series and E700 t...
3,Create a detailed description for the followin...,The Meaco Arete One Dehumidifier is characteri...
4,Create a detailed description for the followin...,The design and colouring of the théATRE Glass ...


In [10]:
#Get a 5000 sample subset for fine-tuning purposes
rd_df_sample = rd_df.sample(n=5000, random_state=42)

#Define template and format data into the template for supervised fine-tuning
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

In [11]:
rd_df_sample['prompt'] = rd_df_sample["instruction"].apply(lambda x: template.format(x))
rd_df_sample.rename(columns={'description': 'response'}, inplace=True)
rd_df_sample['response'] = rd_df_sample['response'] + "\n### End"
rd_df_sample = rd_df_sample[['prompt', 'response']]

rd_df_sample['text'] = rd_df_sample["prompt"] + rd_df_sample["response"]
rd_df_sample.drop(columns=['prompt', 'response'], inplace=True)

In [12]:
rd_df_sample.head()

Unnamed: 0,text
18952,Below is an instruction that describes a task....
12584,Below is an instruction that describes a task....
5702,Below is an instruction that describes a task....
20503,Below is an instruction that describes a task....
2480,Below is an instruction that describes a task....


In [13]:
rd_df_sample.iloc[1]['text']

'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n\nCreate a detailed description for the following product: iSHOXS BullBar ProX, belonging to category: Action Cam Mount\n\n### Response:\nThe iSHOXS BullBar ProX mount can be used to attach action cams as well as system and video cameras. Very resilient and flexible, it takes up little space, thanks to a design which has its focus on functionality, and can easily be positioned on bicycle and motorcycle handlebars. The torsion-resistant aluminium structure is rigidly bolted for interlocking and allows to take up to 1.75 kg. There are several vibrationabsorbing two-component inlays to protect painted and chrome-plated surfaces.\n### End'

In [14]:
dataset = Dataset.from_pandas(rd_df_sample).train_test_split(test_size=0.05, seed=42)

3. Testing model performance before fine-tuning

- Model configs

In [15]:
nf4_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16
)

model_path = 'openlm-research/open_llama_3b_v2'

4. Fine-tuning

Two of these hyperparameters, **r** and **target_modules** are empirically shown to affect adaptation quality significantly and will be the focus of the tests that follow. The other hyperparameters are kept constant at the values indicated above for simplicity.

**r** represents the rank of the low rank matrices learned during the finetuning process. **As this value is increased, the number of parameters needed to be updated during the low-rank adaptation increases. Intuitively, a lower r may lead to a quicker, less computationally intensive training process, but may affect the quality of the model thus produced**. However, increasing r beyond a certain value may not yield any discernible increase in quality of model output. How the value of r affects adaptation (fine-tuning) quality will be put to the test shortly

When fine-tuning with LoRA, it is possible to target specific modules in the model architecture. The adaptation process will target these modules and apply the update matrices to them. Similar to the situation with "r," targeting more modules during LoRA adaptation results in increased training time and greater demand for compute resources. Thus, it is a common practice to only target the attention blocks of the transformer. However, recent work as shown in the [QLoRA paper](https://arxiv.org/abs/2305.14314) by Dettmers et al. suggests that targeting all linear layers results in better adaptation quality

- **Config**

In [16]:
#If only targeting attention blocks of the model
target_modules = ["q_proj", "v_proj"]

#If targeting all linear layers
#target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

lora_config = LoraConfig(
r=8,
target_modules = target_modules,
lora_alpha=8,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM")

nf4_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16
)

model_path = 'openlm-research/open_llama_3b_v2'

# Training hyperparameters
base_dir = "outputs"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = 'adamw_hf'
learning_rate = 1e-3
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "linear"

training_args = TrainingArguments(
    output_dir=base_dir,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    num_train_epochs = 1.0,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

- **Loading the model to finetune**

In [17]:
tokenizer = LlamaTokenizer.from_pretrained(model_path)

finetune_model = LlamaForCausalLM.from_pretrained(
    model_path, device_map='auto', quantization_config=nf4_config,torch_dtype=torch.float16
)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [18]:
print(finetune_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 3200, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (k_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (v_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (o_proj): Linear4bit(in_features=3200, out_features=3200, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3200, out_features=8640, bias=False)
          (up_proj): Linear4bit(in_features=3200, out_features=8640, bias=False)
          (down_proj): Linear4bit(in_features=8640, out_features=3200, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
   

In [19]:
import re
model_modules = str(finetune_model.modules)
pattern = r'\((\w+)\): Linear'
linear_layer_names = re.findall(pattern, model_modules)

names = []
# Print the names of the Linear layers
for name in linear_layer_names:
    names.append(name)
target_modules = list(set(names))

In [20]:
target_modules

['down_proj',
 'q_proj',
 'gate_proj',
 'up_proj',
 'lm_head',
 'v_proj',
 'o_proj',
 'k_proj']

In [21]:
trainer = SFTTrainer(
finetune_model,
train_dataset=dataset['train'],
eval_dataset = dataset['test'],
dataset_text_field="text",
max_seq_length=256,
args=training_args,
)

Using pad_token, but it is not set yet.


Map:   0%|          | 0/4750 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]



In [27]:
with torch.cuda.amp.autocast(True):
  trainer.train()

ValueError: ignored

In [None]:
# Upload finetuned model to huggingface

HUGGING_FACE_USER_NAME = "Thethela"

from huggingface_hub import notebook_login
notebook_login()

model_name = 'finetune-2-openlm-research/open_llama_3b_v2'

finetune_model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", use_auth_token=True)

In [None]:
# Loading the peft model

peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
lora_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
lora_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
def extract_response_text(input_string):
    start_marker = '### Response:'
    end_marker = '###'

    start_index = input_string.find(start_marker)
    if start_index == -1:
        return None

    start_index += len(start_marker)

    end_index = input_string.find(end_marker, start_index)
    if end_index == -1:
        return input_string[start_index:]

    return input_string[start_index:end_index].strip()

In [None]:
test_strings = ["Create a detailed description for the following product: Corelogic Smooth Mouse, belonging to category: Optical Mouse",
"Create a detailed description for the following product: Hoover Lightspeed, belonging to category: Cordless Vacuum Cleaner",
"Create a detailed description for the following product: Flattronic Cinematron, belonging to category: High Definition Flatscreen TV"]

In [None]:
def get_predictions(model, tokenzier):
  predictions = []
  for test in test_strings:
    prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

  ### Instruction:
  {}

  ### Response:""".format(test)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')

    generation_output = model.generate(
      input_ids=input_ids, max_new_tokens=156
    )
    predictions.append(tokenizer.decode(generation_output[0]))

  return predictions

In [None]:
def print_predictions(predictions):
  for i in range(3):
    pred = predictions[i]
    text = test_strings[i]
    print(text+'\n')
    print(extract_response_text(pred))
    print('--------')


Results of a non-finetune model

In [None]:
pre_tokenizer = LlamaTokenizer.from_pretrained(model_path)

pre_model = LlamaForCausalLM.from_pretrained(
    model_path, device_map='auto',
)

pre_predictions = get_predictions(pre_model,pre_tokenizer)

print_predictions(pre_predictions)

Result of finetune model

In [None]:
finetune_predictions = get_predictions(lora_model,lora_tokenizer)

print_predictions(finetune_predictions)