<a href="https://colab.research.google.com/github/Lmalviya/machineTranslationTask/blob/main/FineTune_text_summarization_with_lora_and__4bit_quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scope of this notebook
1. Load LLaMA in 4-bit quantization
2. Fine-Tune LLaMa with QLoRA for text summarization

In [1]:
!pip install bitsandbytes
!pip install transformers==4.31
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1

Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0
Collecting transformers==4.31
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstal

### Library

In [4]:
import time
import numpy as np
import random


import torch


from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import Dataset, load_dataset
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model

import evaluate

## Config

In [5]:
def get_config():
  return {
      'dataset': 'knkarthick/dialogsum',
      'model_id': 'NousResearch/Llama-2-7b-hf',
      'output_dir': '/content/output',
  }

### load dataset and convert it into instruction finetuning formate

In [6]:
def instructionFormate(data):
  newData = []
  for dataPoint in data:
    inputs = f"""
      INSTRUCTION: Summarize the following conversarion.
      ### INPUT:
      {dataPoint['dialogue']}

      ### Summary:
      {dataPoint['summary']}
    """
    item = {
        "dialogue": dataPoint['dialogue'],
        "summary": dataPoint['summary'],
        "inputs": inputs
    }
    newData.append(item)
  return newData

def get_data(config):
  data = {}
  raw_data = load_dataset(config['dataset'])
  train_ds = instructionFormate(raw_data['train'])
  val_ds = instructionFormate(raw_data['validation'])
  test_ds = instructionFormate(raw_data['test'])
  data['train'] = train_ds
  data['train'] = val_ds
  data['train'] = test_ds
  return data

# config = get_config()
# train_ds, val_ds, test_ds = get_data(config)

### Load model into 4-bit Quantize form

In [9]:
def get_model_and_tokenizer(config):
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  model = AutoModelForCausalLM.from_pretrained(config['model_id'], quantization_config=bnb_config, device_map="cpu")
  tokenizer = AutoTokenizer.from_pretrained(config['model_id'])
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = 'right'
  return model, tokenizer


### ZERO Shot inference with LLaMA-2 7B

In [23]:
config = get_config()
data = get_data(config)
model, tokenizer = get_model_and_tokenizer(config)

dialogue = data['train'][0]['dialogue']
summary = data['train'][0]['summary']
prompt = data['train'][0]['inputs']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
tokenized_input = tokenizer(prompt, return_tensors='pt')
# print(tokenized_input['input_ids']) #['attention_mask']

In [29]:
tokenized_input = tokenizer(prompt, return_tensors='pt')
encoded_output = model.generate(tokenized_input['input_ids'], max_new_tokens=100)
output = tokenizer.decode(encoded_output[0], skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f"Dialogue: \n{dialogue}\n")
print(dash_line)
print(f"Expected Summary: \n{summary}\n")
print(dash_line)
print(f"Generated Summary: \n{output}\n")
print(dash_line)

AttributeError: 'Tensor' object has no attribute 'update'

In [31]:
encoded_output = model.generate(tokenized_input['input_ids'])

Input length of input_ids is 435, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.


AssertionError: 

In [20]:
data.keys()

dict_keys(['train'])

In [21]:
 data['train'][0]

 'summary': 'Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.',

### Print trainable parameters function

In [None]:
def print_trainable_parameters(model):
  trainable_param = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_param += param.numel()

  print(
      f"Trainable Param: {} || All Param: {} || trainable percentage: {100 * trainable_params / all_param}%"
  )

### load 4bit training peft version and print model

In [None]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print(model)

### Set LoRA parameters

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    # name of the layer is same as mention in the model architecture
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'], # layers name for which we want to finetune additional parameters
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

Lora_Model = get_peft_model(model, lora_config)
print_trainable_parameters(Lora_Model)

### Tensor board

In [None]:
%load_ext tensorboard
%tensorboard --logdir llama2-docsum-adapter/runs

### Training

In [None]:
training_arguments = TrainingArguments(
    pre_device_train_batch_size=config['batch_size'],
    gradient_accumulation_steps=config['gradient_step'],
    optim=config['optimName'],
    logging_step=1,
    learning_rate=config['lr'],
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=config['epochs'],
    evaluation_strategy='step',
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy='epoch',
    group_by_length=True,
    output_dir=config['output_dir'],
    report_to='tensorboard',
    save_safetensors=True,
    lr_scheduler_type='cosine',
    seed=42,
)

Lora_Model.config.use_cache = False

In [None]:
trainer = SFTTrainer(
    model = Lora_Model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=lora_config,
    dataset_text_field='prompt',
    max_seq_length=config['max_len'],
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
trainer.train()

### Save Model

In [None]:
# here we only save the additional trainable weigts not the frozen model weigts
trainer.model.save_pretrained(config['peft_model_path'])
tokenizer.save_pretrained(config['peft_model_path'])


### Inference

In [None]:
from transformers import TextStreamer
from peft import AutoPeftModelForCausalLM
from transformer import AutoTokenizer

In [None]:
Lora_Model.config.use_cache = True
Lora_Model.eval()

In [None]:
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    config['peft_model_path'],
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True
)

tokenizer = AutoTokenizer.from_pretrained(config['peft_model_path'])


### Generate output from fine tune LLaMA+lora model

In [None]:
dialogue = data['test'][0]['dialogue']
summary = data['test'][0]['dialogue']
prompt = data['test'][0]['inputs']

tokenized_input = tokenizer(prompt, return_tensors='pt')
encoded_output = trained_model.generate(tokenized_input['input_ids'], max_new_tokens=100)
output = tokenizer.decode(encoded_output[0], skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f"Dialogue: \n{dialogue}\n")
print(dash_line)
print(f"Expected Summary: \n{summary}\n")
print(dash_line)
print(f"Generated Summary: \n{output}\n")
print(dash_line)