In [None]:
!pip install transformers==4.31.0
!pip install torch==2.1.0
!pip install langchain==0.0.228
!pip install chromadb==0.3.26
!pip install sentence-transformers==2.2.2
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install git+https://github.com/huggingface/accelerate
!pip install git+https://github.com/huggingface/peft
!pip install datasets
!pip install auto_gptq==0.4.2

### Analysis of dataset with ecommerce FAQ

In [4]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("NebulaByte/E-Commerce_FAQs")

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/565k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset['train']

Dataset({
    features: ['parent_category', 'category_id', 'category', 'question_id', 'question', 'answer', 'faq_url', 'que_ans'],
    num_rows: 659
})

In [6]:
# Load the train split explicitly instead of indexing the previous value of `dataset`:
# this keeps the cell idempotent (re-running it no longer fails once `dataset` is
# already a `Dataset`) and always resets `dataset` to the raw, unprocessed split.
dataset = load_dataset("NebulaByte/E-Commerce_FAQs", split = "train")

In [None]:
# all question ids are unique: the number of distinct ids equals the number of rows
# (question *texts* are checked separately further below)
len(set(dataset["question_id"]))

659

In [None]:
pd.Series(dataset["category"]).nunique()

34

In [None]:
pd.Series(dataset["category"]).value_counts()

Flipkart Plus                               50
SuperCoins                                  43
Payment                                     43
Ather                                       42
Bounce                                      41
Ampere                                      40
BGauss                                      39
Order                                       30
Cancellations and Returns                   29
Hero MotoCorp                               24
Shopping                                    22
COVID-19 Protect (Coronavirus Insurance)    22
Flights                                     21
Aegon Life Insurance                        20
Flipkart Axis Bank Credit Card              20
Credit Card No Cost EMI                     19
Gift Cards                                  19
Bajaj Finserv EMI                           18
Flipkart Quick                              17
Pickup Stores                               17
Login & My Account                          15
Warranty     

In [None]:
dataset.filter(lambda x: x['category'] == 'Flipkart Plus')['question'][:5]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

['If I cancel or return an item from my order, will the SuperCoins credited be deducted?',
 'What is SuperCoins?',
 'Where can I read more detailed terms and conditions of the Flipkart Plus program?',
 'Can I transfer SuperCoins to different Flipkart account?',
 "If I choose 'SuperCoins Price' option, will I still be eligible to avail bank offers/other offers?"]

In [None]:
dataset.filter(lambda x: x['category'] == 'SuperCoins')['question'][:5]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

['What are SuperCoin Milestones?',
 'When benefits are unlocked as part of SuperCoin Milestones, will there be deduction of SuperCoins?',
 'How long will the benefit be unlocked as part of my SuperCoin Milestones?',
 'How do I renew my benefits as part of SuperCoin Milestones?',
 'What is Cash and Coins for Rewards program?']

In [None]:
dataset.filter(lambda x: x['category'] == 'Payment')['question'][:5]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

["What is Flipkart's credit card EMI payment option?",
 'How can I order for large quantities of the product as part of a corporate order?',
 'How can I label my saved cards?',
 "What is a 'card label'?",
 "What is the 'Save Card' feature?"]

In [None]:
dataset.filter(lambda x: x['category'] == 'Ather')['question'][:5]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

['Is the price shown on the product page for an Ather 2-wheeler the final amount?',
 'The option to buy an Ather 2-wheeler is not available at my pincode.',
 'I have bought an Ather 2-wheeler but I want to have a different model. What should I do?',
 'What is the advantage of buying an Ather 2-wheeler vehicle through Flipkart?',
 'What is the process after paying the ex-showroom price for an Ather 2-wheeler on Flipkart?']

In [None]:
dataset.filter(lambda x: x['category'] == 'Bounce')['question'][:5]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

["Can I pay for the Bounce vehicle's registration and insurance on Flipkart?",
 'Will I get any extra discount/no-cost EMI at the Bounce authorised dealership for insurance and vehicle registration charges?',
 'I have bought the Bounce 2-wheeler but I want to have a different model. What should I do?',
 'In which all cities is Bounce operational?',
 'How can I track my order for the Bounce 2-wheeler after completing my payment on Flipkart?']

In [None]:
# there are two repeated questions
pd.Series(dataset["question"]).nunique()

657

In [None]:
# the answers are slightly different, it's ok
dupl_q = [
    'What is a convenience fee? Why am I being charged?',
    'If I have a saved card on Flipkart, will I be able to use a new card for my next payment?'
]
dataset.filter(lambda x: x['question'] in dupl_q)[:10]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

{'parent_category': [None, None, 'Flipkart Travel', 'Flipkart Travel'],
 'category_id': ['55bf48f4d00000490011ffa5',
  '55bf48f4d00000490011ffa5',
  '62e91d68f86ae76c188881d2',
  '63049d0ea0eaa942e17fe1d3'],
 'category': ['Payment', 'Payment', 'Flights', 'Hotels'],
 'question_id': ['55bb5c0f31000025007864d5',
  '5ae1a640130000330096cea0',
  '62e9261914e5755075e1d7f0',
  '6304a83d64d0b60fba01a513'],
 'question': ['If I have a saved card on Flipkart, will I be able to use a new card for my next payment?',
  'If I have a saved card on Flipkart, will I be able to use a new card for my next payment?',
  'What is a convenience fee? Why am I being charged?',
  'What is a convenience fee? Why am I being charged?'],
 'answer': ["Yes. Even if you've saved your card details on Flipkart, you always have the option to use any other credit/debit card to pay.",
  'Yes. Even if you have saved your card on Flipkart, you always have the option to use any other credit/debit card for making a payment.',
 

In [None]:
# repeated answers
pd.Series(dataset["answer"]).nunique()

624

In [None]:
# Sometimes an answer does not contain very specific information - it only recommends some actions or
# routes the user to specific departments or service providers in a company.
# Show every row that shares one such generic answer.
sel_answ = 'In case you relocate to a different state, please contact the local Regional Transport Office for assistance with registration change.'
dataset.filter(lambda x: x['answer'] == sel_answ)[:10]

Filter:   0%|          | 0/659 [00:00<?, ? examples/s]

{'parent_category': ['Electric Vehicle ',
  'Electric Vehicle ',
  'Electric Vehicle ',
  'Electric Vehicle ',
  'Electric Vehicle '],
 'category_id': ['62da9d35aca9fa77e9e5c342',
  '6308ea59f86ae76c18b1420d',
  '632beac264d0b60fba412142',
  '632bec61a0eaa942e1bed2d9',
  '632fc25214e57550754db0d8'],
 'category': ['Bounce', 'Ampere', 'BGauss', 'Ather', 'Hero MotoCorp'],
 'question_id': ['62dad38672a08c71327988bd',
  '6309236364d0b60fba08e0d6',
  '632d1df4a0eaa942e1c2d9c1',
  '632d2314a0eaa942e1c2e9d2',
  '632fdefb64d0b60fba4d87df'],
 'question': ['In case I want to relocate to a different state, do I need to get the Bounce vehicle registered in that state also, and will the brand help me with that?',
  'In case I want to relocate to a different state, do I need to get Ampere Magnus EX registered in that state also and will the brand help me with that?',
  'In case I want to relocate to a different state, do I need to get the BGauss vehicle registered in that state also and will the bran

### Fine-tune Llama-2 quantized model on FAQ data using LoRA

With limited computational resources, the practical way to fine-tune an LLM is parameter-efficient fine-tuning with LoRA on top of a quantized base model.
There are two options available:
* using `GPTQ` - if the checkpoint you would like to use is already saved in this format
* using `bitsandbytes` - if you want to use the original (unquantized) checkpoint, which is quantized to 4-bit while it is loaded.

Fine-tuning with `bitsandbytes` has been reported to be faster than with `GPTQ`. In addition, LoRA weights cannot be merged into a GPTQ-quantized model to obtain a single model.

In [35]:
from functools import partial
import os
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

**Option 1.** Load a big model in 4-bit format.

You need this config if you would like to fine-tune a checkpoint that is stored unquantized.

Example:
* https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing#scrollTo=XIyP_0r6zuVc

In [9]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
model_name = "meta-llama/Llama-2-7b-hf"

# bfloat16 needs a GPU with compute capability >= 8.0 (Ampere or newer). Older GPUs such as
# the Tesla T4 (7.5) fall back to float16, which is also the precision requested by
# `fp16 = True` in the training arguments.
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True, #  Load model in 4-bit precision mode
    bnb_4bit_use_double_quant = True, # Nested quantization for 4-bit model
    bnb_4bit_quant_type = "nf4", # Quantization data type for 4-bit model
    bnb_4bit_compute_dtype = compute_dtype, # Computation data type for 4-bit model
)
n_gpus = torch.cuda.device_count()
max_memory = f'{15000}MB' # for Tesla T4

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = "auto", # dispatch the model efficiently on the available resources
    max_memory = {i: max_memory for i in range(n_gpus)}, # same memory cap for every visible GPU
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
print("tokenizer.pad_token", tokenizer.pad_token)
# Set padding token as EOS token because it is not set by default
#  but we need it because we would like to train the model in batches
tokenizer.pad_token = tokenizer.eos_token
print("tokenizer.pad_token", tokenizer.pad_token)

(…)a/Llama-2-7b-hf/resolve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)b-hf/resolve/main/generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

(…)7b-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)lama-2-7b-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


tokenizer.pad_token None
tokenizer.pad_token </s>


In [22]:
[getattr(model.config, length_setting, None) for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]]

[None, 4096, None]

Prepare dataset for fine-tuning: prepare prompt and tokenize it

In [23]:
max_length = 4096

In [12]:
# Instruction text placed under "### Instruction:" in every prompt.
# `.strip()` removes the newlines that the triple-quoted literal adds at both ends;
# without it the rendered prompt contains stray blank lines around the instruction.
instruction = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information and answer `I do not know`.
""".strip()

In [14]:
def create_prompt(instruction, input, output):
    """
    Creates a formatted prompt for one sample of the instruction dataset.

    The prompt consists of the sections intro blurb, instruction, input,
    response and end marker, separated by blank lines.

    :param instruction: instruction text placed under "### Instruction:"
    :param input: input text (here: the question) placed under "Input:";
        when it is empty or None the whole "Input:" section is omitted
    :param output: expected answer placed under "### Response:"
    :return: dict with the single key "formatted_prompt"
    """

    # Static strings of the prompt template
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"

    # Combine the sample with the static strings
    blurb = INTRO_BLURB
    instruction_block = f"{INSTRUCTION_KEY}\n{instruction}"
    # Only emit the input section when there is an input; otherwise the filter below
    # could never drop it, because the header alone makes the string non-empty
    input_context = f"{INPUT_KEY}\n{input}" if input else None
    response = f"{RESPONSE_KEY}\n{output}"
    end = END_KEY

    # Keep the sections that are present, in template order
    parts = [part for part in (blurb, instruction_block, input_context, response, end) if part]

    # Join the sections into a single string to create the prompt
    formatted_prompt = "\n\n".join(parts)

    return {"formatted_prompt": formatted_prompt}

In [15]:
dataset = dataset.map(lambda x: create_prompt(instruction=instruction, input=x['question'], output=x['answer']))

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

In [16]:
dataset

Dataset({
    features: ['parent_category', 'category_id', 'category', 'question_id', 'question', 'answer', 'faq_url', 'que_ans', 'formatted_prompt'],
    num_rows: 659
})

In [18]:
print(dataset[0]['formatted_prompt'])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information ans answer `I do not know`.


Input:
I missed the delivery of my order today. What should I do?

### Response:
The courier service delivering your order usually tries to deliver on the next business day in case you miss a delivery.
You can check your SMS for more details on when the courier service will try to deliver again.
 

### End


In [29]:
def tokenize(dataset_sample, tokenizer, max_length):
    """
    Tokenize the "formatted_prompt" field of a sample (or batch of samples).

    :param dataset_sample: mapping that contains the key "formatted_prompt"
    :param tokenizer: callable tokenizer applied to the prompt(s)
    :param max_length: maximum sequence length; longer inputs are truncated
    :return: whatever the tokenizer returns for the given prompt(s)
    """
    prompts = dataset_sample["formatted_prompt"]
    tokenizer_kwargs = {"max_length": max_length, "truncation": True}
    return tokenizer(prompts, **tokenizer_kwargs)

In [30]:
# Tokenize prompts and drop every original column so that only the tokenizer outputs
# remain. NO PADDING here: sequences keep their own length at this stage.
tokenize_function = partial(tokenize, max_length = max_length, tokenizer = tokenizer)
dataset = dataset.map(
    tokenize_function,
    batched = True,
    # Derive the columns to drop from the dataset itself instead of hard-coding the schema
    remove_columns = dataset.column_names,
)

# Shuffle dataset (fixed seed for reproducibility)
dataset = dataset.shuffle(seed = 42)

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

In [31]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 659
})

Define LoRA configuration

In [33]:
def find_all_linear_names(model, linear_cls = None):
    """
    Find modules to apply LoRA to.

    Collects the last component of the dotted name of every sub-module that is an
    instance of ``linear_cls``, excluding the output head ``lm_head``.

    :param model: model whose ``named_modules()`` are scanned
    :param linear_cls: layer class (or tuple of classes) to look for;
        defaults to ``bnb.nn.Linear4bit``
    :return: sorted list of unique module names
    """

    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    # 'model.layers.0.self_attn.q_proj' -> 'q_proj'; the set removes per-layer duplicates
    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is not adapted with LoRA
    lora_module_names.discard('lm_head')

    # Sort so that the result does not depend on set iteration order
    module_names = sorted(lora_module_names)
    print(f"LoRA module names: {module_names}")
    return module_names


def print_trainable_parameters(model, use_4bit = False):
    """
    Prints the number of trainable parameters in the model.

    :param model: object exposing ``named_parameters()``
    :param use_4bit: when True, the trainable parameter count is halved before reporting
    """

    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        num_params = param.numel()
        # Parameters that report zero elements may carry their size in `ds_numel`
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int; true division would produce a float,
        # which the `:,d` format below rejects with a ValueError
        trainable_params //= 2

    # Guard against a model without parameters
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percent}"
    )

In [36]:
# Get linear module names to add LoRa adapters for them
target_modules = find_all_linear_names(model)

LoRA module names: ['o_proj', 'q_proj', 'up_proj', 'v_proj', 'gate_proj', 'k_proj', 'down_proj']


In [38]:
# LoRA adapter configuration used by `get_peft_model` below
peft_config = LoraConfig(
    r = 16, # LoRA attention dimension
    lora_alpha = 64, # Alpha parameter for LoRA scaling
    target_modules = target_modules, # module names collected by find_all_linear_names(model)
    lora_dropout = 0.1, # Dropout probability for LoRA layers
    bias = "none", # NOTE(review): presumably means bias terms are not trained - confirm in the PEFT docs
    task_type = "CAUSAL_LM"
)

In [39]:
# NOTE: this cell rebinds `model`, so it is not idempotent - re-run the model loading
# cell before executing it a second time.
# Enable gradient checkpointing on the base model
model.gradient_checkpointing_enable()
# Prepare the model for training: set precision of LM head and LayerNorm to fp32
model = prepare_model_for_kbit_training(model)
# convert to PeftModel using config
model = get_peft_model(model, peft_config)
# Print information about the percentage of trainable parameters
print_trainable_parameters(model)

All Parameters: 3,540,389,888 || Trainable Parameters: 39,976,960 || Trainable Parameters %: 1.1291682911958425


Run training

In [46]:
# Training parameters
args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # 1 x 4 = 4 sequences per optimizer step (per device)
        warmup_steps = 5,
        max_steps = 50, # 50 steps x 4 sequences = 200 samples, i.e. about 0.3 epoch of the 659 rows
        learning_rate = 2e-4,
        fp16 = True,
        logging_steps = 5, # the training loss is reported every 5 steps
        output_dir = ".", # metrics and trainer state are written to the current directory
        optim = "paged_adamw_32bit",
        save_strategy="no", # do not save any checkpoints
    )

# run training
# `mlm = False` configures the collator for causal (not masked) language modelling
trainer = Trainer(
    model = model,
    train_dataset = dataset,
    args = args,
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
)
# NOTE(review): presumably the generation cache is disabled because it is not needed
# during training - confirm
model.config.use_cache = False

# Set to False to skip the (slow) training run when re-executing the notebook
do_train = True

# Launch training and log metrics
# (this message is printed even when `do_train` is False)
print("Training...")

if do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...




Step,Training Loss
5,2.2752
10,1.0717
15,0.5637
20,0.5446
25,0.6096
30,0.5543
35,0.4855
40,0.447
45,0.4826
50,0.526


***** train metrics *****
  epoch                    =        0.3
  total_flos               =   971089GF
  train_loss               =      0.756
  train_runtime            = 0:07:19.61
  train_samples_per_second =      0.455
  train_steps_per_second   =      0.114
{'train_runtime': 439.6175, 'train_samples_per_second': 0.455, 'train_steps_per_second': 0.114, 'total_flos': 1042698964230144.0, 'train_loss': 0.7560351705551147, 'epoch': 0.3}


In [None]:
import gc

print("Saving last checkpoint of the model...")
output_dir = "best_model"
os.makedirs(output_dir, exist_ok = True)
trainer.model.save_pretrained(output_dir) # save last PeftModel checkpoint

# Free memory for merging weights.
# `torch.cuda.empty_cache()` can only release memory that is no longer referenced, so the
# training model has to be dropped first; otherwise it stays on the GPU next to the
# second copy loaded below. (`trainer` is deleted as well because it references the model;
# re-run the training cell before executing this cell again.)
del model, trainer
gc.collect()
torch.cuda.empty_cache()

output_merged_dir = "best_merged_model"
os.makedirs(output_merged_dir, exist_ok = True)
# Load fine-tuned weights
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map = "auto", torch_dtype = torch.bfloat16)
# Merge the LoRA layers with the base model
model = model.merge_and_unload()
# Save fine-tuned model at a new location
model.save_pretrained(output_merged_dir, safe_serialization = True)
tokenizer.save_pretrained(output_merged_dir)

**Option 2.** Load a GPTQ-quantized model and freeze it.

Example: https://gist.github.com/SunMarc/dcdb499ac16d355a8f265aa497645996

In [None]:
# Only a limited amount of VRAM is available, so we use a GPTQ-quantized checkpoint
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
print("tokenizer.pad_token", tokenizer.pad_token)
# Set padding token as EOS token because it is not set by default
#  but we need it because we would like to train the model in batches
tokenizer.pad_token = tokenizer.eos_token
print("tokenizer.pad_token", tokenizer.pad_token)

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


tokenizer.pad_token None
tokenizer.pad_token </s>


In [None]:
max_memory = f'{15000}MB' # for Tesla T4
model_basename = "model" # file name of the weights without extension
device = "cuda:0"

# Load the already quantized GPTQ checkpoint
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision = "gptq-4bit-32g-actorder_True",
    model_basename = model_basename,
    use_safetensors = True,
    # Llama is a natively supported architecture, so there is no need to allow
    # execution of code shipped with the Hub repository
    trust_remote_code = False,
    device = device,
    use_triton = False,
    quantize_config = None,
    disable_exllama = True, # exllama does not support fine tuning
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading (…)der_True/config.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

Downloading (…)quantize_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/4.28G [00:00<?, ?B/s]



### Fine-tune Mistral quantized model on FAQ data

https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF

### Use RAG and Llama-2 quantized model to answer questions on FAQ data
