# Fine-tuning of Llama using 4-bit Quantization
Load llama model "Trelis/Llama-2-7b-chat-hf-sharded-bf16" in **quantized form**.

Hint(Explore the library Bits and Bytes).

Apply Lora, and fine tune it on the dataset of your choice.
You can use this dataset as well: Abirate/english_quotes.



In [1]:
# Install required libraries
!pip install transformers datasets peft bitsandbytes accelerate

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metad

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer

# Install

Load the model to use: Llama-7B!

In [3]:
def load_model():
    model_name = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

    # 4-bit quantization configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # Load the quantized model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [5]:
def load_model():
    model_name = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

    # 4-bit quantization configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # Load the quantized model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Training Setup

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [6]:
def prepare_model(model):
    model = prepare_model_for_kbit_training(model)

    # LoRA configuration
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    return model

model = prepare_model(model)

# Data Setup

In [7]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("Abirate/english_quotes")
train_dataset = dataset['train']

def tokenize_function(examples):
    # Tokenize the 'quote'
    inputs = tokenizer(examples["quote"], padding="max_length", truncation=True, max_length=128)
    # Join tags into a single string and tokenize
    tags = [" ".join(tags) for tags in examples["tags"]]
    labels = tokenizer(tags, padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2508 [00:00<?, ? examples/s]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

Dataset({
    features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2508
})

In [9]:
tokenized_dataset = tokenized_dataset.remove_columns(["quote", "tags" , "author"])

In [10]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2508
})

# Training

In [11]:
# Training Function
def train(model, tokenizer, dataset):
    # Define training parameters
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        save_steps=10,
        logging_steps=10,
        learning_rate=3e-4,
        weight_decay=0.01,
        fp16=True,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

train(model, tokenizer, tokenized_dataset)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,4.5447
20,1.0198
30,0.6444
40,0.6217
50,0.599
60,0.5579
70,0.4923
80,0.6074
90,0.5128
100,0.4889




# Inference

In [15]:
def generate_text(prompt, model, tokenizer, max_length=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
prompt = "The purpose of life is to enjoy"
generated_text = generate_text(prompt, model, tokenizer)
print(generated_text)

References

Quantization:

https://youtu.be/6S59Y0ckTm4?si=kDhMScvCrOFK9uZa

https://youtu.be/0VdNflU08yA?si=vjjRTgLsWsTSvntz

https://medium.com/@techresearchspace/what-is-quantization-in-llm-01ba61968a51

https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-quantization

https://medium.com/@metechsolutions/llm-by-examples-use-bitsandbytes-for-quantization-cf33aa8bfe16

https://huggingface.co/blog/merve/quantization