In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [2]:
import os
import transformers
import torch
from google.colab import userdata
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

### HugginFace Token

 - In order to access gemma model from huggingface we need the access token from huggingface.

 - Crete your account if not already then create the access token in read format

 - In Colab, add your key in secrets section with the name of HF_TOKEN or any other name

In [3]:
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

### Loading Gemma 2b and Quantization Process

In [4]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config = bnb_config, device_map={"":0}, token=os.environ['HF_TOKEN'])



tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

### Inferencing the Model

In [25]:
text = "Quote: Try not to become a man of success, "
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: Try not to become a man of success, 
but rather try to become a man of value.
- Albert Einstein

The quote above is


### Finetuning the Gemma Model
  - Dataset used: Abirate/english_qoutes |  | <a href="https://huggingface.co/datasets/Abirate/english_quotes">Link</a>

In [28]:
os.environ["WANDB_DISABLED"] = "true"
# disable the automatic logging of metrics, system information, and other data to Weights & Biases (W&B), which is a tool for tracking and visualizing machine learning experiments.

In [29]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

'''
q_proj (Query Projection): This is a linear transformation applied to the query part of the attention mechanism.
It projects the query input into a different space to calculate attention scores.

k_proj (Key Projection): Similar to q_proj, this is a linear transformation applied to the key part of the attention mechanism.
It helps in comparing keys with queries to compute attention scores.

v_proj (Value Projection): This linear transformation is applied to the value part of the attention mechanism.
After attention scores are computed, they are used to weigh these values to get the output of the attention layer.

gate_proj (Gate Projection): This is part of the feed-forward network within a transformer layer.
It usually involves a gating mechanism like GELU or sigmoid to control the flow of information.

up_proj & down_proj (Up and Down Projections): These are linear transformations used in the feed-forward network of a transformer layer.
They typically increase (up_proj) or reduce (down_proj) the dimensionality of the input data as part of the processing.
'''


'\nq_proj (Query Projection): This is a linear transformation applied to the query part of the attention mechanism. \nIt projects the query input into a different space to calculate attention scores.\n\nk_proj (Key Projection): Similar to q_proj, this is a linear transformation applied to the key part of the attention mechanism. \nIt helps in comparing keys with queries to compute attention scores.\n\nv_proj (Value Projection): This linear transformation is applied to the value part of the attention mechanism. \nAfter attention scores are computed, they are used to weigh these values to get the output of the attention layer.\n\ngate_proj (Gate Projection): This is part of the feed-forward network within a transformer layer. \nIt usually involves a gating mechanism like GELU or sigmoid to control the flow of information.\n\nup_proj & down_proj (Up and Down Projections): These are linear transformations used in the feed-forward network of a transformer layer. \nThey typically increase (u

In [30]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

In [31]:
def formatting_func(example):
    text = f"Quote: {example['quote'][0]}\nAuthor: {example['author'][0]}<eos>"
    return [text]
formatting_func(data["train"])

['Quote: “Be yourself; everyone else is already taken.”\nAuthor: Oscar Wilde<eos>']

In [33]:
import transformers
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        # Copied from other hugging face tuning blog posts
        learning_rate=2e-4,
        fp16=True,
        # It makes training faster
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    peft_config=lora_config,
    formatting_func=formatting_func,
    packing=False
)
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
1,1.8092
2,0.6791
3,1.1
4,1.1027
5,0.4401
6,1.2924
7,1.1608
8,0.3786
9,0.6052
10,0.5579


TrainOutput(global_step=20, training_loss=0.6546215504407883, metrics={'train_runtime': 14.609, 'train_samples_per_second': 5.476, 'train_steps_per_second': 1.369, 'total_flos': 11954040508416.0, 'train_loss': 0.6546215504407883, 'epoch': 13.33})

In [34]:
text = "Quote: Try not to become a man of success, "
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)

In [35]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Quote: Try not to become a man of success, 
but rather try to become a man of value.
Author: Albert Einstein
