<a href="https://colab.research.google.com/github/ManuSinghYadav/gemma-7b-ft-lora-500-alpaca/blob/main/Gemma-7b-ft-lora-500-alpaca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Previous [notbook](https://colab.research.google.com/drive/1xFj4oznNe-i4Vgwaj21YZwbp7RQUaXrO?authuser=1#scrollTo=WsUlOjX4rVEY).

##Importing libraries & Logging in

In [1]:
!pip install -q peft datasets evaluate trl bitsandbytes transformers accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━

In [2]:
import torch
import pandas as pd
import transformers
import huggingface_hub
from google.colab import userdata
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

In [3]:
huggingface_hub.login(token = userdata.get('HF_TOKEN_WRITE'))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


##Downloading Model and Tokenizer

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [16]:
model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", quantization_config = bnb_config, device_map={"":0})

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

##Importing Dataset

In [7]:
dataset = load_dataset("yahma/alpaca-cleaned", split="train")

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
dataset

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 51760
})

In [9]:
dataset = dataset.remove_columns(['input'])
dataset_limit = dataset.select(range(300))
dataset_limit

Dataset({
    features: ['output', 'instruction'],
    num_rows: 300
})

In [10]:
dataset_split = dataset_limit.train_test_split(test_size=0.1)
dataset_split

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction'],
        num_rows: 270
    })
    test: Dataset({
        features: ['output', 'instruction'],
        num_rows: 30
    })
})

##Tokenizing

In [11]:
df_train = pd.DataFrame(dataset_split['train'])
df_test = pd.DataFrame(dataset_split['test'])

df_train['prompt'] = "###Input: " + df_train['instruction'] + "\n\n" + "###Output: " + df_train['output'] + "\n"
df_test['prompt'] = "###Input: " + df_test['instruction'] + "\n\n" + "###Output: " + df_test['output'] + "\n"

In [12]:
base_model_id = "google/gemma-7b"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True
)
tokenizer.pad_token = tokenizer.eos_token

In [13]:
max_length = 512 #Need to be changed

def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        prompt,
        #return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    #result["prompt"] = prompt
    return result

train_dataset = df_train['prompt']
val_dataset = df_test['prompt']
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = val_dataset.map(generate_and_tokenize_prompt)

In [14]:
tokenized_train_dataset[0]

{'input_ids': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

##Setup QLoRA and LoRA config

In [15]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [16]:
peft_config = LoraConfig(task_type="CASUAL_LM", r=8, lora_alpha=32, lora_dropout=0.1, target_modules =  ["q_proj", "o_proj", "k_proj", "v_proj","gate_proj", "up_proj", "down_proj"])

model_peft = model.enable_input_require_grads()
model_peft = get_peft_model(model, peft_config)
model_peft.print_trainable_parameters()

trainable params: 25,001,984 || all params: 8,562,682,880 || trainable%: 0.29198773737606876


##Training

In [17]:
project = "gemma-7b-ft-80row-alpaca-correcting-mistakes"
output_dir = "./" + project

trainer = transformers.Trainer(
    model=model_peft,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
				per_device_eval_batch_size=2,
        gradient_checkpointing=True,
        max_steps=150,
        num_train_epochs=1,
        learning_rate=5e-5,
				gradient_accumulation_steps=3,
        optim="paged_adamw_8bit",
        save_strategy="steps",
        save_steps=25,
				logging_strategy="steps",
				logging_steps = 1,
        evaluation_strategy="no",
        eval_steps=2,
        do_eval=False,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer_stats = trainer.train()



Step,Training Loss
1,3.9406
2,2.3534
3,1.9076
4,1.644
5,1.3447
6,1.638
7,1.4449
8,1.3639
9,1.4226
10,1.7926




In [19]:
trainer_stats

TrainOutput(global_step=150, training_loss=0.9718863366047541, metrics={'train_runtime': 1984.5384, 'train_samples_per_second': 0.454, 'train_steps_per_second': 0.076, 'total_flos': 2.1499778433024e+16, 'train_loss': 0.9718863366047541, 'epoch': 3.33})

##Pushing to hub

In [18]:
trainer.push_to_hub()

adapter_model.safetensors:   0%|          | 0.00/100M [00:00<?, ?B/s]

events.out.tfevents.1709637347.3f4bbc8526e7.1808.0:   0%|          | 0.00/36.6k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/msinghy/gemma-7b-ft-80row-alpaca-correcting-mistakes/commit/d8744888f160a41cc25f84452d522a07e501943b', commit_message='End of training', commit_description='', oid='d8744888f160a41cc25f84452d522a07e501943b', pr_url=None, pr_revision=None, pr_num=None)