In [1]:
#export HF_TOKEN = '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'

In [2]:
import os
from datasets import load_dataset, load_from_disk
from transformers import PaliGemmaProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig, TrainingArguments, Trainer
import torch
from peft import get_peft_model, LoraConfig

In [3]:
os.environ["HF_TOKEN"] =  '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'

# Loading the DataSet

In [4]:
data = load_dataset('HuggingFaceM4/VQAv2', split="train[:10%]")
cols_remove = ["question_type", "answers", "answer_type", "image_id", "question_id"]
data = data.remove_columns(cols_remove)
split_data = data.train_test_split(test_size=0.05)
train_data = split_data["test"]
print(train_data[0])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


{'multiple_choice_answer': 'yes', 'question': 'Is it probably cold here?', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x501 at 0x28168E65930>}


# Load Model 

In [5]:
model_id = "leo009/paligemma-3b-pt-224"
#model_id = "google/paligemma-3b-pt-224"
processor = PaliGemmaProcessor.from_pretrained(model_id)
device = "cuda"
image_token = processor.tokenizer.convert_tokens_to_ids("<image>")
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
for param in model.vision_tower.parameters():
    param.required_grad = True 

for param in model.multi_modal_projector.parameters():
    param.required_grad = True

In [22]:
for param in model.parameters():
    print(param.requires_grad)  

True
True
True
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
True
True
False
False
True
True
False
False
False
False
False
False
F

# Loading Quantised Model 

In [7]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16
)

In [8]:
lora_config = LoraConfig(
    r = 8,
    lora_alpha = 32,
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj",
                    "gate_proj", "up_proj", "down_proj"]
)

In [9]:
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id,quantization_config=bnb_config, device_map = {"":0})

model = get_peft_model(model,lora_config)

model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 11,298,816 || all params: 2,934,765,296 || trainable%: 0.3849989644964099


# Finetuning the Model

In [10]:
def collate_fn(examples):
    texts = ["answer " + example["question"] for example in examples]
    labels = [example['multiple_choice_answer'] for example in examples]
    images = [example["image"].convert("RGB") for example in examples]
    tokens = processor(text=texts, images=images, suffix=labels,
                       return_tensors="pt", padding="longest",
                       tokenize_newline_separately=False)
    tokens = tokens.to(torch.bfloat16).to(device)
    return tokens 

In [11]:
from transformers import TrainingArguments
args=TrainingArguments(
            num_train_epochs=2,
            remove_unused_columns=False,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            learning_rate=2e-5,
            weight_decay=1e-6,
            adam_beta2=0.999,
            logging_steps=100,
            optim="adamw_hf",
            save_strategy="steps",
            save_steps=1000,
            push_to_hub=True,
            save_total_limit=1,
            output_dir="paligemma_vqav2",
            bf16=True,
            report_to=["tensorboard"],
            dataloader_pin_memory=False
        )


In [12]:
trainer = Trainer(
    model = model,
    train_dataset = train_data,
    data_collator=collate_fn,
    args = args
)

In [13]:
import torch

torch.cuda.empty_cache()


In [23]:
trainer.train()

  0%|          | 0/276 [00:00<?, ?it/s]

{'loss': 49391.23, 'grad_norm': nan, 'learning_rate': 2e-05, 'epoch': 0.72}
{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2e-05, 'epoch': 1.44}
{'train_runtime': 27857.5245, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.01, 'train_loss': 17895.373188405796, 'epoch': 1.99}


TrainOutput(global_step=276, training_loss=17895.373188405796, metrics={'train_runtime': 27857.5245, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.01, 'total_flos': 1.7286077798096352e+16, 'train_loss': 17895.373188405796, 'epoch': 1.9891891891891893})

In [24]:
trainer.push_to_hub("Dhanushkumar/paligemma_VQAv2")

CommitInfo(commit_url='https://huggingface.co/Dhanushkumar/paligemma_vqav2/commit/a1c0517d4345e085ae2793bea386f0606ccceadf', commit_message='Dhanushkumar/paligemma_VQAv2', commit_description='', oid='a1c0517d4345e085ae2793bea386f0606ccceadf', pr_url=None, pr_revision=None, pr_num=None)

# Test the trained Model

In [25]:
import os
import sys
import site
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, PaliGemmaProcessor, AutoModelForPreTraining
from huggingface_hub import login
import torch
from PIL import Image
import requests

In [26]:
# Setup site packages path
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
site_packages_path = os.path.expanduser(f'~/.local/lib/python{python_version}/site-packages')
site.addsitedir(site_packages_path)

In [27]:
# Login to Hugging Face Hub
token = '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'
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\dhanu\.cache\huggingface\token
Login successful


`Load PeftConfig and Base Model`

In [None]:
config = PeftConfig.from_pretrained("Dhanushkumar/paligemma_VQAv2")

base_model = AutoModelForPreTraining.from_pretrained("leo009/paligemma-3b-pt-224")
model = PeftModel.from_pretrained(base_model,"Dhanushkumar/paligemma_VQAv2")

# Loading and Processing the Image

In [None]:
input_text = "What is in this image?"
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/bee.JPG?download=true"
input_image = Image.open(requests.get(img_url, stream=True).raw)


In [None]:
# Loading PaliGemma Processor
processor = PaliGemmaProcessor.from_pretrained("leo009/paligemma-3b-pt-224")


In [None]:
# Preprocessing Inputs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = processor(text=input_text, images=input_image, padding="longest", do_convert_rgb=True, return_tensors="pt").to(device)
model.to(device)
inputs = inputs.to(dtype=model.dtype)


In [None]:
# Generating and Decoding Output
with torch.no_grad():
    output = model.generate(**inputs, max_length=496)

print(processor.decode(output[0], skip_special_tokens=True))

# Analyse VQAv2 Dataset

In [None]:
import os
import re
from datasets import load_dataset
from PIL import Image

# Create a directory to save images
os.makedirs('saved_images', exist_ok=True)

# Load and preprocess the dataset
ds = load_dataset('HuggingFaceM4/VQAv2', split="train[:10%]")
cols_keep = ["question_type", "answers", "answer_type", "image", "image_id", "question_id", "question"]
ds = ds.remove_columns([col for col in ds.column_names if col not in cols_keep])
split_ds = ds.train_test_split(test_size=0.05)
train_ds = split_ds["test"]

# Helper function to sanitize filenames
def sanitize_filename(text):
    text = re.sub(r'\W+', '_', text)
    return text[:50]  # Truncate to 50 characters for safety

# Save 10 images from the dataset with filenames based on metadata
for i in range(10):
    example = train_ds[i]
    image = example["image"]
    question_type = sanitize_filename(example["question_type"])
    question = sanitize_filename(example["question"])
    # Combine unique answers, truncating and sanitizing the string
    answers = "_".join(sorted(set([sanitize_filename(ans['answer']) for ans in example["answers"]])))
    answers = answers[:50]  # Truncate to 50 characters for safety
    answer_type = sanitize_filename(example["answer_type"])
    
    filename = f"{question_type}_{question}_{answers}_{answer_type}.jpg"
    filepath = os.path.join('saved_images', filename)
    
    image.save(filepath)

print("10 images saved successfully.")