## Finetune PaliGemma for Object Detection 

#### CREATE PALIGEMMA STYLE DATASET

In [24]:
import torch

In [1]:
from pg_datasets import coco_to_xyxy, show_random_elements, string2list, resize_bbox, coco_to_xyxy, convert_to_detection_string

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset

xray_dataset = load_dataset('dmusingu/object-detection-chest-x-ray')

In [4]:
train_dataset = xray_dataset['train']
valid_dataset = xray_dataset['validation']
test_dataset = xray_dataset['test']

In [5]:
show_random_elements(train_dataset.remove_columns(['image']), num_examples=2)

Unnamed: 0,id,image_id,category_id,bbox,area,segmentation,iscrowd,license,height,width
0,131,124,ICD,"[663, 258, 175, 145]",25375.0,[],0,1,1024,1024
1,542,510,Vascular Port,"[713, 0, 112.5, 85]",9562.5,[],0,1,1024,1024


In [6]:
train_dataset = train_dataset.map(lambda x: {'bbox': string2list(x['bbox'])})
valid_dataset = valid_dataset.map(lambda x: {'bbox': string2list(x['bbox'])})
test_dataset = test_dataset.map(lambda x: {'bbox': string2list(x['bbox'])})

In [7]:
train_dataset = train_dataset.map(resize_bbox)
valid_dataset = valid_dataset.map(resize_bbox)
test_dataset = test_dataset.map(resize_bbox)

In [8]:
train_dataset = train_dataset.map(coco_to_xyxy)
valid_dataset = valid_dataset.map(coco_to_xyxy)
test_dataset = test_dataset.map(coco_to_xyxy)

In [9]:
train_dataset = train_dataset.map(convert_to_detection_string)
valid_dataset = valid_dataset.map(convert_to_detection_string)
test_dataset = test_dataset.map(convert_to_detection_string)

In [10]:
print(train_dataset)

Dataset({
    features: ['image', 'id', 'image_id', 'category_id', 'bbox', 'area', 'segmentation', 'iscrowd', 'license', 'height', 'width', 'resized_bbox', 'xyxy', 'suffix'],
    num_rows: 736
})


In [11]:
# create a paligemma prefix
dataset_objects = list(set(train_dataset['category_id']))
dataset_objects

['Shoulder Endoprosthesis', 'Necklace', 'ICD', 'Vascular Port']

In [12]:
prefix = 'detect ' + "; ".join(dataset_objects)
prefix

'detect Shoulder Endoprosthesis; Necklace; ICD; Vascular Port'

In [13]:
train_prefix = [prefix] * len(train_dataset)
valid_prefix = [prefix] * len(valid_dataset)
test_prefix = [prefix] * len(test_dataset)

In [14]:
train_dataset = train_dataset.add_column('prefix', train_prefix)
valid_dataset = valid_dataset.add_column('prefix', valid_prefix)
test_dataset = test_dataset.add_column('prefix', test_prefix)

In [15]:
show_random_elements(train_dataset.remove_columns(['image']), num_examples=2)

Unnamed: 0,id,image_id,category_id,bbox,area,segmentation,iscrowd,license,height,width,resized_bbox,xyxy,suffix,prefix
0,113,107,Vascular Port,"[690.0, 280.0, 82.5, 77.5]",6393.75,[],0,1,1024,1024,"[150, 61, 18, 16]","[150, 61, 168, 77]",<loc0061><loc0150><loc0077><loc0168> Vascular Port,detect Shoulder Endoprosthesis; Necklace; ICD; Vascular Port
1,241,230,Vascular Port,"[333.0, 413.0, 75.0, 107.5]",8062.5,[],0,1,1024,1024,"[72, 90, 16, 23]","[72, 90, 88, 113]",<loc0090><loc0072><loc0113><loc0088> Vascular Port,detect Shoulder Endoprosthesis; Necklace; ICD; Vascular Port


### Pytorch Dataset and DataLoader

In [16]:
from pg_datasets import ObjectDetectionDataset
from torch.utils.data import DataLoader

In [17]:
pg_train_dataset = ObjectDetectionDataset(train_dataset)
pg_valid_dataset = ObjectDetectionDataset(valid_dataset)
pg_test_dataset = ObjectDetectionDataset(test_dataset)

In [18]:
pg_train_dataloader = DataLoader(pg_train_dataset, batch_size=2, shuffle=True, collate_fn=pg_train_dataset.collate_fn)
pg_valid_dataloader = DataLoader(pg_valid_dataset, batch_size=2, shuffle=False, collate_fn=pg_valid_dataset.collate_fn)
pg_test_dataloader = DataLoader(pg_test_dataset, batch_size=2, shuffle=False, collate_fn=pg_test_dataset.collate_fn)

In [19]:
train_sample = next(iter(pg_train_dataloader))

In [20]:
from train import train

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
from transformers import PaliGemmaForConditionalGeneration

In [23]:
model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-pt-224"
)

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [25]:
import random
# Select 5 images from the validation set for logging
log_indices = random.sample(range(len(valid_dataset)), 5)
log_indices

[3, 83, 28, 20, 18]

In [26]:
train_config = dict(
    num_epochs = 10,
    eval_interval = 45,  # Evaluate every 'eval_interval' steps
    loss_scaling_factor = 1000.0,  # Variable to scale the loss by a certain amount
    save_dir = '../models',
    accumulation_steps = 8,  # Accumulate gradients over this many steps
    optimizer = "AdamW",
    num_log_samples = 10,
    learning_rate = 3e-5,
    model_id = 'google/paligemma-3b-pt-224',
    model_dtype = torch.bfloat16,
    model_revision = "bfloat16"
)

In [27]:
from model import processor

In [28]:
tokenizer = processor.tokenizer

In [29]:
optimizer = torch.optim.AdamW(model.parameters(), lr=train_config['learning_rate'])

In [None]:
train(
    model,
    pg_train_dataloader,
    pg_valid_dataloader,
    num_epochs=1,
    device = 'cpu',
    train_config = train_config,
    optimizer = optimizer,
    log_indices = log_indices,
    tokenizer = tokenizer
)