### Importing Dependencies

In [2]:
# !pip install pandas datasets matplotlib pillow opencv-python numpy albumentations transformers torch peft



In [3]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt
from PIL import Image
import json
from collections import defaultdict
import cv2
import numpy as np
import albumentations as A
from PIL import Image
import random
from transformers import BlipProcessor, BlipForConditionalGeneration, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import os

## Load and process Data

#### Import the original data from HuggingFace

In [4]:
# original data
dataset = load_dataset("philschmid/amazon-product-descriptions-vlm")
product_images = dataset['train']['image']
product_desc = dataset['train']['description']
uniq_ids = dataset['train']['Uniq Id']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/47.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1345 [00:00<?, ? examples/s]

#### Load the captions we generated using genai

In [5]:
# generated genai descriptions
id_to_description = defaultdict(list)
with open("./data.json", "r") as file:
    data = json.load(file)  # Load JSON data
id_to_description = defaultdict(list)
for key, value in data.items():
    if isinstance(value, list):  # Ensure values are lists
        id_to_description[key] = value

len(id_to_description)

1345

#### To help the model generalize better lets define augmentations to be performed for the image. Then randomly assign each augmentation to each image and description pair(we keep the original description with a non augmented image).

In [None]:
# Define augmentations with weighting (more copies = higher probability)
augmentations = [
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=1),
    A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=1),
    A.HorizontalFlip(p=1),
    A.GaussianBlur(blur_limit=(3, 7), p=1),
    A.RGBShift(r_shift_limit=20, g_shift_limit=20, b_shift_limit=20, p=1)
]

# Create directory for augmented images
save_dir = "augmented_images"
os.makedirs(save_dir, exist_ok=True)

# Create dataset with image paths instead of storing images in memory
new_data = []

for uid, image, original_desc in zip(uniq_ids, product_images, product_desc):
    generated_desc = id_to_description.get(uid, [])

    # Convert PIL image to numpy array
    image_np_original = np.array(image)

    descriptions = [original_desc] + generated_desc

    for i, desc in enumerate(descriptions):
        if i == 0:
            # Save original image
            augmented_image = image
        else:
            # Reset the image for each augmentation
            image_np = image_np_original.copy()

            # Apply 1 to 3 augmentations with weighted random selection
            num_augmentations = random.randint(1, 3)
            aug_choices = random.choices(augmentations, k=num_augmentations)

            for aug in aug_choices:
                image_np = aug(image=image_np)["image"]

            # Convert back to PIL Image
            augmented_image = Image.fromarray(image_np)

        # Save image to disk and store the file path
        image_filename = f"{uid}_{i}.jpg"
        image_path = os.path.join(save_dir, image_filename)
        augmented_image.save(image_path)

        # Store data with image path instead of actual image
        new_data.append({
            "Uniq Id": uid,
            "Image_Path": image_path,
            "Description": desc
        })

# Convert to DataFrame
augmented_df = pd.DataFrame(new_data)

dataset = None
product_images = None
product_desc = None
uniq_ids = None

  original_init(self, **validated_kwargs)


## Model

#### I've chosen the base version of the Salesforce Blip Image Captioning model because I have limited resources for training. Better performance could be achieved by using the larger Blip models.

In [7]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
base_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda" if torch.cuda.is_available() else "cpu")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

#### Let's see how many trainable params I've got

In [8]:
def print_trainable_parameters(model):
    """
  printing the number of trainable paramters in the model
  """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

print_trainable_parameters(base_model)

trainable params: 247414076 || all params: 247414076 || trainable%: 100.0


## LoRA

#### Again because I have got limited resources I won't be fully fine tuning this 247M paramter model. I've done some research into other options for fine tuning and decided to settle on LoRA for fine tuning. There is a very good explaination of it that I found here: [here](https://medium.com/@manindersingh120996/practical-guide-to-fine-tune-llms-with-lora-c835a99d7593) and another one [here](https://ai.plainenglish.io/understanding-low-rank-adaptation-lora-for-efficient-fine-tuning-of-large-language-models-082d223bb6db). The links also suggest that for fine tuning with LoRA we first must fine the modules we want to target and I've used the code provided by the first link in the cell below.

In [10]:
for name, module in base_model.named_modules():
    if 'attn' in name or 'attention' in name:  # Common attention module names
        print(name)
        for sub_name, sub_module in module.named_modules():  # Check sub-modules within attention
            print(f"  - {sub_name}")

vision_model.encoder.layers.0.self_attn
  - 
  - dropout
  - qkv
  - projection
vision_model.encoder.layers.0.self_attn.dropout
  - 
vision_model.encoder.layers.0.self_attn.qkv
  - 
vision_model.encoder.layers.0.self_attn.projection
  - 
vision_model.encoder.layers.1.self_attn
  - 
  - dropout
  - qkv
  - projection
vision_model.encoder.layers.1.self_attn.dropout
  - 
vision_model.encoder.layers.1.self_attn.qkv
  - 
vision_model.encoder.layers.1.self_attn.projection
  - 
vision_model.encoder.layers.2.self_attn
  - 
  - dropout
  - qkv
  - projection
vision_model.encoder.layers.2.self_attn.dropout
  - 
vision_model.encoder.layers.2.self_attn.qkv
  - 
vision_model.encoder.layers.2.self_attn.projection
  - 
vision_model.encoder.layers.3.self_attn
  - 
  - dropout
  - qkv
  - projection
vision_model.encoder.layers.3.self_attn.dropout
  - 
vision_model.encoder.layers.3.self_attn.qkv
  - 
vision_model.encoder.layers.3.self_attn.projection
  - 
vision_model.encoder.layers.4.self_attn
  - 
  -

In [11]:
for name, module in base_model.named_modules():
    print(name)  # This will list all module names


vision_model
vision_model.embeddings
vision_model.embeddings.patch_embedding
vision_model.encoder
vision_model.encoder.layers
vision_model.encoder.layers.0
vision_model.encoder.layers.0.self_attn
vision_model.encoder.layers.0.self_attn.dropout
vision_model.encoder.layers.0.self_attn.qkv
vision_model.encoder.layers.0.self_attn.projection
vision_model.encoder.layers.0.layer_norm1
vision_model.encoder.layers.0.mlp
vision_model.encoder.layers.0.mlp.activation_fn
vision_model.encoder.layers.0.mlp.fc1
vision_model.encoder.layers.0.mlp.fc2
vision_model.encoder.layers.0.layer_norm2
vision_model.encoder.layers.1
vision_model.encoder.layers.1.self_attn
vision_model.encoder.layers.1.self_attn.dropout
vision_model.encoder.layers.1.self_attn.qkv
vision_model.encoder.layers.1.self_attn.projection
vision_model.encoder.layers.1.layer_norm1
vision_model.encoder.layers.1.mlp
vision_model.encoder.layers.1.mlp.activation_fn
vision_model.encoder.layers.1.mlp.fc1
vision_model.encoder.layers.1.mlp.fc2
visio

#### Choices for Hyper Params for LoRA

First r=8 defines the rank of the low-rank decomposition. Lowering this would mean less computation but less effective finetuning and increasing would have the complete opposite effect. Next alpha=32 is the influence that the adapted fine tuning weights should have over the original model. Finally I've chosen these target modules for where the resulting matrices should be made. From some internet research I've found that targeting the K, Q, and V projections in attention models is a common practice as it plays a major role in model influence. And that makes sense to me as any ML course I've taken that covers self-attention seems to dig deep on K, Q, and V. I've also decided to add output.dense which is the projection layer after self-attention. Fine-tuning it can help learn better attention outputs. I will be choosing to not add the mlp.fc1 and mlp.fc2 layers as those would require quite a lot of extra compute which I do not have.

In [None]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank (trade-off between efficiency and accuracy)
    lora_alpha=32,  # Scaling factor
    target_modules = [
        "self_attn.qkv",
        "self_attn.projection",
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "crossattention.self.query",
        "crossattention.self.key",
        "crossattention.self.value",
        "crossattention.output.dense",
    ],
    lora_dropout=0.1,
    bias="none",
)

# Convert the BLIP model to a LoRA model
lora_model = get_peft_model(base_model, lora_config)

# checkpoint_dir = "./blip-lora-finetuned/checkpoint-1060" # Example
# final_model = PeftModel.from_pretrained(base_model, checkpoint_dir, is_trainable=True)
# print(f"Successfully loaded PEFT adapter from {checkpoint_dir}")

lora_model.print_trainable_parameters()  # Check trainable parameters



trainable params: 1,622,016 || all params: 249,036,092 || trainable%: 0.6513


So we have about 1.5M trainable params which account for a mere .62% of the total params. Ofcourse we can add more target modules into this and increase these trainable parameters but I will choose not to. Maybe if I'm not unemployed soon I can spend some money for real GPUs.

## Data Loaders

#### DataLoader class

In [13]:
class AmazonProductDataset(Dataset):
    def __init__(self, dataframe, processor):
        self.data = dataframe
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        image_path = item["Image_Path"]  # Get file path
        caption = item["Description"]

        # Load image from disk
        image = Image.open(image_path).convert("RGB")

        # Process image without passing the text as input
        encoding = self.processor(images=image, text='', return_tensors="pt", padding="max_length", truncation=True)
        labels = self.processor.tokenizer(caption, return_tensors="pt", padding="max_length", truncation=True)["input_ids"]

        # Replace padding token ID with -100 so they're ignored in loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100


        return {
            "pixel_values": encoding["pixel_values"].squeeze(),
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": labels.squeeze(),
        }

# Create dataset and dataloader
dataset = AmazonProductDataset(augmented_df, processor)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

## Actual Fine Tuning

#### Finally we get to the good stuff where I can actuallt run this thing and have it train

My dataset is only 9427 documents but I still believe having a validation set will help with fine tuning. So I will go with only 10% validation set instead of a standard 20%. For test I will just find images of new products online and rate the generated description myself.

In [14]:
# checkpoint_dir = "./blip-lora-finetuned/checkpoint-530" # Example
# final_model = PeftModel.from_pretrained(lora_model, checkpoint_dir)

training_args = TrainingArguments(
    output_dir="./blip-lora-finetuned",
    gradient_accumulation_steps=2,
    num_train_epochs=9,
    save_steps=530,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=530,
    logging_dir="./logs",
    logging_steps=250,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    push_to_hub=False,
    fp16=True,
    label_names=["labels"]
)

# split into validation and training
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Add evaluation dataset
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkhushin-p30[0m ([33mkhushin-p30-illinois-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
530,6.9557,6.836062
1060,6.7498,6.696607
1590,6.6223,6.602266
2120,6.4926,6.555292
2650,6.4372,6.511395
3180,6.3949,6.483099
3710,6.3426,6.468755
4240,6.3011,6.459181
4770,6.2623,6.456017


TrainOutput(global_step=4770, training_loss=6.549091732576958, metrics={'train_runtime': 9840.675, 'train_samples_per_second': 7.759, 'train_steps_per_second': 0.485, 'total_flos': 4.556128158089006e+19, 'train_loss': 6.549091732576958, 'epoch': 8.983977379830348})

In [None]:
for batch in dataloader:
    print(batch.keys())
    break

#c00799c50b987917f892a822babf481e3e84708c

dict_keys(['pixel_values', 'input_ids', 'attention_mask', 'labels'])
