In [1]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [2]:
import torch
from PIL import Image

class VQADataset(torch.utils.data.Dataset):

    def __init__(self, dataset_load, processor_load):
        self.dataset = dataset_load
        self.processor = processor_load

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        question = self.dataset[idx]['question']
        answer = self.dataset[idx]['answer']
        image_id = self.dataset[idx]['image_id']
        image_path = self.dataset[idx]['image_path']
        image = Image.open(image_path).convert("RGB")
        text = question

        """Model Prediction"""

        """
        Image - Image loaded from csv file
        Text - the question loaded from csv file
        Padding - set to max length of the model - BlipProcessor set's this limit
        Truncation - If the question is too long, it cuts off extra tokens to fit max length.
        Return Tensors - Return the output as PyTorch tensors (default is list of ints).
        Return Attention Mask - Tells the model which tokens are actual input and which are padding

        Encoding returns
        {pixel_values:, input_ids:, attention_mask}
        pixel_values - raw pixel values of imafe
        input_ids - input model
        attention_mask - Returns a tensor like [1, 1, 1, 0, 0]
        """
        
        encoding = self.processor(image, text, padding="max_length", truncation=True, return_tensors="pt", return_attention_mask=True)
        
        """Encoding our Answers"""
        labels = self.processor.tokenizer.encode(
            str(answer),
            max_length= 16,
            truncation=True,
            padding="max_length",
            return_tensors='pt',
            return_attention_mask=True
        )

        # Add labels also to the dictionary
        encoding["labels"] = labels

        # Remove extra dimension from the attention mask
        encoding["attention_mask"] = encoding["attention_mask"].squeeze()
        
        # Remove batch dimension
        for dict_key,dict_value in encoding.items():  
            encoding[dict_key] = dict_value.squeeze()
            
        return encoding

In [3]:
from transformers import BlipProcessor, BlipForQuestionAnswering

# cached_dir = os.path.join('BLIP_checkpoints')
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")#,cache_dir=cached_dir)
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base",use_fast = True)#,cache_dir=cached_dir)
torch.manual_seed(42)

2025-05-15 08:53:32.141374: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747299212.330882      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747299212.388615      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

<torch._C.Generator at 0x79392fb10610>

In [27]:
import os
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset

kagglehub.dataset_download("hlgsagar1234567/vr-go")
kagglehub.dataset_download("rajan56/datacuration")

train_df = pd.read_csv("/kaggle/input/datacuration/data_curation_train.csv")  
val_df = pd.read_csv("/kaggle/input/datacuration/data_curation_val.csv")

train_df = train_df.sample(n=512, random_state=42).reset_index(drop=True)
val_df = val_df.sample(n=128, random_state=42).reset_index(drop=True)

print(len(train_df),len(val_df))

# Convert back to Hugging Face Datasets
training_data = Dataset.from_pandas(train_df)
valid_data = Dataset.from_pandas(val_df)

training_set = VQADataset(dataset_load=training_data, processor_load=processor)
valid_set = VQADataset(dataset_load=valid_data, processor_load=processor)

512 128


In [17]:
"""Visualising data returned from Dataset, VQA Dataset """
# print(training_data.column_names)
# for idx in range(1):
#     encoding = training_set[idx]
#     print("Encoding",encoding)

'Visualising data returned from Dataset, VQA Dataset '

In [28]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
BATCH_SIZE = 128

"""
The DataLoader + collate_fn work on your processed dataset items (training_set) 

training_set[idx] calls your __getitem__ method, which:
Loads the image,
Processes it with the processor to create tensors (input_ids, pixel_values, attention_mask),
Encodes labels,
Returns a dictionary of tensors
"""

def custom_collate(batch):
    
    # Separate each element in the batch
    input_ids = [item["input_ids"] for item in batch]
    pixel_values = [item["pixel_values"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    """input_ids"""
    # Pad sequences to the maximum length in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)  # Adjust padding_value as needed

    """labels"""
    # The special value -100 is the default ignore_index in PyTorch’s CrossEntropyLoss.
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    """pixel_values"""
    """attention_mask"""
    # stacks all items -> [3, H, W] => [batch_size, 3, H, W]
    pixel_values_stacked = torch.stack(pixel_values)  
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    # Return the batch as a dictionary
    return {
        "input_ids": input_ids_padded,
        "pixel_values": pixel_values_stacked,
        "attention_mask": attention_mask_padded,
        "labels": labels_padded,
    }

# Use this collate function in your DataLoader
train_dataloader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate)
valid_dataloader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate)

In [19]:
# for row in train_dataloader:
#     for k,v in row.items():
#         print(k)
#     break

In [8]:
# for name, module in model.named_modules():
#     print(name)

In [29]:
text_encoder_target_modules = [
    f"text_decoder.bert.encoder.layer.{i}.crossattention.self.query" for i in range(1, 12)
] + [
    f"text_decoder.bert.encoder.layer.{i}.crossattention.self.value" for i in range(1, 12)
]

print(text_encoder_target_modules)

['text_decoder.bert.encoder.layer.1.crossattention.self.query', 'text_decoder.bert.encoder.layer.2.crossattention.self.query', 'text_decoder.bert.encoder.layer.3.crossattention.self.query', 'text_decoder.bert.encoder.layer.4.crossattention.self.query', 'text_decoder.bert.encoder.layer.5.crossattention.self.query', 'text_decoder.bert.encoder.layer.6.crossattention.self.query', 'text_decoder.bert.encoder.layer.7.crossattention.self.query', 'text_decoder.bert.encoder.layer.8.crossattention.self.query', 'text_decoder.bert.encoder.layer.9.crossattention.self.query', 'text_decoder.bert.encoder.layer.10.crossattention.self.query', 'text_decoder.bert.encoder.layer.11.crossattention.self.query', 'text_decoder.bert.encoder.layer.1.crossattention.self.value', 'text_decoder.bert.encoder.layer.2.crossattention.self.value', 'text_decoder.bert.encoder.layer.3.crossattention.self.value', 'text_decoder.bert.encoder.layer.4.crossattention.self.value', 'text_decoder.bert.encoder.layer.5.crossattention.se

In [30]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, # rank of the low-rank matrices that LoRA uses to modify certain parts of the model
    lora_alpha=32, # scale LoRA weights, effectively controlling how strongly the LoRA layers impact the model’s predictions
    lora_dropout=0.05, #
    bias="none", # whether or not to add biases to the LoRA-modified layers
    target_modules=text_encoder_target_modules      # specifies the exact layers in the model where LoRA should be applied, typically in attention mechanisms
                                        # ["q_proj", "k_proj"] are standard as they affect the query and key projections in attention layers
)

lora_model = get_peft_model(model, config)
lora_model.print_trainable_parameters()

trainable params: 270,336 || all params: 384,942,908 || trainable%: 0.0702


## Sagar's Variant

In [31]:
from huggingface_hub import HfApi
def get_latest_checkpoint_from_hf(repo_id, token=None):
    """
    Fetch the latest checkpoint folder name or path from a Hugging Face repo.

    repo_id: str, e.g. "username/modelname"
    token: Optional str, your HF token if private repo

    Returns:
    latest_checkpoint: str or None
    """

    login(token=token)
    api = HfApi()
    # List all files in the repo root (or a specific folder if you organize checkpoints)
    files = api.list_repo_files(repo_id=repo_id, token=token)

    # Filter checkpoint folders/files
    checkpoints = [f for f in files if f.startswith('checkpoint')]

    if not checkpoints:
        return None

    # Assuming checkpoints have incremental naming like checkpoint1, checkpoint2...
    # Sort them based on numeric suffix
    checkpoints.sort(key=lambda x: int(''.join(filter(str.isdigit, x))))

    return checkpoints[-1] 

def push_tracking_info_to_hub(tracking_info, repo_id, commit_message="Update tracking info"):
    with io.BytesIO() as f:
        pickle.dump(tracking_info, f)
        f.seek(0)
        api.upload_file(
            path_or_fileobj=f,
            path_in_repo="tracking_info.pkl",
            repo_id=repo_id,
            repo_type="model",
            commit_message=commit_message,
            token="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"
        )

In [37]:
!pip install bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert_score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert_score)
  

In [40]:
from tqdm import tqdm
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import BlipForQuestionAnswering, BlipProcessor
import torch
import os
import json
from bert_score import score as bert_score

login(token="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")
api = HfApi()
REPO_ID = "adityaav80/blip-basevqa-finetuned"
# HF_TOKEN = ""

def train(model, processor, train_dataloader, valid_dataloader, num_epochs, resume_training=False, patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    start_epoch = 0
    min_eval_loss = float('inf')
    early_stopping_hook = 0

    scaler = torch.amp.GradScaler(device="cuda") if device.type == "cuda" else None
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    for epoch in range(start_epoch, num_epochs):
        model.train()
        epoch_loss = 0

        for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            if device.type == "cuda":
                with torch.amp.autocast(device_type="cuda"):
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                    attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(train_dataloader)

        # Evaluation
        model.eval()
        eval_loss = 0
        examples = []
        preds_list = []
        refs_list = []

        with torch.no_grad():
            for batch in tqdm(valid_dataloader, desc=f'Validation Epoch {epoch+1}/{num_epochs}'):
                input_ids = batch['input_ids'].to(device)
                pixel_values = batch['pixel_values'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with torch.amp.autocast(device_type="cuda") if device.type == "cuda" else torch.no_grad():
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                    attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss

                    generated_ids = model.generate(
                        input_ids=input_ids,
                        pixel_values=pixel_values,
                        attention_mask=attention_mask,
                        max_length=20
                    )

                preds = processor.batch_decode(generated_ids, skip_special_tokens=True)
                refs = processor.batch_decode(labels, skip_special_tokens=True)

                for p, r in zip(preds, refs):
                    examples.append({'pred': p.strip(), 'ref': r.strip()})
                    preds_list.append(p.strip())
                    refs_list.append(r.strip())

                eval_loss += loss.item()

        avg_eval_loss = eval_loss / len(valid_dataloader)

        # Calculate BERTScore for entire validation set predictions vs references
        P, R, F1 = bert_score(preds_list, refs_list, lang="en", device=device)
        avg_bertscore_f1 = F1.mean().item()

        print(f"\nEpoch {epoch+1} Metrics:")
        print(f"Train Loss: {avg_epoch_loss:.4f} | Eval Loss: {avg_eval_loss:.4f}")
        print(f"BERTScore F1: {avg_bertscore_f1:.4f}")

        print("Sample predictions:")
        for i, ex in enumerate(examples[:5]):
            print(f"  {i+1}. Pred: '{ex['pred']}', Ref: '{ex['ref']}'")

        scheduler.step()

        # Save model checkpoint
        subfolder = f"epoch-{epoch+1}"
        os.makedirs(subfolder, exist_ok=True)
        model.save_pretrained(subfolder)

        api.upload_folder(
            folder_path=subfolder,
            path_in_repo=subfolder,
            repo_id=REPO_ID,
            token=HF_TOKEN
        )

        # Save metrics
        metrics_data = {
            "epoch": epoch + 1,
            "train_loss": avg_epoch_loss,
            "eval_loss": avg_eval_loss,
            "bertscore_f1": avg_bertscore_f1,
            "examples": examples[:5]
        }

        metrics_path = f"{subfolder}/metrics.json"
        with open(metrics_path, "w") as f:
            json.dump(metrics_data, f, indent=2)

        api.upload_file(
            path_or_fileobj=metrics_path,
            path_in_repo=f"{subfolder}/metrics.json",
            repo_id=REPO_ID,
            token=HF_TOKEN
        )

        print(f"Pushed model and metrics for epoch {epoch+1}")

        # Save best model
        if avg_eval_loss < min_eval_loss:
            model.push_to_hub(REPO_ID, commit_message=f"Best model at epoch {epoch+1}")
            processor.push_to_hub(REPO_ID)
            min_eval_loss = avg_eval_loss
            early_stopping_hook = 0
            print("New best model pushed.")
        else:
            early_stopping_hook += 1
            if early_stopping_hook > patience:
                print("Early stopping triggered.")
                break

In [41]:
from peft import PeftModel
NUM_EPOCHS = 30
PATIENCE = 5

# Start training
train(
    model=lora_model,
    processor=processor,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    num_epochs=NUM_EPOCHS,
    resume_training=True,
    patience=PATIENCE
)

Training Epoch 1/30: 100%|██████████| 4/4 [00:30<00:00,  7.55s/it]
Validation Epoch 1/30: 100%|██████████| 1/1 [00:14<00:00, 14.83s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1 Metrics:
Train Loss: 9.4015 | Eval Loss: 9.3733
BERTScore F1: 0.9811
Sample predictions:
  1. Pred: 'yes', Ref: 'hearts'
  2. Pred: 'green', Ref: 'blue'
  3. Pred: 'curved', Ref: 'rectangular'
  4. Pred: 'yes', Ref: 'water'
  5. Pred: 'no', Ref: 'no'


adapter_model.safetensors:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Pushed model and metrics for epoch 1


No files have been modified since last commit. Skipping to prevent empty commit.


New best model pushed.


Training Epoch 2/30: 100%|██████████| 4/4 [00:29<00:00,  7.46s/it]
Validation Epoch 2/30: 100%|██████████| 1/1 [00:14<00:00, 14.84s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 2 Metrics:
Train Loss: 9.3655 | Eval Loss: 9.3248
BERTScore F1: 0.9811
Sample predictions:
  1. Pred: '6', Ref: '4'
  2. Pred: 'yes', Ref: 'plastic'
  3. Pred: 'white', Ref: 'white'
  4. Pred: 'yes', Ref: 'blue'
  5. Pred: 'yes', Ref: 'same'


adapter_model.safetensors:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Pushed model and metrics for epoch 2


No files have been modified since last commit. Skipping to prevent empty commit.


New best model pushed.


Training Epoch 3/30: 100%|██████████| 4/4 [00:29<00:00,  7.41s/it]
Validation Epoch 3/30: 100%|██████████| 1/1 [00:14<00:00, 14.66s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 3 Metrics:
Train Loss: 9.3203 | Eval Loss: 9.2749
BERTScore F1: 0.9811
Sample predictions:
  1. Pred: 'metal', Ref: 'glass'
  2. Pred: '2', Ref: 'one'
  3. Pred: 'yes', Ref: 'white'
  4. Pred: 'green', Ref: 'green'
  5. Pred: 'curved', Ref: 'angled'


adapter_model.safetensors:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Pushed model and metrics for epoch 3


No files have been modified since last commit. Skipping to prevent empty commit.


New best model pushed.


Training Epoch 4/30: 100%|██████████| 4/4 [00:29<00:00,  7.37s/it]
Validation Epoch 4/30: 100%|██████████| 1/1 [00:14<00:00, 14.66s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 4 Metrics:
Train Loss: 9.2707 | Eval Loss: 9.2303
BERTScore F1: 0.9811
Sample predictions:
  1. Pred: 'rectangular', Ref: 'rectangular'
  2. Pred: 'no', Ref: 'sitting'
  3. Pred: 'yes', Ref: 'plastic'
  4. Pred: 'hand holding', Ref: 'front'
  5. Pred: '4', Ref: 'six'


adapter_model.safetensors:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Pushed model and metrics for epoch 4


No files have been modified since last commit. Skipping to prevent empty commit.


New best model pushed.


Training Epoch 5/30: 100%|██████████| 4/4 [00:29<00:00,  7.44s/it]
Validation Epoch 5/30: 100%|██████████| 1/1 [00:14<00:00, 14.65s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 5 Metrics:
Train Loss: 9.2270 | Eval Loss: 9.1961
BERTScore F1: 0.9810
Sample predictions:
  1. Pred: 'blue', Ref: 'blue'
  2. Pred: 'yes', Ref: 'sunset'
  3. Pred: 'no', Ref: 'vertical'
  4. Pred: 'black', Ref: 'black'
  5. Pred: 'no', Ref: 'slip'


adapter_model.safetensors:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Pushed model and metrics for epoch 5


No files have been modified since last commit. Skipping to prevent empty commit.


New best model pushed.


Training Epoch 6/30:  25%|██▌       | 1/4 [00:10<00:31, 10.57s/it]


KeyboardInterrupt: 

# CPU Variant

In [32]:
from tqdm import tqdm
from huggingface_hub import HfApi, login
import pickle


login(token="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")
repo_id = "adityaav80/blip-basevqa-finetuned"
api = HfApi()

def train(model, processor, train_dataloader, valid_dataloader, num_epochs, resume_training=False, patience=3):

    # Where to save the best models
    best_model_repo = "adityaav80/blip-basevqa-finetuned"

    # Initialising epoch
    # Initialising the lowest validation loss
    # Initialising how may times validation loss has not imporoved
    
    start_epoch = 0
    min_eval_loss = float('inf')
    early_stopping_hook = 0

    # Load the latest checkpoint if resuming training
    if resume_training:
        
        latest_checkpoint_path = get_latest_checkpoint_from_hf("adityaav80/blip-basevqa-finetuned","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")
        
        if latest_checkpoint_path:
            
            print(f"Resuming from checkpoint at {latest_checkpoint_path}")
            base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", cache_dir='BLIP_checkpoints\\')
            processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", cache_dir='BLIP_checkpoints\\')
            model = PeftModel.from_pretrained(base_model, f"https://huggingface.co/adityaav80/blip-basevqa-finetuned/{latest_checkpoint_path}")
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            
            model.to(device)

            # Set all params to non-trainable first
            for param in model.parameters():
                param.requires_grad = False

            ## Unset few parameters which have "lora"
            for name, param in model.named_parameters():
                if "lora" in name:  # Customize based on LoRA layers naming convention
                    param.requires_grad = True
            model.print_trainable_parameters()

            # get latest epoch from checkpint folder            
            start_epoch = int(latest_checkpoint_path.split('checkpoint')[-1])

    # GradScaler helps avoid numerical problems during backprop by scaling gradients.
    # This creates a gradient scaler for mixed precision training on CUDA.
    scaler = torch.amp.GradScaler("cuda")
    # Just an empty list to keep track of losses and metrics during training.
    tracking_information = []

    # set optimizer as adam with weight decay and trainable parameters
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    # this enables weight decay
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=False)

    ########################## backward propagation #################################3
    
    for epoch in range(start_epoch, num_epochs):

        ## track epoch_loss
        epoch_loss = 0
        model.train()


        for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}/{num_epochs}'):
            
            ## get the output from __get__item from VQADataset class
            input_ids = batch.pop('input_ids').to(model.device)
            pixel_values = batch.pop('pixel_values').to(model.device)
            attention_mask = batch.pop('attention_mask').to(model.device)
            labels = batch.pop('labels').to(model.device)

            """
            Usually we declare loss like this in the main function
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(outputs, labels)

            But HuggingFace comes with default built in loss function 
            you use .loss to access it
            """

            # clear previous iterations results before backward prop
            optimizer.zero_grad()

            with torch.amp.autocast("cuda"):
                outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # for current epoch
            epoch_loss += loss.item()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        ## total loss per batch / batch size 
        avg_epoch_loss = epoch_loss / len(train_dataloader)
        ## track information
        tracking_information.append({'epoch': epoch + 1, 'train_loss': avg_epoch_loss})

        ##########################################################################

        ############################# evaluation of model ########################

        # Validation step
        model.eval()
        eval_loss = 0
       
        with torch.no_grad():
            for batch in tqdm(valid_dataloader, desc=f'Validation Epoch {epoch+1}/{num_epochs}'):
                input_ids = batch.pop('input_ids').to(model.device)
                pixel_values = batch.pop('pixel_values').to(model.device)
                attention_mask = batch.pop('attention_mask').to(model.device)
                labels = batch.pop('labels').to(model.device)

                with torch.amp.autocast("cuda"):
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                    eval_loss += loss.item()

        avg_eval_loss = eval_loss / len(valid_dataloader)
        tracking_information[-1]['eval_loss'] = avg_eval_loss
        print(f"Epoch {epoch+1}: Training Loss = {avg_epoch_loss}, Validation Loss = {avg_eval_loss}")
        scheduler.step()

        ######################################################################

        checkpoint_repo_id = f"adityaav80/blip-basevqa-finetuned-checkpoint-epoch{epoch+1}"
        model.push_to_hub(checkpoint_repo_id, commit_message=f"Checkpoint at epoch {epoch+1}")
        print(f"Checkpoint pushed to Hugging Face Hub: {checkpoint_repo_id}")

        # Save best model
        if avg_eval_loss < min_eval_loss:
            
            model.push_to_hub(best_model_repo, commit_message=f"Epoch{epoch}")
            processor.push_to_hub(best_model_repo)
            min_eval_loss = avg_eval_loss
            early_stopping_hook = 0
            print(f"New best model pushed to Hugging Face Hub: {best_model_repo}")
                        
        else:
            
            early_stopping_hook += 1
            if early_stopping_hook > patience:
                print("Early stopping triggered.")
                break

    push_tracking_info_to_hub(tracking_information, best_model_repo, commit_message=f"Tracking info epoch {epoch+1}")

## CPU + GPU Variant 

In [34]:
from tqdm import tqdm
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import BlipForQuestionAnswering, BlipProcessor
import torch
import os
import pickle

login(token="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")
api = HfApi()

def train(model, processor, train_dataloader, valid_dataloader, num_epochs, resume_training=False, patience=3):
    best_model_repo = "adityaav80/blip-basevqa-finetuned"

    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    start_epoch = 0
    min_eval_loss = float('inf')
    early_stopping_hook = 0

    if resume_training:
        latest_checkpoint_path = get_latest_checkpoint_from_hf(best_model_repo, "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")
        if latest_checkpoint_path:
            print(f"Resuming from checkpoint at {latest_checkpoint_path}")
            base_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base", cache_dir='BLIP_checkpoints/')
            processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", cache_dir='BLIP_checkpoints/')
            model = PeftModel.from_pretrained(base_model, f"https://huggingface.co/{best_model_repo}/{latest_checkpoint_path}")
            model.to(device)

            for param in model.parameters():
                param.requires_grad = False
            for name, param in model.named_parameters():
                if "lora" in name:
                    param.requires_grad = True
            model.print_trainable_parameters()

            start_epoch = int(latest_checkpoint_path.split('checkpoint')[-1])

    scaler = torch.amp.GradScaler(device='cuda') if device.type == "cuda" else None
    tracking_information = []

    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    for epoch in range(start_epoch, num_epochs):
        model.train()
        epoch_loss = 0

        for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch+1}/{num_epochs}'):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            attention_mask = batch.pop('attention_mask').to(device)
            labels = batch.pop('labels').to(device)

            optimizer.zero_grad()

            if device.type == "cuda":
                with torch.amp.autocast(device_type='cuda'):
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                    attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(train_dataloader)
        tracking_information.append({'epoch': epoch + 1, 'train_loss': avg_epoch_loss})

        # Evaluation
        model.eval()
        eval_loss = 0

        with torch.no_grad():
            for batch in tqdm(valid_dataloader, desc=f'Validation Epoch {epoch+1}/{num_epochs}'):
                input_ids = batch.pop('input_ids').to(device)
                pixel_values = batch.pop('pixel_values').to(device)
                attention_mask = batch.pop('attention_mask').to(device)
                labels = batch.pop('labels').to(device)

                if device.type == "cuda":
                    with torch.amp.autocast(device_type='cuda'):
                        outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                        attention_mask=attention_mask, labels=labels)
                        loss = outputs.loss
                else:
                    outputs = model(input_ids=input_ids, pixel_values=pixel_values,
                                    attention_mask=attention_mask, labels=labels)
                    loss = outputs.loss

                eval_loss += loss.item()

        avg_eval_loss = eval_loss / len(valid_dataloader)
        tracking_information[-1]['eval_loss'] = avg_eval_loss
        print(f"Epoch {epoch+1}: Training Loss = {avg_epoch_loss}, Validation Loss = {avg_eval_loss}")
        scheduler.step()

        checkpoint_repo_id = f"adityaav80/blip-basevqa-finetuned-checkpoint-epoch{epoch+1}"
        model.push_to_hub(checkpoint_repo_id, commit_message=f"Checkpoint at epoch {epoch+1}")
        print(f"Checkpoint pushed to Hugging Face Hub: {checkpoint_repo_id}")

        if avg_eval_loss < min_eval_loss:
            model.push_to_hub(best_model_repo, commit_message=f"Epoch{epoch}")
            processor.push_to_hub(best_model_repo)
            min_eval_loss = avg_eval_loss
            early_stopping_hook = 0
            print(f"New best model pushed to Hugging Face Hub: {best_model_repo}")
        else:
            early_stopping_hook += 1
            if early_stopping_hook > patience:
                print("Early stopping triggered.")
                break

    push_tracking_info_to_hub(tracking_information, best_model_repo, commit_message=f"Tracking info epoch {epoch+1}")

In [38]:
# from peft import PeftModel
# NUM_EPOCHS = 20
# PATIENCE = 3

# # Start training
# train(
#     model=lora_model,
#     processor=processor,
#     train_dataloader=train_dataloader,
#     valid_dataloader=valid_dataloader,
#     num_epochs=NUM_EPOCHS,
#     resume_training=True,
#     patience=PATIENCE
# )