# Installing Dependencies

In [None]:
# Upgrade the Hugging Face training stack used below (model loading, device placement, LoRA adapters).
!pip install -U transformers accelerate peft
# bitsandbytes backs the 8-bit model loads (load_in_8bit=True) later in the notebook.
!pip install -U bitsandbytes
# `evaluate` plus the ROUGE and BERTScore backends used by the evaluation cell.
!pip install -U evaluate
!pip install -U rouge_score
!pip install bert-score
# BARTScore is used from a source checkout; the clone is added to sys.path before `bart_score` is imported.
# NOTE(review): sentence_transformers is imported by the evaluation cell but is not installed here —
# presumably it is preinstalled in the runtime; confirm.
!git clone https://github.com/neulab/BARTScore.git

In [None]:
import os

import wandb

# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is taken from the WANDB_API_KEY environment variable; when the variable is
# unset, key=None leaves credential discovery (or the interactive prompt) to wandb.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
import os

# Weights & Biases settings exported through the environment:
# the project name for logged runs, and WANDB_LOG_MODEL set to "false".
os.environ.update(
    {
        "WANDB_PROJECT": "blip-lora-final-finetune",
        "WANDB_LOG_MODEL": "false",
    }
)

# Imports

In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering
import torch
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os
from peft import LoraConfig, get_peft_model,PeftModel
from transformers import TrainingArguments, Trainer
import os
from torch.utils.data import Dataset,DataLoader 
from evaluate import load
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Kaggle input locations: the annotations table and the folder holding the images it names.
csv_path = "/kaggle/input/vr-dataset-final-20k/annotations.csv"
image_folder = "/kaggle/input/vr-dataset-final-20k/images/unique_images"

## Loading the models

In [None]:
# The processor prepares both the image and the question text, and exposes the
# tokenizer used for the answers.
blip_vqa_checkpoint = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(blip_vqa_checkpoint)

In [None]:
from transformers import BitsAndBytesConfig

# Load the BLIP VQA base checkpoint with 8-bit quantized weights, half precision for
# the remaining parameters, and automatic device placement.
# The 8-bit flag goes through `quantization_config`: passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated in transformers.
model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

# LoRA adapter settings: rank-16 updates scaled by alpha=32, 5% adapter dropout,
# no bias training, adapters attached to every linear layer.
# NOTE(review): "QUESTION_ANS" is kept from the original configuration; BLIP answers
# through a text decoder, so confirm this PEFT task type is the intended one.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules="all-linear",
    task_type="QUESTION_ANS",
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

## Loading the dataset

In [None]:
class VQADataset(Dataset):
    """Map-style dataset that turns annotation rows into BLIP VQA training samples.

    The annotations CSV must provide the columns ``image_name``, ``question``
    and ``answer``.

    Args:
        csv_path: Path of the annotations CSV.
        image_folder: Directory containing the files named in ``image_name``.
        processor: Processor called with ``images=`` / ``text=`` that also
            exposes a ``tokenizer`` attribute for the answers.
        max_samples: When given, only the first ``max_samples`` rows are kept.
    """

    def __init__(self, csv_path, image_folder, processor, max_samples=None):
        self.data = pd.read_csv(csv_path)

        if max_samples is not None:
            self.data = self.data[:max_samples]  # Take only the first max_samples rows

        self.image_folder = image_folder
        self.processor = processor

        print(f"[INFO] Loaded {len(self.data)} samples from '{csv_path}'")
        print(f"[INFO] Image folder: {image_folder}")

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Build one sample.

        Returns:
            dict with the processor outputs for the image and question (batch
            dimension removed), ``decoder_input_ids`` / ``decoder_attention_mask``
            for the answer, ``labels`` (answer ids with padding set to -100),
            and the raw ``image_name``, ``question`` and ``answer`` values.
        """
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_folder, row['image_name'])
        image = Image.open(image_path).convert("RGB")

        question = row['question']
        answer = row['answer']

        # Encode image + question, padded/truncated to 128 question tokens.
        inputs = self.processor(images=image, text=question, return_tensors="pt",
                                padding="max_length", truncation=True, max_length=128)
        # The processor returns a batch of one; drop that leading dimension.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}

        # Tokenize the answer, padded/truncated to 10 tokens.
        tokenized = self.processor.tokenizer(
            answer,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=10
        )

        answer_ids = tokenized["input_ids"].squeeze(0)
        answer_mask = tokenized["attention_mask"].squeeze(0)

        # Feed the padded answer ids to the decoder explicitly so that the labels can
        # carry the ignore index: BLIP falls back to `labels` as decoder input only
        # when `decoder_input_ids` is missing, and -100 is not a valid token id.
        inputs["decoder_input_ids"] = answer_ids
        inputs["decoder_attention_mask"] = answer_mask
        # Padding positions must not contribute to the cross-entropy loss.
        inputs["labels"] = answer_ids.masked_fill(answer_mask == 0, -100)

        # Add metadata for later reference
        inputs["image_name"] = row["image_name"]
        inputs["question"] = question
        inputs["answer"] = answer

        return inputs



# Build the dataset over every annotated row (no max_samples cap).
full_dataset = VQADataset(
    csv_path=csv_path,
    image_folder=image_folder,
    processor=processor,
)

In [None]:
# Target fractions for the train / validation / test partitions.
train_ratio = 0.8
val_ratio = 0.075
test_ratio = 0.125

# Number of samples available for splitting.
total_size = len(full_dataset)

# Train and validation sizes are floored; the test split takes the remainder,
# so the three sizes always add up to total_size.
train_size = int(train_ratio * total_size)
val_size = int(val_ratio * total_size)
test_size = total_size - (train_size + val_size)

# A seeded generator makes the random partition reproducible.
split_generator = torch.Generator().manual_seed(42)
split_lengths = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, split_lengths, generator=split_generator
)

In [None]:
# Sanity check: show the first processed sample of the test split.
first_test_sample = test_dataset[0]
print(first_test_sample)

# Training

In [None]:
# Logging, evaluation and checkpointing all happen on the same step interval.
step_interval = 1000

training_args = TrainingArguments(
    # --- run identity and reporting ---
    output_dir="./blip-vqa-lora-final-2",
    run_name="blip-vqa-final-finetune",
    report_to="wandb",
    logging_dir="./logs",
    # --- optimisation ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,      # 8 x 2 = 16 samples per optimizer step and device
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    # --- periodic logging / evaluation / checkpointing (every `step_interval` steps) ---
    logging_strategy="steps",
    logging_steps=step_interval,
    eval_strategy="steps",
    eval_steps=step_interval,
    save_strategy="steps",
    save_steps=step_interval,
    save_total_limit=2,                 # cap on the number of checkpoints kept on disk
    # --- best-checkpoint selection: lowest validation loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Name of the label field in each sample.
    label_names=["labels"],
)

In [None]:
# Wire the LoRA-wrapped model, the training arguments and the train/validation
# splits into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class` is its replacement
    # and receives the same tokenizer object.
    processing_class=processor.tokenizer,
)


# Train from the start. Skip this call when resuming from a checkpoint instead.
trainer.train()

In [None]:
# Alternative to a fresh run: continue training from a previously saved checkpoint.
# NOTE(review): this checkpoint lives under blip-vqa-lora-final-1 while output_dir is
# ./blip-vqa-lora-final-2 — confirm which run is meant to be resumed.
resume_checkpoint = "/kaggle/working/blip-vqa-lora-final-1/checkpoint-8000"
trainer.train(resume_from_checkpoint=resume_checkpoint)

In [None]:
# Checkpoint directory recorded in the trainer state as the best one.
best_model_path = trainer.state.best_model_checkpoint
print("Best model saved at: {}".format(best_model_path))

# Pushing to Hugging Face

In [None]:
import os

from huggingface_hub import login, HfApi

# Authenticate with the Hugging Face Hub without embedding the token in the notebook.
# The token is taken from the HF_TOKEN environment variable; when the variable is
# unset, token=None leaves it to huggingface_hub to ask for one.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from huggingface_hub import HfApi

# Target model repository on the Hub: <username>/<repo_name>.
username = "sohith18"
repo_name = "blip-lora-vqa"
repo_id = f"{username}/{repo_name}"

api = HfApi()
# exist_ok=True keeps this cell re-runnable: without it the call raises once the
# repository has already been created.
api.create_repo(repo_id=repo_id, repo_type="model", private=False, exist_ok=True)


In [None]:
from huggingface_hub import upload_folder

# Destination repository on the Hub.
username = "sohith18"
repo_name = "blip-lora-vqa"
repo_id = "/".join((username, repo_name))

# Local checkpoint directory whose contents are uploaded.
# NOTE(review): this path is under blip-vqa-lora-final-1, whereas the training cell
# writes to ./blip-vqa-lora-final-2 — confirm this is the checkpoint to publish.
checkpoint_dir = "/kaggle/working/blip-vqa-lora-final-1/checkpoint-8000"

# An empty path_in_repo places the folder contents at the repository root.
upload_folder(
    folder_path=checkpoint_dir,
    repo_id=repo_id,
    repo_type="model",
    path_in_repo="",
)

# Inference

In [None]:
from transformers import BitsAndBytesConfig, BlipForQuestionAnswering, BlipProcessor
from peft import PeftModel
import torch

# Hub repository holding the fine-tuned LoRA adapter weights.
lora_path = "sohith18/blip-lora-vqa"

# Load the BLIP base model in 8-bit with fp16 and automatic device mapping.
# The 8-bit flag goes through `quantization_config`: passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated in transformers.
base_model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)

# Load the processor matching the base checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Attach the LoRA adapter weights to the base model.
model = PeftModel.from_pretrained(base_model, lora_path)

# Switch to evaluation mode (disables dropout, including the adapter dropout).
model.eval()


In [None]:
import sys

# Put the cloned BARTScore checkout on the import path so `bart_score` can be imported.
bartscore_repo = '/kaggle/working/BARTScore'
sys.path.append(bartscore_repo)

In [None]:
import re
import numpy as np
from bart_score import BARTScorer
from sentence_transformers import SentenceTransformer, util
import evaluate

# Metric backends shared by the evaluation helpers in this cell.
# ROUGE and BERTScore come from the `evaluate` hub, loaded in that order.
rouge, bertscore = (evaluate.load(metric_id) for metric_id in ("rouge", "bertscore"))

# Sentence encoder used for the cosine-similarity metric.
sentence_encoder_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(sentence_encoder_name)

# BARTScore scorer, placed on the same device as the rest of the notebook.
bart_scorer = BARTScorer(checkpoint='facebook/bart-large-cnn', device=device)

# Post-processing utilities
def clean_answer(answer):
    """Reduce an answer string to its first word, lower-cased.

    Surrounding whitespace is stripped and the text is lower-cased; the first
    run of word characters is returned, or an empty string when there is none.
    """
    first_word = re.search(r"\b\w+\b", answer.strip().lower())
    if first_word is None:
        return ""
    return first_word.group(0)

# Metric functions
def compute_exact_match(pred, label):
    """Return 1 when the prediction equals the label exactly, otherwise 0."""
    if pred == label:
        return 1
    return 0

def compute_token_f1(pred, label):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are split on whitespace. The overlap is counted as a multiset
    intersection, so a token repeated in both strings counts once per matching
    occurrence; counting it via sets while dividing by the list lengths would
    score identical answers with repeated tokens below 1.0.

    Args:
        pred: Predicted answer text.
        label: Reference answer text.

    Returns:
        F1 in [0.0, 1.0]; 0.0 when the two strings share no token (including
        when either one is empty).
    """
    from collections import Counter

    pred_tokens = pred.split()
    label_tokens = label.split()
    # Multiset intersection: per token, the smaller of the two occurrence counts.
    overlap = sum((Counter(pred_tokens) & Counter(label_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(label_tokens)
    return 2 * precision * recall / (precision + recall)

def compute_semantic_similarity(pred, label):
    """Cosine similarity between the sentence embeddings of prediction and label.

    Each text is encoded separately with the module-level ``embedding_model``;
    the similarity is returned as a plain Python float.
    """
    pred_embedding = embedding_model.encode(pred, convert_to_tensor=True)
    label_embedding = embedding_model.encode(label, convert_to_tensor=True)
    similarity = util.cos_sim(pred_embedding, label_embedding)
    return float(similarity)

# Prediction function
def predict_blip(image_path, question):
    """Answer ``question`` about the image stored at ``image_path``.

    The image is loaded as RGB, encoded together with the question by the
    module-level ``processor``, and passed to ``model.generate`` with greedy
    decoding (no sampling, a single beam, at most 20 new tokens). The decoded
    text is reduced with ``clean_answer`` before being returned.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    model_inputs = processor(images=rgb_image, text=question, return_tensors="pt").to(device)

    generation_kwargs = dict(max_new_tokens=20, do_sample=False, num_beams=1)
    # Gradients are not needed for generation.
    with torch.no_grad():
        generated = model.generate(**model_inputs, **generation_kwargs)

    answer_text = processor.tokenizer.decode(generated[0], skip_special_tokens=True)
    return clean_answer(answer_text)

# Main evaluation
def evaluate_blip(dataset, image_folder):
    """Score BLIP predictions against the reference answers and print the averages.

    For every sample the reference answer is reduced with ``clean_answer``, a
    prediction is obtained with ``predict_blip`` for the sample's image and
    question, and six per-sample scores are collected: exact match, token F1,
    ROUGE-L, BERTScore F1, BARTScore and embedding cosine similarity. The mean
    of each score is printed; nothing is returned.

    Args:
        dataset: Iterable of samples exposing ``image_name``, ``question`` and
            ``answer`` entries.
        image_folder: Directory that contains the sample images.
    """
    em_values, f1_values = [], []
    rouge_l_values, bert_f1_values = [], []
    bart_values, cosine_values = [], []

    for sample in tqdm(dataset, desc="Evaluating BLIP"):
        reference = clean_answer(sample["answer"])
        image_path = os.path.join(image_folder, sample["image_name"])
        prediction = predict_blip(image_path, sample["question"])

        # String-overlap metrics.
        em_values.append(compute_exact_match(prediction, reference))
        f1_values.append(compute_token_f1(prediction, reference))

        # ROUGE-L for this single prediction/reference pair.
        rouge_output = rouge.compute(predictions=[prediction], references=[reference], use_stemmer=True)
        rouge_l_values.append(rouge_output["rougeL"])

        # BERTScore returns one F1 per pair; there is exactly one pair here.
        bert_output = bertscore.compute(predictions=[prediction], references=[reference], lang="en")
        bert_f1_values.append(bert_output["f1"][0])

        # BARTScore likewise returns a list with a single entry.
        bart_values.append(bart_scorer.score([prediction], [reference])[0])

        cosine_values.append(compute_semantic_similarity(prediction, reference))

    # Final results
    print("\n🔍 Evaluation Metrics for BLIP:")
    print(f"  - Exact Match:            {np.mean(em_values):.4f}")
    print(f"  - Token-level F1:         {np.mean(f1_values):.4f}")
    print(f"  - ROUGE-L:                {np.mean(rouge_l_values):.4f}")
    print(f"  - BERTScore (F1):         {np.mean(bert_f1_values):.4f}")
    print(f"  - BARTScore:              {np.mean(bart_values):.4f}")
    print(f"  - Semantic Cosine Sim.:   {np.mean(cosine_values):.4f}")


In [None]:
# Score the loaded model on the held-out test split; the metrics are printed.
evaluate_blip(dataset=test_dataset, image_folder=image_folder)