# Installing Dependencies

In [None]:
!pip install -U transformers accelerate peft bitsandbytes

# Imports

In [None]:
import os

import pandas as pd
import torch
import torch.nn as nn
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from PIL import Image
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# Device: run on the GPU whenever PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Loading the model

In [None]:
# Load the processor and the 8-bit quantized base model from the same checkpoint.
MODEL_ID = "microsoft/git-base-vqav2"

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map={"": 0},  # put the whole model on device index 0
    torch_dtype=torch.float16,
    # Passing load_in_8bit straight to from_pretrained is deprecated; the
    # quantization settings go through a BitsAndBytesConfig instead.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters.
# NOTE: target_modules="all-linear" is PEFT's shortcut for adapting the model's
# linear layers in general, so the adapters are NOT limited to the text decoder.
# Pass an explicit list of module names if only the decoder should be tuned.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

## Loading the dataset

In [None]:
class GitVQADataset(Dataset):
    """Image / question / answer rows from a CSV, encoded for causal-LM VQA training.

    The CSV must provide the columns ``image_name``, ``question`` and ``answer``.
    Every item is a dict of CPU tensors (the Trainer moves batches to the
    training device, and CUDA tensors must not be created inside dataset
    workers):

    * ``pixel_values``   - the image as returned by ``processor(images=...)``.
    * ``input_ids``      - ``CLS + question + answer + SEP``, padded to ``max_length``.
    * ``attention_mask`` - 1 for real tokens, 0 for padding.
    * ``labels``         - same length as ``input_ids``; ``-100`` on the CLS token,
      the question and the padding, so only the answer tokens and the closing
      SEP contribute to the loss.

    Args:
        csv_path: Path of the CSV file.
        image_folder: Directory that contains the files named in ``image_name``.
        processor: Processor exposing ``__call__(images=..., return_tensors=...)``
            and a ``tokenizer`` with ``cls_token_id``, ``sep_token_id`` and
            ``pad_token_id``.
        max_samples: If truthy, keep only the first ``max_samples`` rows.
        max_length: Total length of ``input_ids`` / ``labels`` after padding.
        max_answer_length: Maximum number of answer tokens kept.

    Raises:
        ValueError: If ``max_length`` cannot hold CLS, SEP and a full-length answer.
    """

    IGNORE_INDEX = -100  # default ignore_index of torch's cross-entropy loss

    def __init__(self, csv_path, image_folder, processor, max_samples=None,
                 max_length=128, max_answer_length=10):
        if max_length < max_answer_length + 2:
            raise ValueError(
                "max_length must be at least max_answer_length + 2 "
                "(room for the CLS and SEP tokens)"
            )
        self.data = pd.read_csv(csv_path)
        if max_samples:
            self.data = self.data.iloc[:max_samples]
        self.image_folder = image_folder
        self.processor = processor
        self.max_length = max_length
        self.max_answer_length = max_answer_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Close the file handle as soon as the pixels have been decoded.
        image_path = os.path.join(self.image_folder, row["image_name"])
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # str() guards against cells that pandas parsed as numbers (e.g. "2").
        prompt = str(row["question"]).strip().rstrip("?") + "?"
        answer = str(row["answer"])

        # Use the processor handed to the constructor, not a notebook global.
        pixel_values = self.processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)

        tokenizer = self.processor.tokenizer
        answer_ids = list(tokenizer(answer, add_special_tokens=False).input_ids)
        answer_ids = answer_ids[: self.max_answer_length]
        # The answer is kept whole; the question is truncated to whatever room is left.
        question_budget = self.max_length - len(answer_ids) - 2
        question_ids = list(tokenizer(prompt, add_special_tokens=False).input_ids)
        question_ids = question_ids[:question_budget]

        # A causal LM is trained on the concatenated sequence; labels mirror
        # input_ids position by position, with the non-answer part masked out.
        input_ids = [tokenizer.cls_token_id] + question_ids + answer_ids + [tokenizer.sep_token_id]
        labels = (
            [self.IGNORE_INDEX] * (1 + len(question_ids))
            + answer_ids
            + [tokenizer.sep_token_id]
        )

        n_pad = self.max_length - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * n_pad
        input_ids = input_ids + [tokenizer.pad_token_id] * n_pad
        labels = labels + [self.IGNORE_INDEX] * n_pad

        return {
            "pixel_values": pixel_values,
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }


In [None]:
# Paths (Kaggle input dataset)
image_folder = "/kaggle/input/vr-dataset-filtered/images/images"
csv_path = "/kaggle/input/vr-dataset-filtered/cleaned_data.csv"

# Fixed seed so the train/eval split is the same on every run.
SPLIT_SEED = 42

# Load the dataset and split it 80/20 into train and eval subsets.
dataset = GitVQADataset(csv_path, image_folder, processor, max_samples=1000)
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, eval_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)


# Training

In [None]:
class CustomTrainer(Trainer):
    """Trainer that computes a next-token cross-entropy loss over the text positions."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the causal-LM loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping. ``labels`` must have the same shape as
                ``input_ids``, with ``-100`` at positions to ignore.
            return_outputs: If true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: Extra arguments passed by the Trainer; unused here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: If ``labels`` and ``input_ids`` have different shapes.
        """
        labels = inputs["labels"]
        input_ids = inputs.get("input_ids")
        if input_ids is not None and labels.shape != input_ids.shape:
            raise ValueError(
                f"labels shape {tuple(labels.shape)} must match input_ids shape "
                f"{tuple(input_ids.shape)} for a causal-LM loss"
            )

        # Keep labels out of the forward call so the model does not also
        # compute (and then discard) its own internal loss.
        model_inputs = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.logits

        # GIT puts the image tokens in front of the text tokens, so the logits
        # sequence is longer than the labels. Only the trailing text positions
        # line up with the labels.
        text_len = labels.size(1)
        text_logits = logits[:, -text_len:, :]

        # Next-token objective: position t predicts token t + 1.
        # Upcast so the loss is not accumulated in half precision.
        shift_logits = text_logits[:, :-1, :].float()
        shift_labels = labels[:, 1:]

        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
        )

        return (loss, outputs) if return_outputs else loss

# Training args, grouped by concern.
training_args = TrainingArguments(
    # Optimisation schedule: 4 samples per device x 2 accumulation steps per update.
    num_train_epochs=5,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=True,
    # Checkpoints: written under output_dir, at most two kept.
    output_dir="./git-vqa-lora",
    save_total_limit=2,
    # Logging: once per epoch, no external tracker.
    logging_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
)

In [None]:
# Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=processor.tokenizer,
)

# Train
trainer.train()