In [1]:
import torch
import pandas as pd
import numpy as np
from PIL import Image
from torch.optim import AdamW
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import os

class CustomBlipForQA(BlipForQuestionAnswering):
    def forward(self, input_ids=None, attention_mask=None, pixel_values=None, labels=None):
        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels
        )

class VQADataset(Dataset):
    def __init__(self, dataframe, processor, max_question_length=32, max_answer_length=16):
        self.data = dataframe
        self.processor = processor
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = row['path']
        possible_paths = [
            image_path,
            f'/kaggle/input/vqa-dataset/req-images/{image_path}',
            f'/content/{image_path}'
        ]
        valid_path = None
        for path in possible_paths:
            if os.path.exists(path):
                valid_path = path
                break

        if valid_path is None:
            return None

        try:
            image = Image.open(valid_path).convert('RGB')
            question = str(row['generated_question'])
            answer = str(row['generated_answer'])

            if not question or not answer:
                return None

            inputs = self.processor(
                images=image,
                text=question,
                return_tensors="pt",
                padding='max_length',
                max_length=self.max_question_length,
                truncation=True
            )

            labels = self.processor(
                text=answer,
                return_tensors="pt",
                padding='max_length',
                max_length=self.max_answer_length,
                truncation=True
            ).input_ids

            return {
                'pixel_values': inputs['pixel_values'].squeeze(),
                'input_ids': inputs['input_ids'].squeeze(),
                'attention_mask': inputs['attention_mask'].squeeze(),
                'labels': labels.squeeze()
            }
        except Exception as e:
            return None

def collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if len(batch) == 0:
        return None
    try:
        pixel_values = torch.stack([item['pixel_values'] for item in batch])
        input_ids = torch.stack([item['input_ids'] for item in batch])
        attention_mask = torch.stack([item['attention_mask'] for item in batch])
        labels = torch.stack([item['labels'] for item in batch])

        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }
    except Exception as e:
        return None

def prepare_model_for_lora_finetuning():
    model_name = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = CustomBlipForQA.from_pretrained(model_name)

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=8,
        lora_alpha=16,
        target_modules=["query", "value"],
        lora_dropout=0.05,
        bias="none"
    )

    peft_model = get_peft_model(model, lora_config)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Trainable params: {trainable_params} || All params: {total_params} || Trainable%: {100 * trainable_params / total_params:.2f}")

    return model, processor

def train_lora_model(model, train_dataloader, val_dataloader, device):
    optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
    model.to(device)
    num_epochs = 5
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        train_batches = 0

        for batch in train_dataloader:
            if batch is None:
                continue

            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()

            try:
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    pixel_values=batch['pixel_values'],
                    labels=batch['labels']
                )
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item()
                train_batches += 1

                if train_batches % 10 == 0:
                    print(f"Batch {train_batches}, Loss: {loss.item():.4f}")

            except Exception as e:
                continue

        avg_train_loss = total_train_loss / train_batches if train_batches > 0 else float('nan')

        model.eval()
        total_val_loss = 0
        val_batches = 0

        with torch.no_grad():
            for batch in val_dataloader:
                if batch is None:
                    continue

                batch = {k: v.to(device) for k, v in batch.items()}

                try:
                    outputs = model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        pixel_values=batch['pixel_values'],
                        labels=batch['labels']
                    )
                    total_val_loss += outputs.loss.item()
                    val_batches += 1
                except Exception as e:
                    continue

        avg_val_loss = total_val_loss / val_batches if val_batches > 0 else float('nan')

        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_lora_model.pth")
            print("Saved best model")

    return model

def main():
    try:
        df = pd.read_csv('/kaggle/input/vqa-dataset/vqa_dataset_gemini_final.csv')
        print(f"Loaded dataset with {len(df)} samples")
        print("Sample data:")
        print(df.head(2))
    except Exception as e:
        return

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model, processor = prepare_model_for_lora_finetuning()

    train_dataset = VQADataset(train_df, processor)
    val_dataset = VQADataset(val_df, processor)

    batch_size = 4
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=True
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        drop_last=True
    )

    trained_model = train_lora_model(model, train_dataloader, val_dataloader, device)

if __name__ == "__main__":
    main()


2025-05-14 14:15:26.592444: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747232126.807820      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747232126.869548      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded dataset with 19497 samples
Sample data:
              path               generated_question generated_answer
0  3e/3ee3a90b.jpg  What number of gloves is shown?              One
1  5c/5cb11e67.jpg          What color is the wrap?            White
Train samples: 15597, Val samples: 3900
Using device: cuda


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Trainable params: 1179648 || All params: 385852220 || Trainable%: 0.31
Creating datasets...
Starting training...


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Batch 10, Loss: 9.3219
Batch 20, Loss: 9.2010
Batch 30, Loss: 9.2777
Batch 40, Loss: 9.2000
Batch 50, Loss: 9.1469
Batch 60, Loss: 9.2427
Batch 70, Loss: 9.1234
Batch 80, Loss: 9.2403
Batch 90, Loss: 9.0841
Batch 100, Loss: 9.0874
Batch 110, Loss: 9.0692
Batch 120, Loss: 8.9103
Batch 130, Loss: 9.1197
Batch 140, Loss: 9.2367
Batch 150, Loss: 9.2934
Batch 160, Loss: 9.0676
Batch 170, Loss: 8.8157
Batch 180, Loss: 8.9982
Batch 190, Loss: 8.8460
Batch 200, Loss: 8.9853
Batch 210, Loss: 9.0709
Batch 220, Loss: 9.2572
Batch 230, Loss: 9.1972
Batch 240, Loss: 8.9696
Batch 250, Loss: 9.0361
Batch 260, Loss: 8.9619
Batch 270, Loss: 8.9578
Batch 280, Loss: 8.9898
Batch 290, Loss: 9.0627
Batch 300, Loss: 8.8937
Batch 310, Loss: 9.1775
Batch 320, Loss: 8.7917
Batch 330, Loss: 9.0272
Batch 340, Loss: 8.9503
Batch 350, Loss: 9.0004
Batch 360, Loss: 8.5959
Batch 370, Loss: 8.9515
Batch 380, Loss: 8.9489
Batch 390, Loss: 8.8279
Batch 400, Loss: 8.8938
Batch 410, Loss: 8.7418
Batch 420, Loss: 8.9903
B

In [2]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import LoraConfig, get_peft_model, TaskType

adapter_weights = torch.load("/kaggle/working/best_lora_model.pth")
# print("Keys in the saved state dict:")
# for key in adapter_weights.keys():
#     print(f"- {key}")
# print("\n")

model_name = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_name)
base_model = BlipForQuestionAnswering.from_pretrained(model_name)

try:
    print("Attempting to load weights directly into the base model...")

    filtered_weights = {k: v for k, v in adapter_weights.items() if k in base_model.state_dict()}
    missing = set(base_model.state_dict().keys()) - set(filtered_weights.keys())
    unexpected = set(filtered_weights.keys()) - set(base_model.state_dict().keys())
    
    print(f"Keys in base model: {len(base_model.state_dict())}")
    print(f"Filtered keys from weights: {len(filtered_weights)}")
    print(f"Missing keys: {len(missing)}")
    print(f"Unexpected keys: {len(unexpected)}")
    
    
    base_model.load_state_dict(filtered_weights, strict=False)
    print("Direct loading completed with partial weights")
    
    
    merged_model = base_model
    merged_model.eval()
    
except Exception as e:
    print(f"Direct loading failed: {e}")
    
    
    try:
        print("\nAttempting alternative loading approach...")
        
        
        if all('base_model' not in k for k in adapter_weights.keys()):
            adapted_weights = {"base_model." + k: v for k, v in adapter_weights.items()}
            print(f"Adapted {len(adapted_weights)} keys with 'base_model' prefix")
            
            lora_config = LoraConfig(
                task_type=TaskType.CAUSAL_LM,
                r=4,
                lora_alpha=8,
                target_modules=["query", "value"],
                lora_dropout=0.05,
                bias="none"
            )
            
            peft_model = get_peft_model(base_model, lora_config)
            
            peft_model.load_state_dict(adapted_weights, strict=False)
            print("Loaded adapted weights into PEFT model")
            
            merged_model = peft_model.merge_and_unload()
            merged_model.eval()
            
        else:
            print("Checking if weights contain only LoRA adapter parameters...")
            lora_keys = [k for k in adapter_weights.keys() if 'lora' in k.lower()]
            
            if len(lora_keys) > 0:
                print(f"Found {len(lora_keys)} LoRA-related keys")
                
                lora_config = LoraConfig(
                    task_type=TaskType.CAUSAL_LM,
                    r=4,
                    lora_alpha=8,
                    target_modules=["query", "value"],
                    lora_dropout=0.05,
                    bias="none"
                )
                
                peft_model = get_peft_model(base_model, lora_config)
                
                loading_result = peft_model.load_state_dict(adapter_weights, strict=False)
                print(f"Loaded with missing keys: {len(loading_result.missing_keys)}")
                print(f"Loaded with unexpected keys: {len(loading_result.unexpected_keys)}")
                
                merged_model = peft_model.merge_and_unload()
                merged_model.eval()
            else:
                print("No LoRA-specific parameters found in weights file")
                merged_model = base_model
                merged_model.eval()
    
    except Exception as e:
        print(f"Alternative loading approach failed: {e}")
        print("\nFallback to using just the base model without adapter weights")
        merged_model = base_model
        merged_model.eval()

print("\nRunning a simple inference test to verify model is functional...")
try:
    from PIL import Image
    import numpy as np
    
    dummy_image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
    dummy_question = "What is in this image?"
    
    inputs = processor(images=dummy_image, text=dummy_question, return_tensors="pt")
    
    with torch.no_grad():
        generated_ids = merged_model.generate(**inputs)
        answer = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    
    print(f"Test inference result: '{answer}'")
    print("Model is functional!")
    
except Exception as e:
    print(f"Test inference failed: {e}")
    print("There may be issues with the model")

print("\nSaving the final model...")
try:
    merged_model.save_pretrained("/kaggle/working/final_blip_model")
    processor.save_pretrained("/kaggle/working/final_blip_model")
    print("Model saved successfully")
except Exception as e:
    print(f"Error saving model: {e}")

  adapter_weights = torch.load("/kaggle/working/best_lora_model.pth")


Attempting to load weights directly into the base model...
Keys in base model: 789
Filtered keys from weights: 597
Missing keys: 192
Unexpected keys: 0
Direct loading completed with partial weights

Running a simple inference test to verify model is functional...
Test inference result: 'cat'
Model is functional!

Saving the final model...
Model saved successfully


In [3]:
from PIL import Image
import requests
from transformers import BlipProcessor

image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")

question = "How many cat are there?"

inputs = processor(image, question, return_tensors="pt")

merged_model.eval()
with torch.no_grad():
    generated_ids = merged_model.generate(**inputs, max_length=20)
    answer = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"Q: {question}")
print(f"A: {answer}")


Q: How many cat are there?
A: 2


In [4]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import os

CSV_PATH = "/kaggle/input/test-vqa/vqa_dataset_gemini_checkpoint_26.csv"
IMAGE_ROOT = "/kaggle/input/vqa-dataset/req-images"  # adjust to point to directory containing val2017/
MODEL_PATH = "/kaggle/working/final_blip_model"

df = pd.read_csv(CSV_PATH)

processor = BlipProcessor.from_pretrained(MODEL_PATH)
merged_model = BlipForQuestionAnswering.from_pretrained(MODEL_PATH).eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
merged_model.to(device)

def normalize_answer(ans):
    if ans != ans:
        return "unknown"
    
    if isinstance(ans, (int, float)):
        if float(ans).is_integer():
            return str(int(ans))
        else:
            return str(ans).strip().lower()
    
    return str(ans).strip().lower()

predictions = []
references = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        image_path = os.path.join(IMAGE_ROOT, row["path"])
        image = Image.open(image_path).convert("RGB")
        question = row["generated_question"]
        gt_answer = row["generated_answer"]
        
        inputs = processor(image, question, return_tensors="pt").to(device)
        
        with torch.no_grad():
            generated_ids = merged_model.generate(**inputs, max_length=20)
            pred_answer = processor.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        
        predictions.append(normalize_answer(pred_answer))
        references.append(normalize_answer(gt_answer))

    except Exception as e:
        predictions.append("error")
        references.append("error")
        print(f"⚠️ Error at row {i}: {e}")

valid = [i for i in range(len(predictions)) if predictions[i] != "error"]
accuracy = accuracy_score([references[i] for i in valid], [predictions[i] for i in valid])
print(f"\n✅ Inference complete on {len(valid)} valid samples")
print(f"🎯 Exact Match Accuracy: {accuracy * 100:.2f}%")

df["model_answer"] = predictions
df.to_csv("/kaggle/working/vqa_inference_results.csv", index=False)
print("📁 Saved results to /kaggle/working/vqa_inference_results.csv")


 95%|█████████▍| 4734/5000 [06:50<00:19, 13.69it/s]

⚠️ Error at row 4732: [Errno 2] No such file or directory: '/kaggle/input/vqa-dataset/req-images/cf/cf55470e.jpg'


100%|██████████| 5000/5000 [07:13<00:00, 11.53it/s]


✅ Inference complete on 4999 valid samples
🎯 Exact Match Accuracy: 40.85%
📁 Saved results to /kaggle/working/vqa_inference_results.csv





In [5]:
!pip install -q nltk bert_score 
# !pip install git+https://github.com/neulab/BARTScore.git


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m0:00:01[0m:02[0mmm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mmm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m0:00:01[0m:01[0mmm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/neulab/BARTScore.git
  Cloning https://github.com/neulab/BARTScore.git to /tmp/pip-req-build-9oag031r
  Running command git clone --filter=blob:none --quiet https://github.com/neulab/BARTScore.git /tmp/pip-req-build-9oag031r
  Resolved https://github.com/neulab/BARTScore.git to commit 248f511cb34ae3753fc81f7d7a945de5bfe33458
[31mERROR: git+https://github.com/neulab/BARTScore.git does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [6]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [7]:
!pip install word2number


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... [?25l[?25hdone
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=801600424634606c277927130759250225c17933d77c20d6409a1aa5ffea025c
  Stored in directory: /root/.cache/pip/wheels/cd/ef/ae/073b491b14d25e2efafcffca9e16b2ee6d114ec5c643ba4f06
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [8]:
import pandas as pd
import math
from word2number import w2n
from bert_score import score as bert_score
from tqdm import tqdm
from nltk.corpus import wordnet as wn
import torch

df = pd.read_csv("/kaggle/working/vqa_inference_results.csv")

def normalize_answer(ans):
    if pd.isna(ans):
        return "unknown"
    if isinstance(ans, (int, float)):
        return str(int(ans)) if float(ans).is_integer() else str(ans)
    return str(ans).strip().lower()

def numeric_equivalent(a, b):
    try:
        a_num = w2n.word_to_num(str(a)) if isinstance(a, str) and not a.isdigit() else float(a)
        b_num = w2n.word_to_num(str(b)) if isinstance(b, str) and not b.isdigit() else float(b)
        return math.isclose(a_num, b_num, rel_tol=1e-2)
    except:
        return False

def synonym_match(word1, word2):
    word1 = normalize_answer(word1)
    word2 = normalize_answer(word2)
    if word1 == word2:
        return True
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return False
    lemmas1 = set(lemma.name() for syn in synsets1 for lemma in syn.lemmas())
    lemmas2 = set(lemma.name() for syn in synsets2 for lemma in syn.lemmas())
    return not lemmas1.isdisjoint(lemmas2)

def semantic_match_bertscore(pred, ref, threshold=0.85):
    try:
        P, R, F1 = bert_score([pred], [ref], lang="en", verbose=False)
        return F1[0].item() > threshold
    except:
        return False

def is_correct(pred, ref):
    pred = normalize_answer(pred)
    ref = normalize_answer(ref)

    if pred == ref:
        return True
    if numeric_equivalent(pred, ref):
        return True
    if synonym_match(pred, ref):
        return True
    if semantic_match_bertscore(pred, ref):
        return True
    return False

predictions = df["model_answer"].tolist()
references = df["generated_answer"].tolist()

soft_matches = []
for pred, ref in tqdm(zip(predictions, references), total=len(predictions)):
    try:
        correct = is_correct(pred, ref)
    except:
        correct = False
    soft_matches.append(correct)

df["soft_match"] = soft_matches
soft_accuracy = sum(soft_matches) / len(soft_matches)
print(f"\n✅ Soft Matching Accuracy: {soft_accuracy * 100:.2f}%")

df.to_csv("/kaggle/working/vqa_soft_eval_results.csv", index=False)
print("📁 Results with soft matching saved to /kaggle/working/vqa_soft_eval_results.csv")

  0%|          | 0/5000 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 2/5000 [00:15<10:31:30,  7.58s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 7/5000 [00:16<2:30:48,  1.81s/it] Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 8/5000 [00:17<2:16:31,  1.64s/it]Some weights of Robe


✅ Soft Matching Accuracy: 90.58%
📁 Results with soft matching saved to /kaggle/working/vqa_soft_eval_results.csv



