# ViLT (Vision-and-Language Transformer)

## Dependencies

In [None]:
!pip install pandas pillow torch transformers nltk evaluate sentencepiece tqdm sentence-transformers rouge_score bert_score

## Load the Dataset

In [4]:
import os
# Base directory for the input data (presumably the Kaggle dataset mount —
# TODO confirm). Note: the dataset inference cell later in this notebook
# rebinds `input_dir` to "/content".
input_dir = "/kaggle/input/dataset/"

## Sample test of the pretrained ViLT model

In [5]:
from PIL import Image
import requests
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch

# Load model and processor once
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# Example dataset: each image is paired with the question at the same index
image_paths = [
    "https://storage.googleapis.com/allennlp-public-data/vqav2/baseball.jpg",
    "https://storage.googleapis.com/allennlp-public-data/vqav2/vqa-examples/kitchen.jpg",
]
questions = [
    "What game are they playing?",
    "What is the color of the flower?",
]

# Results list: one dict per (image, question) pair
results = []

# Loop through dataset
for img_path, question in zip(image_paths, questions):
    # Load image (from URL or local path)
    if img_path.startswith(("http://", "https://")):
        # Bound the wait and fail loudly on HTTP errors instead of handing
        # an error page to PIL.
        response = requests.get(img_path, stream=True, timeout=30)
        response.raise_for_status()
        image = Image.open(response.raw)
    else:
        image = Image.open(img_path)
    # Normalise to 3-channel RGB so grayscale/RGBA/palette images are
    # handled the same way as in the dataset inference cell.
    image = image.convert("RGB")

    # Encode and predict; gradients are not needed for inference
    encoding = processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**encoding)
    # The answer is the label whose logit is highest
    pred = model.config.id2label[outputs.logits.argmax(-1).item()]

    # Save result
    results.append({
        "image": img_path,
        "question": question,
        "answer": pred
    })

# Display results
for r in results:
    print(f"Q: {r['question']} \nA: {r['answer']}\n")

2025-05-18 06:55:30.121076: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747551330.303274      98 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747551330.359076      98 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

Q: What game are they playing? 
A: baseball

Q: What is the color of the flower? 
A: orange



## Run the pretrained ViLT model on the dataset

In [None]:

import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
import torch
from nltk.translate.bleu_score import sentence_bleu

input_dir = "/content"

DATA_CSV = input_dir + '/cleaned_data.csv'
IMAGE_DIR = input_dir + '/images/images/'
# Load your CSV.
# Read every cell as text and disable pandas' default NA parsing: otherwise
# blank cells and answers spelled "None"/"NA"/"null"/"nan" become float NaN
# (and numeric answers become numbers), and `.lower()` below would raise
# AttributeError.
df = pd.read_csv(DATA_CSV, dtype=str, keep_default_na=False)
df.columns = ["image_path", "question", "expected_answer"]

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load model and move to GPU
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(device)

results = []            # one dict per row, written to vqa_predictions.csv
predictions = []        # tokenised predictions (BLEU-style hypothesis lists)
expected_answers = []   # tokenised references, each wrapped in a list

for row in df.itertuples(index=False):
    img_path = IMAGE_DIR + row.image_path
    question = row.question
    expected = row.expected_answer

    # Use a context manager so the file handle is released after each image;
    # convert() returns an independent, fully loaded RGB copy.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")
    encoding = processor(image, question, return_tensors="pt").to(device)

    with torch.no_grad():  # Disable gradient tracking for inference
        outputs = model(**encoding)

    # The predicted answer is the label with the highest logit
    pred = model.config.id2label[outputs.logits.argmax(-1).item()]

    results.append({
        "image_path": img_path,
        "question": question,
        "expected_answer": expected,
        "predicted_answer": pred
    })

    # Append answers for BLEU score calculation
    predictions.append(pred.lower().split())  # Split to word tokens for BLEU
    expected_answers.append([expected.lower().split()])  # Wrap expected in a list


# Convert to DataFrame and persist for the evaluation step
results_df = pd.DataFrame(results)
results_df.to_csv("vqa_predictions.csv", index=False)

## Evaluation Metrics

In [None]:
# evaluate_vqa.py

import re
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

import evaluate
from sentence_transformers import SentenceTransformer, util
from transformers import BartTokenizer, BartForConditionalGeneration

# ——— 0) Read your CSV ———
# Read every cell as text with default NA parsing disabled. With pandas'
# defaults, answers spelled "None", "NA", "null", "nan", ... are parsed as
# missing (and then replaced by "" below), and a numeric answer column that
# contains blanks is parsed as float so "2" would come back as "2.0".
df = pd.read_csv("/content/vqa_predictions.csv", dtype=str, keep_default_na=False)
predictions = df["predicted_answer"].fillna("").astype(str).tolist()
references  = df["expected_answer"].fillna("").astype(str).tolist()

# ——— 1) Load metrics & models ———
rouge           = evaluate.load("rouge")
bertscore       = evaluate.load("bertscore")
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Prepare BART for “BARTScore” (used by compute_bart_score)
device     = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer  = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)

def compute_bart_score(candidates, sources, batch_size=8, num_beams=4):
    """
    Rough emulation of BARTScore, computed per example.

    Each source is fed to the encoder of the module-level `bart_model` and
    the matching candidate is used as the decoder target; the score is the
    mean log-probability of the candidate's non-padding tokens.

    Args:
        candidates: list of strings scored as decoder targets.
        sources: list of strings used as encoder input, aligned with
            `candidates`.
        batch_size: number of pairs per forward pass.
        num_beams: unused (no generation happens here); kept so existing
            callers that pass it keep working.

    Returns:
        A list of floats, one per candidate, in input order. Scores are
        log-probabilities, so they are <= 0 and higher is better.
    """
    scores = []
    for start in range(0, len(candidates), batch_size):
        src_batch  = sources[start : start + batch_size]
        cand_batch = candidates[start : start + batch_size]

        # tokenize encoder inputs
        enc = tokenizer(src_batch, return_tensors="pt", padding=True, truncation=True).to(device)
        # tokenize decoder targets as labels
        labs = tokenizer(cand_batch, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        # mark pad positions so they are excluded from the loss
        labs[labs == tokenizer.pad_token_id] = -100

        with torch.no_grad():
            logits = bart_model(**enc, labels=labs).logits

        # Per-token negative log-likelihood. `out.loss` would only give one
        # mean over the whole batch, which cannot be attributed to
        # individual examples, so the loss is recomputed unreduced.
        token_nll = torch.nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            labs.reshape(-1),
            ignore_index=-100,
            reduction="none",
        ).view(labs.shape)

        # Ignored positions contribute 0 to the sum; divide by the number of
        # real tokens in each row (clamped to avoid division by zero).
        valid_tokens = (labs != -100).sum(dim=1).clamp(min=1)
        avg_logprob = -token_nll.sum(dim=1) / valid_tokens

        scores.extend(avg_logprob.cpu().tolist())

    return scores

# ——— 2) Helper metric functions ———
def compute_exact_match(pred, label):
    """Return 1 if `pred` and `label` are equal after trimming surrounding
    whitespace and lower-casing, otherwise 0."""
    normalized_pred = pred.strip().lower()
    normalized_label = label.strip().lower()
    return 1 if normalized_pred == normalized_label else 0

def compute_token_f1(pred, label):
    """Return the token-level F1 between `pred` and `label`.

    Both strings are trimmed, lower-cased and split on whitespace. Overlap
    is counted as a multiset intersection, so a repeated token only matches
    as many times as it appears on both sides (a plain set intersection
    under-counts repeats, e.g. "a a" vs "a a").

    Args:
        pred: predicted answer string.
        label: reference answer string.

    Returns:
        F1 in [0.0, 1.0]. If either side has no tokens, the result is 1.0
        when both are empty and 0.0 otherwise, matching compute_exact_match
        on empty input.
    """
    from collections import Counter

    p_tok = pred.strip().lower().split()
    l_tok = label.strip().lower().split()
    if not p_tok or not l_tok:
        return float(p_tok == l_tok)

    # Counter & Counter keeps the minimum count of each shared token
    overlap = sum((Counter(p_tok) & Counter(l_tok)).values())
    if overlap == 0:
        return 0.0
    prec = overlap / len(p_tok)
    rec  = overlap / len(l_tok)
    return 2 * (prec * rec) / (prec + rec)

def compute_semantic_similarity(pred, label):
    """Return, as a Python float, the cosine similarity between the
    `embedding_model` encodings of `pred` and `label`."""
    pred_vec = embedding_model.encode(pred, convert_to_tensor=True)
    label_vec = embedding_model.encode(label, convert_to_tensor=True)
    similarity = util.cos_sim(pred_vec, label_vec)
    return float(similarity)

def clean_answer(ans):
    """Normalise a raw model answer to a short answer string.

    Steps, in order:
      1. Remove every "word:" label together with the whitespace after it
         (e.g. "Answer: red" -> "red").
      2. Strip leading punctuation, whitespace and underscores. Digits and
         non-ASCII letters are kept, so numeric answers such as "2" or
         "3 dogs" are no longer erased.
      3. Keep only the text before the first "?" or newline, trimmed.

    Args:
        ans: raw answer string.

    Returns:
        The cleaned answer; may be an empty string.
    """
    ans = re.sub(r"\b\w+:\s*", "", ans)
    # [\W_] = anything that is not a letter or digit
    ans = re.sub(r"^[\W_]+", "", ans)
    return re.split(r"[?\n]", ans)[0].strip()

# ——— 3) Storage ———
exact_matches         = []
token_f1s             = []
rouge_scores          = []
bert_scores           = []
semantic_similarities = []

# ——— 4) Clean the predictions once ———
# Every metric below (including BARTScore) is computed on the same cleaned
# text so the numbers are comparable with each other.
cleaned_predictions = []
for pred_raw in predictions:
    pred = clean_answer(pred_raw)
    # fallback: if cleaning emptied it, use the raw string
    cleaned_predictions.append(pred if pred != "" else pred_raw.strip())

# ——— 5) Per-example string / embedding metrics ———
for pred, expected in tqdm(zip(cleaned_predictions, references),
                           total=len(cleaned_predictions),
                           desc="Evaluating"):
    # Exact Match & Token‑F1
    exact_matches.append(compute_exact_match(pred, expected))
    token_f1s.append(compute_token_f1(pred, expected))

    # Semantic Cosine
    semantic_similarities.append(compute_semantic_similarity(pred, expected))

# ——— 6) Batched metrics ———
# ROUGE and BERTScore are computed in one call each rather than once per
# example; the per-call overhead otherwise dominates the evaluation run time.
if cleaned_predictions:
    # use_aggregator=False returns one ROUGE‑L F-measure per example
    rouge_scores = rouge.compute(
        predictions=cleaned_predictions,
        references=references,
        use_stemmer=True,
        use_aggregator=False
    )["rougeL"]

    # BERTScore (F1), one value per example
    bert_scores = bertscore.compute(
        predictions=cleaned_predictions,
        references=references,
        lang="en"
    )["f1"]

# ——— 7) Compute BARTScore over all pairs ———
bart_scores = compute_bart_score(cleaned_predictions, references, batch_size=8)
bart_avg    = np.mean(bart_scores)

# ——— 8) Print summary ———
print("\n🔍 Evaluation Metrics:")
print(f"  - Exact Match:            {np.mean(exact_matches):.4f}")
print(f"  - Token-level F1:         {np.mean(token_f1s):.4f}")
print(f"  - ROUGE-L:                {np.mean(rouge_scores):.4f}")
print(f"  - BERTScore (F1):         {np.mean(bert_scores):.4f}")
print(f"  - BARTScore (avg lm‐logp): {bart_avg:.4f}")
print(f"  - Semantic Cosine Sim.:   {np.mean(semantic_similarities):.4f}")