# Installing Dependencies

In [None]:
!pip install -U transformers peft bitsandbytes datasets evaluate
!pip install qwen-vl-utils[decord]==0.0.8
!pip install evaluate
!pip install rouge_score
!pip install bert-score
!git clone https://github.com/neulab/BARTScore.git

In [None]:
import sys

# Make the cloned BARTScore checkout importable (`from bart_score import BARTScorer`).
# Guarded so that re-running this cell does not keep appending duplicate entries.
BARTSCORE_DIR = '/kaggle/working/BARTScore'
if BARTSCORE_DIR not in sys.path:
    sys.path.append(BARTSCORE_DIR)

# Imports

In [None]:
#  Import libraries
import gc
import os
import re
from collections import Counter

import pandas as pd
import torch
from PIL import Image
from peft import LoraConfig, get_peft_model
from peft import PeftModel
from qwen_vl_utils import process_vision_info
from torch.utils.data import Dataset, random_split
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
    TrainingArguments,
    Trainer,
)

# PATHS

In [None]:
# Paths and model setup
# Annotation table; later cells read its `image_name`, `question` and `answer` columns.
CSV_PATH     = "/kaggle/input/vr-dataset-final-20k/annotations.csv"
# Directory that the `image_name` values are joined onto.
IMAGE_FOLDER = "/kaggle/input/vr-dataset-final-20k/images/unique_images"
# Model identifier handed to every `from_pretrained` call below.
MODEL_ID     = "Qwen/Qwen2.5-VL-7B-Instruct"


## Loading the model

In [None]:
# Load processor & tokenizer
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    use_fast=False
)
print("→ image placeholder is:", tokenizer.additional_special_tokens[0])

# Load base model in 8-bit.
# Quantization is requested through `quantization_config`: passing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated in recent transformers releases, and this
# notebook always installs the newest one (`pip install -U transformers`).
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True
)

# Apply LoRA: rank-16 adapters on every linear layer of the base model
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules='all-linear',
    task_type="CAUSAL_LM"
)
model = get_peft_model(base_model, lora_config)
# Prints how many parameters the adapters add compared with the frozen base model
model.print_trainable_parameters()

## Loading Dataset

In [None]:
class QwenDataset(Dataset):
    """Image / question / answer rows rendered as one chat-formatted training string.

    The CSV must provide the columns ``image_name``, ``question`` and ``answer``.
    ``image_name`` is resolved relative to ``image_folder``.
    """

    # Fixed instruction placed in front of every question (identical for all samples,
    # so it is built once here instead of on every __getitem__ call).
    INSTRUCTION = (
        "You must answer with exactly one word. "
        "Spaces between words will be treated as multiple words. "
        "Do not include any explanations or punctuation."
    )

    def __init__(self, csv_path, image_folder):
        """Read the annotation table from ``csv_path`` and remember ``image_folder``."""
        self.df = pd.read_csv(csv_path)
        self.image_folder = image_folder

    def __len__(self):
        """Number of annotation rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"image": RGB image, "text": chat-formatted prompt + answer}`` for row ``idx``."""
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_folder, row["image_name"])
        # convert() reads the pixel data and returns a separate image object, so the
        # underlying file can be closed right away instead of waiting for garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        question = row["question"]
        full_prompt = f"{self.INSTRUCTION}\n{question}"
        answer = row["answer"]

        text = f"<|im_start|>user\n<image>\n{full_prompt}<|im_end|>\n<|im_start|>assistant\n{answer}<|im_end|>"

        return {
            "image": image,
            "text": text,
        }


In [None]:
# Build the full dataset once, then carve it into train / validation / test
ds = QwenDataset(CSV_PATH, IMAGE_FOLDER)
total_len = len(ds)

# 80 % / 10 % / remainder: the test split absorbs any rounding leftovers,
# so the three sizes always add up to total_len
train_size = int(0.8 * total_len)
val_size = int(0.1 * total_len)
test_size = total_len - (train_size + val_size)

# A seeded generator makes the split identical on every run
seed = 42
generator = torch.Generator()
generator.manual_seed(seed)

split_sizes = [train_size, val_size, test_size]
train_ds, val_ds, test_ds = random_split(ds, split_sizes, generator=generator)

# Report the sizes in train, validation, test order
for split_size in split_sizes:
    print(split_size)

# Inference

In [None]:
#Paths

# NOTE(review): LORA_WEIGHTS is defined but never used below, so inference runs on the
# plain base model. If the fine-tuned adapter is what should be evaluated, wrap the model
# with `PeftModel.from_pretrained(base_model, LORA_WEIGHTS)` — confirm the intent.
LORA_WEIGHTS = "sohith18/qwen2vl-lora-vqa-7b"  # Updated to match your output dir
MODEL_ID     = "Qwen/Qwen2.5-VL-7B-Instruct"

# Drop the references to any model loaded by an earlier cell so its GPU memory can be
# reclaimed before a second copy of the 7B checkpoint is loaded.
base_model = model = None
gc.collect()
torch.cuda.empty_cache()

#Load base model (no LoRA adapter is attached here)
# `quantization_config` replaces the deprecated `load_in_8bit=True` keyword.
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    trust_remote_code=True
)
model = base_model
model.eval()  # Set to evaluation mode

#Load processor and tokenizer
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)

In [None]:
def extract_question_and_answer(text):
    """Pull the question and the reference answer out of one chat-formatted sample.

    Args:
        text: String containing a ``<|im_start|>user ... <|im_end|>`` turn and,
            optionally, a ``<|im_start|>assistant ... <|im_end|>`` turn.

    Returns:
        ``(question, answer)``. The question is the last non-blank line of the user
        turn (any instruction lines in front of it are dropped); either element is
        ``""`` when the corresponding turn is missing.
    """
    def _turn_body(pattern):
        # Stripped content of the first turn matching `pattern`, or "" when absent
        found = re.search(pattern, text, re.DOTALL)
        return found.group(1).strip() if found else ""

    user_turn = _turn_body(r"<\|im_start\|>user\n(?:<image>\n)?(.*?)<\|im_end\|>")

    # The question is assumed to be the final line of the user turn
    non_blank = [ln.strip() for ln in user_turn.splitlines() if ln.strip()]
    question = non_blank[-1] if non_blank else ""

    answer = _turn_body(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>")
    return question, answer

def predict(image_path, question, image=None):
    """Generate the model's answer to ``question`` about a single image.

    Uses the notebook-level ``model``, ``processor`` and ``tokenizer``.

    Args:
        image_path: Path of the image file; only read when ``image`` is None.
        question: Text sent as the user turn together with the image.
        image: Optional already-loaded image; takes precedence over ``image_path``.

    Returns:
        The newly generated tokens decoded to text, with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        ValueError: If neither ``image`` nor ``image_path`` is provided.
    """
    if image is None:
        if image_path is None:
            raise ValueError("predict() needs either `image` or `image_path`")
        # convert() returns a separate image object, so the file can be closed immediately
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": question}
        ]
    }]

    # Render the chat template as text, ending with the assistant generation prompt
    processed_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    # Tokenize the text and preprocess the image, then move everything to the model's device
    inputs = processor(
        text=processed_text,
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
    ).to(model.device)

    # Prompt length, used below to slice the prompt off the generated sequence
    input_len = inputs["input_ids"].shape[1]

    # Greedy decoding, capped at 20 new tokens
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=20,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,  # Explicitly set EOS token
        )

    # Keep only the tokens produced after the prompt (the answer)
    generated_tokens = output[0][input_len:]

    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

## Testing with one Sample

In [None]:
# Load eval data
CSV_PATH = "/kaggle/input/vr-dataset-final-20k/annotations.csv"
IMAGE_FOLDER = "/kaggle/input/vr-dataset-final-20k/images/unique_images"
df = pd.read_csv(CSV_PATH)


def _resolve_image_path(image_name):
    """Return the location of `image_name` inside IMAGE_FOLDER."""
    return os.path.join(IMAGE_FOLDER, image_name)


# Full path of every image, derived from its file name
df["image_path"] = df["image_name"].apply(_resolve_image_path)

In [None]:
i = 4
# Fetch the sample once; every test_ds[i] access re-opens and decodes the image
sample = test_ds[i]
q, ea = extract_question_and_answer(sample["text"])
pred_answer = predict(None, q, sample["image"])
print(f'Question: {q}')
# Take the first run of word characters. Splitting on \W+ instead yields an empty
# first element whenever the prediction starts with punctuation or a quote.
words = re.findall(r'\w+', pred_answer)
first_word = words[0] if words else ""
print(f'Predicted Answer: {first_word}')
print(f'Expected Answer: {ea}')

In [None]:
import re

row = df.iloc[45]
ans = predict(row["image_path"], row["question"])

# Extract only the first word: the first run of word characters in the prediction.
# (Splitting on \W+ and taking element 0 returns '' when the text starts with
# punctuation or a quote.)
words = re.findall(r'\w+', ans)
first_word = words[0] if words else ""

print("Q:", row["question"])
print("A:", first_word)
print("GT:", row["answer"])


## Evaluation

In [None]:
import re
import numpy as np
from tqdm import tqdm
import torch
from bart_score import BARTScorer
from sentence_transformers import SentenceTransformer, util
import evaluate

# Pick the device for BARTScore: GPU when one is visible, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
# Sentence encoder used by compute_semantic_similarity
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
bart_scorer = BARTScorer(device=device, checkpoint='facebook/bart-large-cnn')

# Metric functions
def compute_exact_match(pred, label):
    """Return 1 when both strings are equal after trimming and lowercasing, else 0."""
    normalized_pred = pred.strip().lower()
    normalized_label = label.strip().lower()
    return 1 if normalized_pred == normalized_label else 0

def compute_token_f1(pred, label):
    """Token-level F1 between a prediction and a reference.

    Both strings are trimmed, lowercased and split on whitespace. The overlap is
    counted as a multiset intersection, so repeated tokens are matched as often as
    they occur on both sides and identical strings always score 1.0.

    Args:
        pred: Predicted answer text.
        label: Reference answer text.

    Returns:
        F1 in [0.0, 1.0]; 0.0 when the two token lists share no token
        (including when either one is empty).
    """
    pred_tokens = pred.strip().lower().split()
    label_tokens = label.strip().lower().split()

    # Counter & Counter keeps the minimum count per token, so the overlap is
    # measured on the same scale as the list lengths used in the denominators.
    overlap = sum((Counter(pred_tokens) & Counter(label_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(label_tokens)
    return 2 * precision * recall / (precision + recall)

def compute_semantic_similarity(pred, label):
    """Cosine similarity between the `embedding_model` encodings of `pred` and `label`."""
    pred_embedding, label_embedding = (
        embedding_model.encode(text, convert_to_tensor=True) for text in (pred, label)
    )
    similarity = util.cos_sim(pred_embedding, label_embedding)
    return float(similarity)


def clean_answer(predicted_answer):
    """Reduce a raw prediction to its first run of ASCII letters.

    Case is preserved. Digits and punctuation are never part of the result.

    NOTE(review): a purely numeric prediction such as "2" therefore becomes
    "Unknown" — confirm that this is intended for counting questions.

    Returns:
        The first alphabetic run, or "Unknown" when the text contains no letters.
    """
    first_alpha_run = re.search(r'[a-zA-Z]+', predicted_answer)
    if first_alpha_run is None:
        return "Unknown"
    return first_alpha_run.group(0)


def extract_question_and_answer(text):
    """Pull the question and the reference answer out of one chat-formatted sample.

    Args:
        text: String containing a ``<|im_start|>user ... <|im_end|>`` turn and,
            optionally, a ``<|im_start|>assistant ... <|im_end|>`` turn.

    Returns:
        ``(question, answer)``. The question is the last non-blank line of the user
        turn (any instruction lines in front of it are dropped); either element is
        ``""`` when the corresponding turn is missing.
    """
    def _turn_body(pattern):
        # Stripped content of the first turn matching `pattern`, or "" when absent
        found = re.search(pattern, text, re.DOTALL)
        return found.group(1).strip() if found else ""

    user_turn = _turn_body(r"<\|im_start\|>user\n(?:<image>\n)?(.*?)<\|im_end\|>")

    # The question is assumed to be the final line of the user turn
    non_blank = [ln.strip() for ln in user_turn.splitlines() if ln.strip()]
    question = non_blank[-1] if non_blank else ""

    answer = _turn_body(r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>")
    return question, answer


# Main evaluation
def evaluate_qwen(eval_dataset):
    """Run `predict` on every sample and report averaged answer-quality metrics.

    For each sample the question and reference answer are parsed out of
    ``sample["text"]``, the model is queried with ``sample["image"]`` and the
    question, and the prediction is reduced with `clean_answer` before scoring.

    Args:
        eval_dataset: Iterable of dicts with an ``"image"`` and a chat-formatted
            ``"text"`` entry (the item layout produced by QwenDataset).

    Returns:
        dict mapping metric name (``exact_match``, ``token_f1``, ``rouge_l``,
        ``bertscore_f1``, ``bart_score``, ``semantic_similarity``) to its mean over
        the dataset. An empty dict is returned when no sample was evaluated, which
        avoids taking the mean of an empty list (NaN plus a RuntimeWarning).
    """
    metric_names = (
        "exact_match",
        "token_f1",
        "rouge_l",
        "bertscore_f1",
        "bart_score",
        "semantic_similarity",
    )
    scores = {name: [] for name in metric_names}

    for sample in tqdm(eval_dataset, desc="Evaluating"):
        image = sample["image"]
        text = sample["text"]
        question, expected_answer = extract_question_and_answer(text)

        # Get prediction
        pred_raw = predict(None, question, image)
        pred_answer = clean_answer(pred_raw)

        print(f'Question: {question}')
        print(f'Predicted Answer: {pred_answer}')
        print(f'Expected Answer: {expected_answer}')

        # Metrics (each one is scored per sample and averaged at the end)
        scores["exact_match"].append(compute_exact_match(pred_answer, expected_answer))
        scores["token_f1"].append(compute_token_f1(pred_answer, expected_answer))

        rouge_result = rouge.compute(predictions=[pred_answer], references=[expected_answer], use_stemmer=True)
        scores["rouge_l"].append(rouge_result["rougeL"])

        bert_result = bertscore.compute(predictions=[pred_answer], references=[expected_answer], lang="en")
        scores["bertscore_f1"].append(bert_result["f1"][0])

        bart_score = bart_scorer.score([pred_answer], [expected_answer])[0]
        scores["bart_score"].append(bart_score)

        scores["semantic_similarity"].append(compute_semantic_similarity(pred_answer, expected_answer))

    if not scores["exact_match"]:
        print("\n Evaluation Metrics: no samples were evaluated.")
        return {}

    summary = {name: float(np.mean(values)) for name, values in scores.items()}

    # Summary
    print("\n Evaluation Metrics:")
    print(f"  - Exact Match:            {summary['exact_match']:.4f}")
    print(f"  - Token-level F1:         {summary['token_f1']:.4f}")
    print(f"  - ROUGE-L:                {summary['rouge_l']:.4f}")
    print(f"  - BERTScore (F1):         {summary['bertscore_f1']:.4f}")
    print(f"  - BARTScore:              {summary['bart_score']:.4f}")
    print(f"  - Semantic Cosine Sim.:   {summary['semantic_similarity']:.4f}")

    return summary


In [None]:
from torch.utils.data import Subset

# Evaluate on at most the first 500 test samples. min() keeps every index valid when
# the test split is smaller; an out-of-range index would raise IndexError during
# iteration, which ends the loop early without any error being shown.
subset_500 = Subset(test_ds, range(min(500, len(test_ds))))
evaluate_qwen(subset_500)