In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
# The middle element yielded by os.walk (the sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load the VQA dataset, run inference with BLIP2-FLAN-T5-XL, and calculate the following metrics:
1. exact_match_accuracy
2. relaxed_accuracy
3. word_overlap

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Directory that receives the CSV and PNG artifacts written by the evaluation.
os.makedirs("/kaggle/working/results", exist_ok=True)

print("Loading model...")
model_name = "Salesforce/blip2-flan-t5-xl" 
processor = Blip2Processor.from_pretrained(model_name)
# Half precision when running on CUDA, full precision on CPU.
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.to(device)
print("Model loaded successfully")

def load_kaggle_vqa_dataset(csv_path="/kaggle/input/vqa-dataset/vqa_dataset_gemini_final.csv"):
    """Read the VQA annotation CSV into a DataFrame.

    Args:
        csv_path: Location of the CSV file. The default is the Kaggle path this
            notebook was written against, so existing no-argument calls behave
            exactly as before.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file cannot be
        read or parsed. The error is printed instead of raised so the caller
        can decide how to proceed.
    """
    print(f"Loading dataset from {csv_path}")
    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure with None.
        print(f"Error loading dataset: {e}")
        return None

    print(f"Dataset loaded successfully with {len(df)} samples")
    print(f"Columns: {df.columns.tolist()}")
    return df

def run_inference(image_path, question):
    """Generate the model's answer to ``question`` about the image at ``image_path``.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        image_path: Path of the image file to open.
        question: Text passed to the processor together with the image.

    Returns:
        The decoded generation with special tokens removed and surrounding
        whitespace stripped. If anything raises, the error is printed and an
        empty string is returned so that one bad sample does not abort a long
        evaluation loop.
    """
    try:
        # Image.open is lazy and keeps the underlying file open; the context
        # manager releases the handle as soon as convert() has produced its
        # own in-memory RGB copy.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Move the inputs to the model's device and cast them to the model's
        # dtype, following the BLIP-2 usage shown in the Transformers docs for
        # half-precision models.
        inputs = processor(images=image, text=question, return_tensors="pt").to(device, model.dtype)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=5,
                min_length=1,
                do_sample=False,
            )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        return generated_text
    except Exception as e:
        print(f"Error during inference for image {image_path}: {e}")
        return ""

def evaluate_predictions(predictions, ground_truths):
    """Score predicted answers against reference answers.

    Both sides are lower-cased and stripped of surrounding whitespace before
    any comparison.

    Args:
        predictions: Sequence of predicted answer strings.
        ground_truths: Sequence of reference answer strings, aligned
            position-by-position with ``predictions``.

    Returns:
        A dict with three floats in [0, 1]:
          * ``exact_match_accuracy`` - fraction of pairs that are equal.
          * ``relaxed_accuracy`` - fraction of pairs that are equal, or where
            one non-empty string contains the other.
          * ``word_overlap`` - mean fraction of the reference's distinct words
            that also occur in the prediction.
        All three are 0.0 when there are no pairs to score.
    """
    pairs = [(pred.lower().strip(), gt.lower().strip())
             for pred, gt in zip(predictions, ground_truths)]

    if not pairs:
        # Nothing to score; avoid dividing by zero.
        return {
            "exact_match_accuracy": 0.0,
            "relaxed_accuracy": 0.0,
            "word_overlap": 0.0
        }

    def is_relaxed_match(pred, gt):
        if pred == gt:
            return True
        # "" is a substring of every string, so an empty prediction (e.g. a
        # failed inference) must not be treated as containing/contained.
        if not pred or not gt:
            return False
        return gt in pred or pred in gt

    def word_overlap_score(pred, gt):
        if not pred or not gt:
            return 0.0
        pred_words = set(pred.split())
        gt_words = set(gt.split())
        return len(pred_words.intersection(gt_words)) / len(gt_words) if gt_words else 0.0

    total = len(pairs)
    accuracy = sum(pred == gt for pred, gt in pairs) / total
    relaxed_accuracy = sum(is_relaxed_match(pred, gt) for pred, gt in pairs) / total
    avg_word_overlap = sum(word_overlap_score(pred, gt) for pred, gt in pairs) / total

    return {
        "exact_match_accuracy": accuracy,
        "relaxed_accuracy": relaxed_accuracy,
        "word_overlap": avg_word_overlap
    }

def _resolve_sample_sizes(sample_sizes, total_size):
    """Convert requested sizes into a sorted list of distinct, positive row counts.

    ``'full'`` maps to ``total_size``; numeric entries are truncated to int and
    capped at ``total_size``; any other entry is ignored.
    """
    resolved = set()
    for size in sample_sizes:
        if size == 'full':
            count = total_size
        elif isinstance(size, (int, float)):
            count = min(int(size), total_size)
        else:
            continue
        if count > 0:
            resolved.add(count)
    return sorted(resolved)


def run_progressive_baseline_evaluation(sample_sizes=(3000, 7000, 'full')):
    """Evaluate the model on progressively larger samples of the VQA dataset.

    For every requested size the function samples rows (fixed seed 42), runs
    inference, scores the predictions, and writes a per-size results CSV. At
    the end it writes a metrics summary CSV, a CSV with all per-size results
    stacked, and a metrics plot.

    Predictions are cached by (image path, question), so a row that reappears
    in a later sample is not sent through the model again. As a consequence
    the reported time for a later size covers only the rows that were new to
    that stage.

    Args:
        sample_sizes: Iterable of row counts and/or the string ``'full'``.
            Sizes larger than the dataset are capped; duplicates after capping
            are evaluated once.

    Returns:
        ``(all_metrics, cumulative_results)`` where ``all_metrics`` maps each
        evaluated row count to its metrics dict and ``cumulative_results`` is a
        DataFrame of every per-size prediction. If the dataset cannot be
        loaded, an empty dict and an empty DataFrame are returned so callers
        can still unpack the pair.
    """
    result_columns = ['image_id', 'question', 'ground_truth', 'prediction', 'sample_size']

    start_time_total = time.time()
    full_df = load_kaggle_vqa_dataset()
    if full_df is None:
        print("Failed to load dataset. Exiting.")
        return {}, pd.DataFrame(columns=result_columns)

    total_size = len(full_df)
    actual_sample_sizes = _resolve_sample_sizes(sample_sizes, total_size)

    all_metrics = {}
    all_sample_times = {}
    per_size_results = []
    # (image_path, question) -> prediction, shared across sample sizes.
    prediction_cache = {}

    for sample_size in actual_sample_sizes:
        sample_name = 'full' if sample_size == total_size else str(sample_size)
        print(f"\n{'='*50}\nProcessing sample size: {sample_name} ({sample_size} samples)\n{'='*50}")
        start_time = time.time()

        current_df = full_df.sample(sample_size, random_state=42) if sample_size < total_size else full_df

        predictions, ground_truths, image_ids, questions = [], [], [], []

        print(f"Running inference on {sample_size} samples...")
        for _, row in tqdm(current_df.iterrows(), total=len(current_df)):
            image_path = row['path']
            question = row['generated_question']
            ground_truth = row['generated_answer']

            # Missing CSV cells arrive as NaN (a float); skip such rows before
            # any string method is called on them.
            if not all(isinstance(value, str) for value in (image_path, question, ground_truth)):
                continue

            if not image_path.startswith('/kaggle'):
                image_path = os.path.join('/kaggle/input/vqa-dataset/req-images/', image_path)

            cache_key = (image_path, question)
            if cache_key not in prediction_cache:
                prediction_cache[cache_key] = run_inference(image_path, question)
            prediction = prediction_cache[cache_key]

            predictions.append(prediction)
            ground_truths.append(ground_truth)
            image_ids.append(os.path.basename(image_path))
            questions.append(question)

        if not predictions:
            # Every row was skipped; there is nothing to score or save.
            print(f"No valid rows for sample size {sample_name}; skipping evaluation.")
            continue

        print(f"Evaluating predictions for sample size {sample_name}...")
        metrics = evaluate_predictions(predictions, ground_truths)
        all_metrics[sample_size] = metrics
        elapsed_time = time.time() - start_time
        all_sample_times[sample_size] = elapsed_time

        print(f"\nResults for sample size {sample_name}:")
        for metric_name, metric_value in metrics.items():
            print(f"{metric_name}: {metric_value:.4f}")
        print(f"Time taken: {elapsed_time:.2f} seconds")

        results_df = pd.DataFrame({
            'image_id': image_ids,
            'question': questions,
            'ground_truth': ground_truths,
            'prediction': predictions,
            'sample_size': [sample_name] * len(predictions)
        })
        per_size_results.append(results_df)

        sample_results_path = f"/kaggle/working/results/blip2_baseline_results_{sample_name}.csv"
        results_df.to_csv(sample_results_path, index=False)
        print(f"Results for sample size {sample_name} saved to {sample_results_path}")

    # Concatenate once instead of growing a frame inside the loop.
    if per_size_results:
        cumulative_results = pd.concat(per_size_results, ignore_index=True)
    else:
        cumulative_results = pd.DataFrame(columns=result_columns)

    evaluated_sizes = [size for size in actual_sample_sizes if size in all_metrics]
    size_labels = [str(size) if size != total_size else 'full' for size in evaluated_sizes]

    metrics_df = pd.DataFrame([all_metrics[size] for size in evaluated_sizes], index=size_labels)
    metrics_df['sample_size'] = size_labels
    metrics_df['time_seconds'] = [all_sample_times[size] for size in evaluated_sizes]

    metrics_path = "/kaggle/working/results/blip2_baseline_all_metrics.csv"
    metrics_df.to_csv(metrics_path)
    print(f"\nAll metrics saved to {metrics_path}")

    all_results_path = "/kaggle/working/results/blip2_baseline_all_results.csv"
    cumulative_results.to_csv(all_results_path, index=False)
    print(f"All results saved to {all_results_path}")

    # The plot needs at least one metrics entry to discover the metric names.
    if all_metrics:
        plot_metrics(all_metrics, evaluated_sizes, total_size)

    total_time = time.time() - start_time_total
    print(f"\nTotal evaluation time: {total_time:.2f} seconds")
    return all_metrics, cumulative_results

def plot_metrics(all_metrics, sample_sizes, total_size):
    """Plot every metric as a line over the sample sizes and save the figure as a PNG.

    Args:
        all_metrics: Mapping from sample size to that size's metrics dict. The
            metric names are taken from the first entry.
        sample_sizes: Sample sizes to plot, in x-axis order; each must be a key
            of ``all_metrics``.
        total_size: Row count of the whole dataset; a sample size equal to it
            is labelled 'full' on the x-axis.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    first_entry = next(iter(all_metrics.values()))
    tick_labels = ['full' if size == total_size else str(size) for size in sample_sizes]

    for metric in list(first_entry.keys()):
        series = [all_metrics[size][metric] for size in sample_sizes]
        ax.plot(tick_labels, series, marker='o', linewidth=2, label=metric)

    ax.set_xlabel('Sample Size')
    ax.set_ylabel('Score')
    ax.set_title('BLIP-2 Performance Metrics Across Sample Sizes')
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend()
    fig.tight_layout()

    plot_path = "/kaggle/working/results/blip2_metrics_by_sample_size.png"
    fig.savefig(plot_path)
    print(f"Metrics plot saved to {plot_path}")

# Execute
if __name__ == "__main__":
    print("Starting baseline evaluation...")
    sample_sizes = [3000, 7000, 10000, 'full']

    # Guard against a None result before unpacking: unpacking None would raise
    # TypeError and hide whatever message explained the actual failure.
    outcome = run_progressive_baseline_evaluation(sample_sizes=sample_sizes)
    if outcome is None:
        all_metrics, all_results = {}, None
    else:
        all_metrics, all_results = outcome

    # Only report completion when at least one sample size was actually scored.
    if all_metrics:
        print("Progressive baseline evaluation completed!")
    else:
        print("Baseline evaluation did not produce any metrics.")


Using device: cuda
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully
Starting baseline evaluation...
Loading dataset from /kaggle/input/vqa-dataset/vqa_dataset_gemini_final.csv
Dataset loaded successfully with 19497 samples
Columns: ['path', 'generated_question', 'generated_answer']

Processing sample size: 3000 (3000 samples)
Running inference on 3000 samples...


100%|██████████| 3000/3000 [1:25:00<00:00,  1.70s/it]


Evaluating predictions for sample size 3000...

Results for sample size 3000:
exact_match_accuracy: 0.0140
relaxed_accuracy: 0.2465
word_overlap: 0.2220
Time taken: 5100.58 seconds
Results for sample size 3000 saved to /kaggle/working/results/blip2_baseline_results_3000.csv

Processing sample size: 7000 (7000 samples)
Running inference on 7000 samples...


100%|██████████| 7000/7000 [3:18:01<00:00,  1.70s/it]  


Evaluating predictions for sample size 7000...

Results for sample size 7000:
exact_match_accuracy: 0.0150
relaxed_accuracy: 0.2462
word_overlap: 0.2237
Time taken: 11881.64 seconds
Results for sample size 7000 saved to /kaggle/working/results/blip2_baseline_results_7000.csv

Processing sample size: 10000 (10000 samples)
Running inference on 10000 samples...


 85%|████████▌ | 8531/10000 [4:00:09<53:00,  2.17s/it]  