# Packages and token access

In [None]:
!pip install torch transformers accelerate bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import userdata
import os
from huggingface_hub import login

hf_token = userdata.get('hf_token')
os.environ["HUGGINGFACE_TOKEN"] = hf_token
login(token=hf_token)


## 2. Quantization inference testing


In [None]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig
from PIL import Image
import requests
import time
import gc
import numpy as np


# Function to benchmark inference
def benchmark_inference(model, processor, image_url, prompt="caption en", num_runs=3):
    # Load image
    response = requests.get(image_url, stream=True)
    image = Image.open(response.raw)

    # Process inputs
    prompt = "<image>" + prompt  # Add image token to beginning of prompt
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Move to appropriate device
    device = next(model.parameters()).device
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    # Record input sequence length
    input_len = inputs["input_ids"].shape[-1]

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    # Warmup run
    with torch.inference_mode():
        _ = model.generate(**inputs, max_new_tokens=20, do_sample=False)

    # Benchmark runs
    times = []
    for _ in range(num_runs):
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        start_time = time.time()
        with torch.inference_mode():
            outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        times.append(time.time() - start_time)

    # Calculate average time
    avg_time = np.mean(times)

    # Decode output
    generation = outputs[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)

    # Calculate memory usage
    memory = sum(p.numel() * (p.element_size() if hasattr(p, 'element_size') else 4)
                for p in model.parameters()) / (1024 * 1024)

    # Get peak memory
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / (1024 * 1024)
    else:
        peak_memory = 0

    return {
        "time": avg_time,
        "output": decoded,
        "memory": memory,
        "peak_memory": peak_memory
    }

# Benchmark original FP32 model
def benchmark_fp32():
    print("\n=== Benchmarking FP32 Model ===")
    model_id = "google/paligemma-3b-mix-224"
    processor = AutoProcessor.from_pretrained(model_id)

    # Load model
    model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Benchmark
    test_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    results = benchmark_inference(model, processor, test_image_url)

    print(f"FP32 model size: {results['memory']:.2f} MB")
    print(f"FP32 inference time: {results['time']:.4f} seconds")
    print(f"FP32 output: {results['output']}")

    # Clean up
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return results, processor

# 8-bit quantization
def quantize_8bit():
    print("\n=== Quantizing to 8-bit ===")
    model_id = "google/paligemma-3b-mix-224"
    processor = AutoProcessor.from_pretrained(model_id)

    # Configure 8-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True
    )

    # Load model with 8-bit quantization
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    model.eval()

    # Benchmark
    test_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    results = benchmark_inference(model, processor, test_image_url)

    print(f"8-bit model size: {results['memory']:.2f} MB")
    print(f"8-bit inference time: {results['time']:.4f} seconds")
    print(f"8-bit output: {results['output']}")

    # Clean up
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return results

# 4-bit quantization
def quantize_4bit():
    print("\n=== Quantizing to 4-bit ===")
    model_id = "google/paligemma-3b-mix-224"
    processor = AutoProcessor.from_pretrained(model_id)

    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",  # normalized float 4 (better quality than standard int4)
        bnb_4bit_use_double_quant=True,
    )

    # Load model with 4-bit quantization
    model = PaliGemmaForConditionalGeneration.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
    )
    model.eval()

    # Benchmark
    test_image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
    results = benchmark_inference(model, processor, test_image_url)

    print(f"4-bit model size: {results['memory']:.2f} MB")
    print(f"4-bit inference time: {results['time']:.4f} seconds")
    print(f"4-bit output: {results['output']}")

    # Clean up
    del model
    gc.collect()
    torch.cuda.empty_cache()

    return results

# Run complete workflow and compare results
def main():
    print("Starting PaliGemma Quantization")

    # Check if CUDA is available
    if torch.cuda.is_available():
        print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("CUDA not available, using CPU")

    # Run FP32 benchmark
    fp32_results, _ = benchmark_fp32()

    # Quantize and benchmark 8-bit
    int8_results = quantize_8bit()

    # Quantize and benchmark 4-bit
    int4_results = quantize_4bit()

    # Print summary and comparisons
    print("\n=== Quantization Summary ===")
    print(f"FP32 model: {fp32_results['memory']:.2f} MB, {fp32_results['time']:.4f} sec")
    print(f"8-bit model: {int8_results['memory']:.2f} MB, {int8_results['time']:.4f} sec")
    print(f"4-bit model: {int4_results['memory']:.2f} MB, {int4_results['time']:.4f} sec")

    print("\n=== Size Reduction ===")
    print(f"8-bit vs FP32: {fp32_results['memory'] / int8_results['memory']:.2f}x smaller")
    print(f"4-bit vs FP32: {fp32_results['memory'] / int4_results['memory']:.2f}x smaller")

    print("\n=== Speed Comparison ===")
    print(f"8-bit vs FP32: {fp32_results['time'] / int8_results['time']:.2f}x faster")
    print(f"4-bit vs FP32: {fp32_results['time'] / int4_results['time']:.2f}x faster")

    print("\n=== Output Quality Comparison ===")
    print("FP32:", fp32_results['output'])
    print("8-bit:", int8_results['output'])
    print("4-bit:", int4_results['output'])

# Run the quantization workflow
if __name__ == "__main__":
    main()

Starting PaliGemma Quantization
CUDA available: NVIDIA L4

=== Benchmarking FP32 Model ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

FP32 model size: 11152.14 MB
FP32 inference time: 1.0818 seconds
FP32 output: In this image we can see a car on the road. In the background there is a wall,

=== Quantizing to 8-bit ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

8-bit model size: 3291.79 MB
8-bit inference time: 1.8494 seconds
8-bit output: In this image we can see a car on the road. In the background there is a wall,

=== Quantizing to 4-bit ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

4-bit model size: 2149.65 MB
4-bit inference time: 0.7919 seconds
4-bit output: In this image we can see a car on the road. In the background there is a wall,

=== Quantization Summary ===
FP32 model: 11152.14 MB, 1.0818 sec
8-bit model: 3291.79 MB, 1.8494 sec
4-bit model: 2149.65 MB, 0.7919 sec

=== Size Reduction ===
8-bit vs FP32: 3.39x smaller
4-bit vs FP32: 5.19x smaller

=== Speed Comparison ===
8-bit vs FP32: 0.58x faster
4-bit vs FP32: 1.37x faster

=== Output Quality Comparison ===
FP32: In this image we can see a car on the road. In the background there is a wall,
8-bit: In this image we can see a car on the road. In the background there is a wall,
4-bit: In this image we can see a car on the road. In the background there is a wall,


## 3. Evaluations

In [None]:
!pip install editdistance datasets -q
!pip install datasets --upgrade

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m


In [None]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig
from datasets import load_dataset
import datasets
import editdistance
from tqdm import tqdm
import time
import gc
import numpy as np


# Helper function to detect device
def detect_device():
    if torch.cuda.is_available():
        return "cuda", torch.float16
    else:
        return "cpu", torch.float32

# ANLS calculation function
def get_anls(model_answer, ground_truth_answers):
    """Calculate ANLS score between model answer and list of ground truth answers"""
    model_answer = model_answer.lower().strip()

    # OK-VQA has an array of answer strings
    scores = []
    for gt in ground_truth_answers:
        gt = gt.lower().strip()
        iou = 1 - editdistance.eval(model_answer, gt) / max(len(model_answer), len(gt))
        anls = iou if iou >= 0.5 else 0.0
        scores.append(anls)

    # Return max score across all answers
    return max(scores) if scores else 0.0

# Load model function (with appropriate quantization)
def load_model(model_type="fp32"):
    model_id = "google/paligemma-3b-mix-224"
    processor = AutoProcessor.from_pretrained(model_id)

    if model_type == "fp32":
        device, dtype = detect_device()
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=dtype,
            device_map={"": device}
        )
    elif model_type == "int8":
        # Fixed 8-bit config (removed invalid parameters)
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0  # Helps with performance
        )
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
        )
    elif model_type == "int4":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
        )

    model.eval()
    return model, processor

# Benchmark function
def benchmark_model(model, processor, dataset, model_type):
    device = next(model.parameters()).device
    print(f"Running benchmark for {model_type} model on {device}")

    start_time = time.time()
    scores = []
    inference_times = []
    example_outputs = []
    results_by_question_type = {}
    results_by_answer_type = {}

    for i, example in enumerate(tqdm(dataset)):
        image = example["image"]
        question = example["question"]
        answers = example["answers"]
        question_type = example["question_type"]
        answer_type = example["answer_type"]

        # Format prompt with image token to avoid warning
        prompt = f"<image>{question}"

        # Process inputs
        model_inputs = processor(text=prompt, images=image, return_tensors="pt")
        model_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in model_inputs.items()}
        input_len = model_inputs["input_ids"].shape[-1]

        # Measure inference time
        torch.cuda.synchronize() if torch.cuda.is_available() else None
        inf_start = time.time()

        with torch.inference_mode():
            generation = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
            generation = generation[0][input_len:]
            model_answer = processor.decode(generation, skip_special_tokens=True)

        torch.cuda.synchronize() if torch.cuda.is_available() else None
        inference_time = time.time() - inf_start
        inference_times.append(inference_time)

        # Calculate score
        anls = max(get_anls(model_answer, gt) for gt in answers)
        scores.append(anls)

        # Track by question and answer type
        if question_type not in results_by_question_type:
            results_by_question_type[question_type] = []
        results_by_question_type[question_type].append(anls)

        if answer_type not in results_by_answer_type:
            results_by_answer_type[answer_type] = []
        results_by_answer_type[answer_type].append(anls)

        # Save examples for later inspection
        if i < 5:  # Save first 5 examples
            example_outputs.append({
                "question": question,
                "model_answer": model_answer,
                "gt_answers": answers,
                "anls": anls
            })

    # Calculate metrics
    avg_anls = sum(scores) / len(scores) if scores else 0
    avg_time = sum(inference_times) / len(inference_times) if inference_times else 0
    total_time = time.time() - start_time

    # Memory info
    memory = sum(p.numel() * (2 if model_type=="int4" else (4 if model_type=="int8" else 16))
                for p in model.parameters()) / (8 * 1024 * 1024)

    # Print some examples
    print(f"\n=== Example outputs from {model_type} model ===")
    for i, ex in enumerate(example_outputs[:3]):
        print(f"Q: {ex['question']}")
        print(f"A (model): {ex['model_answer']}")
        print(f"A (ground truth): {ex['gt_answers'][0]}")
        print(f"ANLS: {ex['anls']:.4f}\n")

    # Add detailed breakdown by question/answer type
    breakdown = {
        "by_question_type": {qt: sum(scores)/len(scores) for qt, scores in results_by_question_type.items()},
        "by_answer_type": {at: sum(scores)/len(scores) for at, scores in results_by_answer_type.items()}
    }

    return {
        "anls": avg_anls,
        "avg_inference_time": avg_time,
        "total_time": total_time,
        "memory_gb": memory,
        "examples": example_outputs,
        "breakdown": breakdown  # Add detailed breakdown
    }

# Main benchmark function
def run_benchmarks():
    # Load a validation dataset - VQA v2 has good compatibility with vision-language models
    print("Loading dataset...")
    dataset = load_dataset("lmms-lab/OK-VQA", split="val2014[:30]")

    # Prepare structured data based on OK-VQA structure
    structured_data = []
    for item in dataset:
        structured_data.append({
            "image": item["image"],
            "question": item["question"],
            "question_type": item["question_type"],
            "answer_type": item["answer_type"],
            "answers": item["answers"],  # Already an array in OK-VQA
            "question_id": item["question_id"]
        })

    results = {}

    # Benchmark each model type separately to avoid memory issues
    for model_type in ["fp32", "int8", "int4"]:
        print(f"\n=== Benchmarking {model_type} model ===")
        try:
            model, processor = load_model(model_type)
            results[model_type] = benchmark_model(model, processor, structured_data, model_type)

            # Clean up to prevent OOM errors
            del model
            gc.collect()
            torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error benchmarking {model_type} model: {e}")
            results[model_type] = {"error": str(e)}

    # Print comparison summary
    print("\n=== Benchmark Summary ===")
    print("Model | ANLS Score | Avg Time (s) | Memory (GB)")
    print("------|------------|--------------|------------")
    for model_type in ["fp32", "int8", "int4"]:
        if "error" not in results[model_type]:
            print(f"{model_type} | {results[model_type]['anls']:.4f} | {results[model_type]['avg_inference_time']:.4f} | {results[model_type]['memory_gb']:.2f}")
    print("\n=== Performance by Question Type ===")
    common_types = set()
    for model_type in ["fp32", "int8", "int4"]:
        if "error" not in results[model_type]:
            common_types.update(results[model_type]["breakdown"]["by_question_type"].keys())

    for q_type in sorted(common_types):
        print(f"\nQuestion Type: {q_type}")
        for model_type in ["fp32", "int8", "int4"]:
            if "error" not in results[model_type] and q_type in results[model_type]["breakdown"]["by_question_type"]:
                score = results[model_type]["breakdown"]["by_question_type"][q_type]
                print(f"  {model_type}: {score:.4f}")


    # Quality and performance ratios
    if all(model_type in results and "error" not in results[model_type] for model_type in ["fp32", "int8", "int4"]):
        print("\n=== Performance Comparison ===")
        print(f"INT8/FP32 quality ratio: {results['int8']['anls']/results['fp32']['anls']:.4f}x")
        print(f"INT4/FP32 quality ratio: {results['int4']['anls']/results['fp32']['anls']:.4f}x")

        print(f"INT8/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int8']['avg_inference_time']:.2f}x")
        print(f"INT4/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int4']['avg_inference_time']:.2f}x")

        print(f"INT8/FP32 memory ratio: {results['fp32']['memory_gb']/results['int8']['memory_gb']:.2f}x")
        print(f"INT4/FP32 memory ratio: {results['fp32']['memory_gb']/results['int4']['memory_gb']:.2f}x")

    return results

# Execute the benchmark
run_benchmarks()

Loading dataset...

=== Benchmarking fp32 model ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running benchmark for fp32 model on cuda:0


100%|██████████| 30/30 [00:03<00:00,  9.42it/s]



=== Example outputs from fp32 model ===
Q: What sport can you use this for?
A (model): race
A (ground truth): racing
ANLS: 0.0000

Q: Name the type of plant this is?
A (model): fern
A (ground truth): vine
ANLS: 0.0000

Q: What toy is this?
A (model): teddy bear
A (ground truth): stuffed animal
ANLS: 0.0000


=== Benchmarking int8 model ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running benchmark for int8 model on cuda:0


100%|██████████| 30/30 [00:10<00:00,  2.89it/s]



=== Example outputs from int8 model ===
Q: What sport can you use this for?
A (model): race
A (ground truth): racing
ANLS: 0.0000

Q: Name the type of plant this is?
A (model): fern
A (ground truth): vine
ANLS: 0.0000

Q: What toy is this?
A (model): teddy bear
A (ground truth): stuffed animal
ANLS: 0.0000


=== Benchmarking int4 model ===


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running benchmark for int4 model on cuda:0


100%|██████████| 30/30 [00:10<00:00,  2.96it/s]



=== Example outputs from int4 model ===
Q: What sport can you use this for?
A (model): riding
A (ground truth): racing
ANLS: 0.0000

Q: Name the type of plant this is?
A (model): fern
A (ground truth): vine
ANLS: 0.0000

Q: What toy is this?
A (model): teddy bear
A (ground truth): stuffed animal
ANLS: 0.0000


=== Benchmark Summary ===
Model | ANLS Score | Avg Time (s) | Memory (GB)
------|------------|--------------|------------
fp32 | 0.0000 | 0.0981 | 5576.07
int8 | 0.0000 | 0.3372 | 1394.02
int4 | 0.0167 | 0.3290 | 411.47

=== Performance by Question Type ===

Question Type: Cooking and Food
  fp32: 0.0000
  int8: 0.0000
  int4: 0.0000

Question Type: Objects, Material and Clothing
  fp32: 0.0000
  int8: 0.0000
  int4: 0.0000

Question Type: Other
  fp32: 0.0000
  int8: 0.0000
  int4: 0.0000

Question Type: People and Everyday life
  fp32: 0.0000
  int8: 0.0000
  int4: 0.0000

Question Type: Plants and Animals
  fp32: 0.0000
  int8: 0.0000
  int4: 0.0000

Question Type: Science an

ZeroDivisionError: float division by zero

## 4. Counting evals

In [None]:
!pip install editdistance datasets -q
!pip install datasets --upgrade

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m


In [None]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig
from datasets import load_dataset
from PIL import Image
import requests
import time
from tqdm import tqdm
import gc
import numpy as np


def load_tallyqa_dataset():
    print("Loading TallyQA dataset from vikhyatk/tallyqa-test...")

    try:
        # Correctly load the dataset by properly accessing the test split
        dataset = load_dataset("vikhyatk/tallyqa-test")['test']
        print(f"Loaded dataset with {len(dataset)} items")

        structured_data = []

        # Use a smaller sample (300 items) for quicker benchmarking
        sample_size = min(300, len(dataset))

        for i in range(sample_size):
            item = dataset[i]
            if 'image' in item and 'qa' in item:
                image = item['image']
                qa_list = item['qa']

                for qa in qa_list:
                    # Take just one question per image to keep the benchmark faster
                    structured_data.append({
                        "image": image,
                        "question": qa['question'],
                        "answers": [qa['answer']],
                        "question_type": "counting",
                        "issimple": qa.get('is_simple', True),
                        "question_id": len(structured_data)
                    })
                    break  # Just use the first question per image

        print(f"Prepared {len(structured_data)} QA pairs for benchmarking")

        if len(structured_data) > 0:
            return structured_data
        else:
            print("No valid data found, using synthetic dataset")

    except Exception as e:
        print(f"Error loading TallyQA dataset: {e}")


def load_model(model_type):
    """
    Load PaliGemma model with specified quantization using BitsAndBytesConfig
    """
    print(f"Loading PaliGemma model with {model_type} precision...")

    # Base model ID
    model_id = "google/paligemma-3b-mix-224"

    # Set device to CUDA if available
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Configure based on quantization type
    if model_type == "fp32" or model_type == "bfloat16":
        # Use bfloat16 as recommended by the authors for best performance
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map=device,
            revision="bfloat16"  # Using the bfloat16 specific revision
        ).eval()
    elif model_type == "int8":
        # Load with 8-bit quantization
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=quantization_config,
            device_map=device,
            torch_dtype=torch.bfloat16  # Still use bfloat16 as base dtype
        ).eval()
    elif model_type == "int4":
        # Load with 4-bit quantization
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
        )
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=quantization_config,
            device_map=device
        ).eval()
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    # Load processor
    processor = AutoProcessor.from_pretrained(model_id)

    return model, processor

def evaluate_counting_answer(model_answer, ground_truth):
    """
    Evaluate counting question responses by extracting numbers
    """
    import re

    # Convert ground truth to integer
    gt_count = int(ground_truth[0])

    # Extract numbers from model answer
    numbers = re.findall(r'\b\d+\b', model_answer)

    if not numbers:
        return 0.0  # No number found in response

    # Use the first number found in the response
    try:
        predicted_count = int(numbers[0])

        # Calculate accuracy based on exact match or threshold
        if predicted_count == gt_count:
            return 1.0  # Exact match
        else:
            # Alternative: threshold-based score for close answers
            error = abs(predicted_count - gt_count)
            if gt_count > 0:
                relative_error = error / gt_count
                if relative_error <= 0.1:  # Within 10% error
                    return 0.5

            # For small counts, allow off-by-one
            if gt_count <= 10 and error == 1:
                return 0.5  # Close answer (off by 1)

            return 0.0  # Wrong answer
    except:
        return 0.0


def benchmark_model_tallyqa(model, processor, dataset, model_type):
    device = next(model.parameters()).device
    print(f"Running benchmark for {model_type} model on {device}")

    start_time = time.time()
    scores = []
    inference_times = []
    simple_scores = []
    complex_scores = []
    example_outputs = []

    # Make sure we have a dataset before proceeding
    if dataset is None or len(dataset) == 0:
        print("Empty dataset, skipping benchmark")
        return {
            "accuracy": 0.0,
            "simple_accuracy": 0.0,
            "complex_accuracy": 0.0,
            "avg_inference_time": 0.0,
            "total_time": 0.0,
            "memory_gb": 0.0,
            "examples": []
        }

    # Use tqdm function properly
    for i, example in enumerate(tqdm(dataset[:300])):  # Limit to 300 for faster results
        image = example["image"]
        question = example["question"]
        answers = example["answers"]
        issimple = example["issimple"]

        # Format prompt specifically for counting questions
        prompt = f"<image>Look at the image and answer this counting question: {question} Give just a number as your answer."

        # Process inputs
        try:
            model_inputs = processor(text=prompt, images=image, return_tensors="pt")
            model_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in model_inputs.items()}
            input_len = model_inputs["input_ids"].shape[-1]

            # Measure inference time
            torch.cuda.synchronize() if torch.cuda.is_available() else None
            inf_start = time.time()

            with torch.inference_mode():
                generation = model.generate(**model_inputs, max_new_tokens=20, do_sample=False)
                generation = generation[0][input_len:]
                model_answer = processor.decode(generation, skip_special_tokens=True)

            torch.cuda.synchronize() if torch.cuda.is_available() else None
            inference_time = time.time() - inf_start
            inference_times.append(inference_time)

            # Evaluate counting accuracy
            score = evaluate_counting_answer(model_answer, answers)
            scores.append(score)

            # Track simple vs complex questions
            if issimple:
                simple_scores.append(score)
            else:
                complex_scores.append(score)

            # Save examples for later inspection
            if i < 5:
                example_outputs.append({
                    "question": question,
                    "model_answer": model_answer,
                    "ground_truth": answers[0],
                    "score": score,
                    "inference_time": inference_time
                })
        except Exception as e:
            print(f"Error processing example {i}: {e}")
            traceback.print_exc()
            continue

    # Calculate metrics
    avg_score = sum(scores) / len(scores) if scores else 0
    avg_simple = sum(simple_scores) / len(simple_scores) if simple_scores else 0
    avg_complex = sum(complex_scores) / len(complex_scores) if complex_scores else 0
    avg_time = sum(inference_times) / len(inference_times) if inference_times else 0
    total_time = time.time() - start_time

    # Calculate memory usage
    memory_gb = sum(p.numel() * (0.5 if model_type=="int4" else (1 if model_type=="int8" else (2 if model_type in ["bfloat16", "fp16"] else 4)))
                for p in model.parameters()) / (1024 * 1024 * 1024)

    return {
        "accuracy": avg_score,
        "simple_accuracy": avg_simple,
        "complex_accuracy": avg_complex,
        "avg_inference_time": avg_time,
        "total_time": total_time,
        "memory_gb": memory_gb,
        "examples": example_outputs
    }


def run_tallyqa_benchmarks():
    # Load TallyQA dataset
    print("Loading TallyQA dataset...")
    structured_data = load_tallyqa_dataset()

    # Check if we actually got data
    if structured_data is None or len(structured_data) == 0:
        print("Failed to load dataset. Cannot proceed with benchmarking.")
        return {"error": "Failed to load dataset"}

    results = {}

    # Benchmark each model type separately
    for model_type in ["fp32", "int8", "int4"]:
        print(f"\n=== Benchmarking {model_type} model on TallyQA ===")
        try:
            # Clear memory
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

            # Load model
            model, processor = load_model(model_type)
            results[model_type] = benchmark_model_tallyqa(model, processor, structured_data, model_type)

            # Print immediate results
            print(f"\nResults for {model_type}:")
            print(f"Accuracy: {results[model_type]['accuracy']:.4f}")
            print(f"Simple Questions Accuracy: {results[model_type]['simple_accuracy']:.4f}")
            print(f"Complex Questions Accuracy: {results[model_type]['complex_accuracy']:.4f}")
            print(f"Average Inference Time: {results[model_type]['avg_inference_time']:.4f}s")
            print(f"Memory Usage: {results[model_type]['memory_gb']:.2f} GB")

            # Clean up
            del model
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

        except Exception as e:
            print(f"Error benchmarking {model_type} model: {e}")
            traceback.print_exc()
            results[model_type] = {"error": str(e)}

    # Print comparison summary
    print("\n=== TallyQA Benchmark Summary ===")
    print("Model | Overall Acc | Simple Acc | Complex Acc | Avg Time (s) | Memory (GB)")
    print("------|-------------|------------|-------------|--------------|------------")
    for model_type in ["fp32", "int8", "int4"]:
        if model_type in results and "error" not in results[model_type]:
            print(f"{model_type} | {results[model_type]['accuracy']:.4f} | "
                  f"{results[model_type]['simple_accuracy']:.4f} | "
                  f"{results[model_type]['complex_accuracy']:.4f} | "
                  f"{results[model_type]['avg_inference_time']:.4f} | "
                  f"{results[model_type]['memory_gb']:.2f}")

    # Performance comparisons
    if all(model_type in results and "error" not in results[model_type] for model_type in ["fp32", "int8", "int4"]):
        print("\n=== Performance Comparison ===")

        # Avoid division by zero
        if results["fp32"]["accuracy"] > 0:
            print(f"INT8/FP32 accuracy ratio: {results['int8']['accuracy']/results['fp32']['accuracy']:.4f}x")
            print(f"INT4/FP32 accuracy ratio: {results['int4']['accuracy']/results['fp32']['accuracy']:.4f}x")

        if results["int8"]["avg_inference_time"] > 0 and results["int4"]["avg_inference_time"] > 0:
            print(f"INT8/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int8']['avg_inference_time']:.2f}x")
            print(f"INT4/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int4']['avg_inference_time']:.2f}x")

        print(f"INT8/FP32 memory ratio: {results['int8']['memory_gb']/results['fp32']['memory_gb']:.2f}x")
        print(f"INT4/FP32 memory ratio: {results['int4']['memory_gb']/results['fp32']['memory_gb']:.2f}x")

    return results

run_tallyqa_benchmarks()


Loading TallyQA dataset...
Loading TallyQA dataset from vikhyatk/tallyqa-test...


Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Loaded dataset with 26451 items
Prepared 3000 QA pairs for benchmarking

=== Benchmarking fp32 model on TallyQA ===
Loading PaliGemma model with fp32 precision...


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/862M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Running benchmark for fp32 model on cuda:0


100%|██████████| 3000/3000 [04:24<00:00, 11.35it/s]



Results for fp32:
Accuracy: 0.8738
Simple Questions Accuracy: 0.8738
Complex Questions Accuracy: 0.0000
Average Inference Time: 0.0814s
Memory Usage: 1.36 GB

=== Benchmarking int8 model on TallyQA ===
Loading PaliGemma model with int8 precision...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running benchmark for int8 model on cuda:0


100%|██████████| 3000/3000 [16:14<00:00,  3.08it/s]



Results for int8:
Accuracy: 0.8735
Simple Questions Accuracy: 0.8735
Complex Questions Accuracy: 0.0000
Average Inference Time: 0.3168s
Memory Usage: 0.34 GB

=== Benchmarking int4 model on TallyQA ===
Loading PaliGemma model with int4 precision...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running benchmark for int4 model on cuda:0


100%|██████████| 3000/3000 [09:11<00:00,  5.44it/s]



Results for int4:
Accuracy: 0.8712
Simple Questions Accuracy: 0.8712
Complex Questions Accuracy: 0.0000
Average Inference Time: 0.1762s
Memory Usage: 0.40 GB

=== TallyQA Benchmark Summary ===
Model | Overall Acc | Simple Acc | Complex Acc | Avg Time (s) | Memory (GB)
------|-------------|------------|-------------|--------------|------------
fp32 | 0.8738 | 0.8738 | 0.0000 | 0.0814 | 1.36
int8 | 0.8735 | 0.8735 | 0.0000 | 0.3168 | 0.34
int4 | 0.8712 | 0.8712 | 0.0000 | 0.1762 | 0.40

=== Performance Comparison ===
INT8/FP32 accuracy ratio: 0.9996x
INT4/FP32 accuracy ratio: 0.9969x
INT8/FP32 speed ratio: 0.26x
INT4/FP32 speed ratio: 0.46x
INT8/FP32 memory ratio: 0.25x
INT4/FP32 memory ratio: 0.30x


{'fp32': {'accuracy': 0.8738333333333334,
  'simple_accuracy': 0.8738333333333334,
  'complex_accuracy': 0,
  'avg_inference_time': 0.08140486439069113,
  'total_time': 264.2702875137329,
  'memory_gb': 1.3613451644778252,
  'examples': [{'question': 'How many people are there?',
    'model_answer': '2',
    'ground_truth': '2',
    'score': 1.0,
    'inference_time': 0.09145045280456543},
   {'question': 'How many cars are parked?',
    'model_answer': '1',
    'ground_truth': '2',
    'score': 0.5,
    'inference_time': 0.0854034423828125},
   {'question': 'How many outlets are in the wall?',
    'model_answer': '3',
    'ground_truth': '4',
    'score': 0.5,
    'inference_time': 0.07995080947875977},
   {'question': 'How many windows are there?',
    'model_answer': '1',
    'ground_truth': '1',
    'score': 1.0,
    'inference_time': 0.08088564872741699},
   {'question': 'How many apples are on the table?',
    'model_answer': '1',
    'ground_truth': '1',
    'score': 1.0,
    'i

## DocVQA validation evals

In [None]:
!pip install editdistance datasets -q
!pip install datasets --upgrade



In [None]:
import torch
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration, BitsAndBytesConfig
from datasets import load_dataset
from PIL import Image
import time
from tqdm import tqdm
import gc
import numpy as np
import re
import traceback
import editdistance


def load_docvqa_dataset():
    print("Loading DocVQA validation dataset from vikhyatk/docvqa-val...")

    try:
        # Load the DocVQA validation dataset
        dataset = load_dataset("vikhyatk/docvqa-val")['validation']
        print(f"Loaded dataset with {len(dataset)} items")

        # Debug: Print first item structure
        first_item = dataset[0]
        print("First item keys:", list(first_item.keys()))
        if 'image' in first_item:
            if isinstance(first_item['image'], np.ndarray):
                print(f"Image type: numpy array with shape {first_item['image'].shape} and dtype {first_item['image'].dtype}")
            else:
                print(f"Image type: {type(first_item['image'])}")

        # Create structured data for benchmarking
        structured_data = []

        # Use a smaller sample for quicker benchmarking
        sample_size = min(300, len(dataset))

        for i in range(sample_size):
            try:
                item = dataset[i]

                # Debug image information for a few samples
                if i < 3 and 'image' in item:
                    img = item['image']
                    if isinstance(img, np.ndarray):
                        print(f"Item {i} image: shape={img.shape}, dtype={img.dtype}")
                    else:
                        print(f"Item {i} image type: {type(img)}")

                # Check for different dataset structures
                if 'image' in item and 'prompt' in item and 'answer' in item:
                    structured_data.append({
                        "image": item['image'],
                        "question": item['prompt'],
                        "answers": [item['answer']],
                        "question_id": len(structured_data)
                    })
                elif 'image' in item and 'qa' in item:
                    for qa in item['qa']:
                        if 'question' in qa and 'answers' in qa:
                            structured_data.append({
                                "image": item['image'],
                                "question": f"<image>{qa['question']}",
                                "answers": qa['answers'],
                                "question_id": len(structured_data)
                            })

            except Exception as e:
                print(f"Error processing dataset item {i}: {e}")
                continue

        print(f"Prepared {len(structured_data)} QA pairs for benchmarking")

        if len(structured_data) > 0:
            return structured_data
        else:
            print("No valid data found in the dataset")
            return None

    except Exception as e:
        print(f"Error loading DocVQA dataset: {e}")
        traceback.print_exc()
        return None


def load_model(model_type):
    """
    Load PaliGemma model with specified quantization using BitsAndBytesConfig
    """
    print(f"Loading PaliGemma model with {model_type} precision...")

    # Base model ID
    model_id = "google/paligemma-3b-mix-224"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Configure based on quantization type
    if model_type == "fp32" or model_type == "bfloat16":
        # Use bfloat16 as recommended by the authors for best performance
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map=device,
            revision="bfloat16"  # Using the bfloat16 specific revision
        ).eval()
    elif model_type == "int8":
        # Load with 8-bit quantization
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=quantization_config,
            device_map=device,
            torch_dtype=torch.bfloat16  # Still use bfloat16 as base dtype
        ).eval()
    elif model_type == "int4":
        # Load with 4-bit quantization
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation
        )
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_id,
            quantization_config=quantization_config,
            device_map=device
        ).eval()
    else:
        raise ValueError(f"Unknown model type: {model_type}")

    # Load processor
    processor = AutoProcessor.from_pretrained(model_id)

    return model, processor


def evaluate_docvqa_answer(model_answer, ground_truths):
    """
    Evaluate DocVQA responses using ANLS (Average Normalized Levenshtein Similarity)
    ANLS gives partial credit based on string similarity, with a threshold of 0.5
    """
    if not isinstance(ground_truths, list):
        ground_truths = [ground_truths]

    model_answer = model_answer.lower().strip()
    anls_scores = []

    for gt in ground_truths:
        gt = str(gt).lower().strip()

        # Calculate normalized Levenshtein similarity (1 - normalized edit distance)
        edit_dist = editdistance.eval(model_answer, gt)
        max_len = max(len(model_answer), len(gt))
        if max_len == 0:  # Handle empty strings
            iou = 1.0 if model_answer == gt else 0.0
        else:
            iou = 1.0 - (edit_dist / max_len)

        # ANLS threshold: only count similarity if it's at least 0.5
        anls = iou if iou >= 0.5 else 0.0
        anls_scores.append(anls)

    # Take the max ANLS across all ground truth answers
    return max(anls_scores) if anls_scores else 0.0


def benchmark_model_docvqa(model, processor, dataset, model_type):
    device = next(model.parameters()).device
    print(f"Running benchmark for {model_type} model on {device}")

    start_time = time.time()
    scores = []
    inference_times = []
    example_outputs = []

    # Make sure we have a dataset before proceeding
    if dataset is None or len(dataset) == 0:
        print("Empty dataset, skipping benchmark")
        return {
            "accuracy": 0.0,
            "avg_inference_time": 0.0,
            "total_time": 0.0,
            "memory_gb": 0.0,
            "examples": []
        }

    # Use tqdm for progress tracking
    for i, example in enumerate(tqdm(dataset[:300])):  # Limit to 300 for faster results
        image = example["image"]
        question = example["question"]
        answers = example["answers"]

        # Convert image to RGB if it's grayscale
        try:
            # Convert numpy array to PIL Image if needed
            if isinstance(image, np.ndarray):
                if image.ndim == 2:
                    # This is a grayscale image, convert to RGB
                    image_pil = Image.fromarray(image).convert('RGB')
                elif image.ndim == 3:
                    # Already has channels, but ensure it's RGB
                    image_pil = Image.fromarray(image).convert('RGB')
                else:
                    print(f"Skipping example {i}: Unusual image dimensions: {image.ndim}")
                    continue
            elif hasattr(image, 'convert'):  # Already a PIL Image
                image_pil = image.convert('RGB')
            else:
                print(f"Skipping example {i}: Unknown image type: {type(image)}")
                continue

            # The prompt already has <image> at the start
            prompt = question
            if not prompt.startswith("<image>"):
                prompt = f"<image>{prompt}"

            # Process inputs
            model_inputs = processor(text=prompt, images=image_pil, return_tensors="pt")
            model_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in model_inputs.items()}
            input_len = model_inputs["input_ids"].shape[-1]

            # Measure inference time
            torch.cuda.synchronize() if torch.cuda.is_available() else None
            inf_start = time.time()

            with torch.inference_mode():
                generation = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
                generation = generation[0][input_len:]
                model_answer = processor.decode(generation, skip_special_tokens=True)

            torch.cuda.synchronize() if torch.cuda.is_available() else None
            inference_time = time.time() - inf_start
            inference_times.append(inference_time)

            # Evaluate document VQA accuracy
            score = evaluate_docvqa_answer(model_answer, answers)
            scores.append(score)

            # Save examples for later inspection
            if i < 5:
                example_outputs.append({
                    "question": question,
                    "model_answer": model_answer,
                    "ground_truth": answers[0] if isinstance(answers, list) else answers,
                    "score": score,
                    "inference_time": inference_time
                })
        except Exception as e:
            print(f"Error processing example {i}: {e}")
            traceback.print_exc()
            continue

    # Calculate metrics
    avg_score = sum(scores) / len(scores) if scores else 0
    avg_time = sum(inference_times) / len(inference_times) if inference_times else 0
    total_time = time.time() - start_time

    # Calculate memory usage
    memory_gb = sum(p.numel() * (2 if model_type=="int4" else (1 if model_type=="int8" else 4))
                    for p in model.parameters()) / (8 * 1024 * 1024 * 1024)

    return {
        "accuracy": avg_score,
        "avg_inference_time": avg_time,
        "total_time": total_time,
        "memory_gb": memory_gb,
        "examples": example_outputs
    }


def run_docvqa_benchmarks():
    # Load DocVQA dataset
    print("Loading DocVQA dataset...")
    structured_data = load_docvqa_dataset()

    # Check if we actually got data
    if structured_data is None or len(structured_data) == 0:
        print("Failed to load dataset. Cannot proceed with benchmarking.")
        return {"error": "Failed to load dataset"}

    results = {}

    # Benchmark each model type separately
    for model_type in ["fp32", "int8", "int4"]:
        print(f"\n=== Benchmarking {model_type} model on DocVQA ===")
        try:
            # Clear memory
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

            # Load model
            model, processor = load_model(model_type)
            results[model_type] = benchmark_model_docvqa(model, processor, structured_data, model_type)

            # Print immediate results
            print(f"\nResults for {model_type}:")
            print(f"Accuracy: {results[model_type]['accuracy']:.4f}")
            print(f"Average Inference Time: {results[model_type]['avg_inference_time']:.4f}s")
            print(f"Memory Usage: {results[model_type]['memory_gb']:.2f} GB")

            # Clean up
            del model
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

        except Exception as e:
            print(f"Error benchmarking {model_type} model: {e}")
            traceback.print_exc()
            results[model_type] = {"error": str(e)}

    # Print comparison summary
    print("\n=== DocVQA Benchmark Summary ===")
    print("Model | Accuracy | Avg Time (s) | Memory (GB)")
    print("------|----------|--------------|------------")
    for model_type in ["fp32", "int8", "int4"]:
        if model_type in results and "error" not in results[model_type]:
            print(f"{model_type} | {results[model_type]['accuracy']:.4f} | "
                  f"{results[model_type]['avg_inference_time']:.4f} | "
                  f"{results[model_type]['memory_gb']:.2f}")

    # Performance comparisons
    if all(model_type in results and "error" not in results[model_type] for model_type in ["fp32", "int8", "int4"]):
        print("\n=== Performance Comparison ===")

        # Avoid division by zero
        if results["fp32"]["accuracy"] > 0:
            print(f"INT8/FP32 accuracy ratio: {results['int8']['accuracy']/results['fp32']['accuracy']:.4f}x")
            print(f"INT4/FP32 accuracy ratio: {results['int4']['accuracy']/results['fp32']['accuracy']:.4f}x")

        if results["int8"]["avg_inference_time"] > 0 and results["int4"]["avg_inference_time"] > 0:
            print(f"INT8/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int8']['avg_inference_time']:.2f}x")
            print(f"INT4/FP32 speed ratio: {results['fp32']['avg_inference_time']/results['int4']['avg_inference_time']:.2f}x")

        print(f"INT8/FP32 memory ratio: {results['int8']['memory_gb']/results['fp32']['memory_gb']:.2f}x")
        print(f"INT4/FP32 memory ratio: {results['int4']['memory_gb']/results['fp32']['memory_gb']:.2f}x")

    return results

run_docvqa_benchmarks()

Loading DocVQA dataset...
Loading DocVQA validation dataset from vikhyatk/docvqa-val...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/404 [00:00<?, ?B/s]

validation-00000-of-00002.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

validation-00001-of-00002.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1286 [00:00<?, ? examples/s]

Loaded dataset with 1286 items
First item keys: ['image', 'qa']
Image type: <class 'PIL.PngImagePlugin.PngImageFile'>
Item 0 image type: <class 'PIL.PngImagePlugin.PngImageFile'>
Item 1 image type: <class 'PIL.PngImagePlugin.PngImageFile'>
Item 2 image type: <class 'PIL.PngImagePlugin.PngImageFile'>


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-14a084d3509f>", line 338, in <cell line: 0>
    run_docvqa_benchmarks()
  File "<ipython-input-3-14a084d3509f>", line 273, in run_docvqa_benchmarks
    structured_data = load_docvqa_dataset()
                      ^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-3-14a084d3509f>", line 39, in load_docvqa_dataset
    item = dataset[i]
           ~~~~~~~^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_dataset.py", line 2782, in __getitem__
    return self._getitem(key)
           ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_dataset.py", line 2767, in _getitem
    formatted_output = format_table(
                       ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/datasets/formatting/formatting.py", line 658

TypeError: object of type 'NoneType' has no len()