# Multi-VLM Image Captioning Framework
## Comparing 7 Vision-Language Models on COCO Dataset
This notebook compares lightweight vision-language models (VLMs) sized to fit on an RTX 5080 (16 GB VRAM). Only Qwen2-VL-2B-Instruct is implemented and run in the cells below.

In [1]:
# Only needed if running on Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

# Install required packages for vision-language models
!pip install transformers torch torchvision Pillow datasets pycocotools
!pip install -q accelerate bitsandbytes qwen-vl-utils

Collecting torchvision
  Using cached torchvision-0.24.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Collecting datasets
  Using cached datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Using cached pyarrow-22.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.6.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Using cached multiprocess-0.70.18-py310-none-any.whl.metadata (7.5 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Using cached aiohttp-3.13.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (8.1 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.

## Setup and Configuration

In [None]:
from pycocotools.coco import COCO
import requests
import os
from PIL import Image
import json
import pandas as pd
from tqdm import tqdm
import time
import torch
import gc
from datetime import datetime

# Configuration
# Directory holding the COCO annotation JSON files. Set the
# COCO_ANNOTATIONS_PATH environment variable to run on another machine;
# the default is the original local path, so existing runs are unaffected.
ANNOTATIONS_PATH = os.environ.get(
    'COCO_ANNOTATIONS_PATH',
    '/home/vortex/CSE 468 AFE/Project/annotations',
)
IMAGES_DIR = 'coco_images'  # local cache for downloaded COCO images
RESULTS_DIR = 'results'     # checkpoints and final CSV/JSON outputs
NUM_IMAGES = 1000  # Processing 1000 images

# Create necessary directories
os.makedirs(IMAGES_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"Setup complete. Using annotations from: {ANNOTATIONS_PATH}")
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

## Load COCO Dataset

In [3]:
# Load COCO annotations from existing local copy
coco = COCO(os.path.join(ANNOTATIONS_PATH, 'captions_val2017.json'))

# Draw a reproducible random sample of NUM_IMAGES image IDs for evaluation
import random
random.seed(42)
all_img_ids = coco.getImgIds()
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population contains.
selected_img_ids = random.sample(all_img_ids, min(NUM_IMAGES, len(all_img_ids)))

print(f"Selected {len(selected_img_ids)} images for processing")

# Download images if they don't exist
missing_count = 0
for img_id in tqdm(selected_img_ids, desc="Checking images"):
    img_info = coco.loadImgs(img_id)[0]
    img_path = os.path.join(IMAGES_DIR, img_info['file_name'])

    if os.path.exists(img_path):
        continue

    # Write to a temporary name and rename on success, so an interrupted or
    # failed download never leaves a truncated file that later passes the
    # "already exists" check above.
    tmp_path = img_path + '.part'
    try:
        response = requests.get(img_info['coco_url'], timeout=30)
        # Without this, an HTTP error page would be saved as if it were a JPEG.
        response.raise_for_status()
        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        os.replace(tmp_path, img_path)
    except Exception as e:
        print(f"Failed to download {img_info['file_name']}: {e}")
        missing_count += 1
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

total_selected = len(selected_img_ids)
print(f"Downloaded {total_selected - missing_count}/{total_selected} images")

loading annotations into memory...
Done (t=0.09s)
creating index...
index created!
Selected 1000 images for processing


Checking images: 100%|██████████| 1000/1000 [42:53<00:00,  2.57s/it] 

Downloaded 1000/1000 images





## Image Patch Extraction (Optional - Not used with modern VLMs)

In [4]:
# Modern VLMs handle image preprocessing internally, so we don't need manual patch extraction
# Keeping this function for reference in case needed for future experiments

# def extract_patches(image, patch_size=224, stride=112):
#     """Extract overlapping square patches from a PIL image.
#
#     Slides a patch_size x patch_size window over the image in steps of
#     `stride` pixels and returns the crops as a list. Only windows that fit
#     entirely inside the image are produced, so an image smaller than
#     patch_size in either dimension yields an empty list.
#     """
#     patches = []
#     width, height = image.size
#     for y in range(0, height - patch_size + 1, stride):
#         for x in range(0, width - patch_size + 1, stride):
#             patch = image.crop((x, y, x + patch_size, y + patch_size))
#             patches.append(patch)
#     return patches

# Test on first image
# test_img = Image.open(os.path.join(IMAGES_DIR, os.listdir(IMAGES_DIR)[0]))
# patches = extract_patches(test_img)
# print(f"Extracted {len(patches)} patches from test image")

# The only live statement in this cell: tells the reader the code above is disabled.
print("Patch extraction code is available but commented out since models handle resizing internally")

Patch extraction code is available but commented out since models handle resizing internally


## VLM Model 1: Qwen2-VL-2B-Instruct

In [5]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

class QwenVLM:
    """Wrapper around Qwen2-VL-2B-Instruct for single-image captioning.

    Loads the processor and the model once in ``__init__`` and exposes
    ``generate_caption`` for inference and ``unload`` to release the model.

    Attributes:
        model_name: Hugging Face model identifier that is loaded.
        display_name: Short name used in log messages.
        vram_estimate: Human-readable VRAM estimate (informational only).
        processor: Processor returned by ``AutoProcessor.from_pretrained``;
            ``None`` after ``unload``.
        model: The loaded generation model; ``None`` after ``unload``.
    """

    def __init__(self):
        """Load the processor and the fp16 model with ``device_map="auto"``."""
        self.model_name = "Qwen/Qwen2-VL-2B-Instruct"
        self.display_name = "Qwen2-VL-2B"
        self.vram_estimate = "5-6 GB"

        print(f"Loading {self.display_name}...")
        self.processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True)
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            self.model_name,
            dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        print(f"Loaded {self.display_name}")

    def generate_caption(self, image, max_tokens=128):
        """Generate a short caption for a single image.

        Args:
            image: The image to describe; it is passed through to the chat
                message and to ``process_vision_info``.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The generated caption with surrounding whitespace stripped. On
            any exception a string of the form ``"Error: <message>"`` is
            returned instead of raising (message truncated to 100 chars);
            callers detect failures via the ``"Error"`` prefix.
        """
        try:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image briefly."}
                ]
            }]

            text = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            image_inputs, video_inputs = process_vision_info(messages)

            # Send the inputs to wherever the model actually lives rather than
            # guessing 'cuda'/'cpu' independently of device_map placement.
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            ).to(self.model.device)

            with torch.no_grad():
                outputs = self.model.generate(**inputs, max_new_tokens=max_tokens)

            # generate() returns prompt + completion. Drop the prompt tokens
            # before decoding instead of splitting the decoded text on the
            # word "assistant", which truncated any caption containing it.
            prompt_len = inputs["input_ids"].shape[1]
            new_tokens = outputs[:, prompt_len:]
            caption = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

            return caption.strip()
        except Exception as e:
            return f"Error: {str(e)[:100]}"

    def unload(self):
        """Release the model and processor. Safe to call more than once."""
        # Drop the references first, then collect, then empty the CUDA cache:
        # the cache can only return memory whose tensors are already freed.
        self.model = None
        self.processor = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Load model
print("Initializing Qwen model - this may take 2-3 minutes on first run...")
qwen_model = QwenVLM()
print(f"Estimated VRAM usage: {qwen_model.vram_estimate}")

# Derive the message from NUM_IMAGES instead of hard-coding 1000, so it stays
# correct when the sample size changes (output is identical for 1000 images).
est_low_min = NUM_IMAGES * 2 / 60
est_high_min = NUM_IMAGES * 3 / 60
print(f"Ready to process {NUM_IMAGES} images (~2-3 seconds per image = {est_low_min:.0f}-{est_high_min:.0f} minutes total)")

Fetching 2 files: 100%|██████████| 2/2 [10:27<00:00, 313.58s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s]


Loaded Qwen2-VL-2B
Estimated VRAM usage: 5-6 GB


## Multi-Model Processing Pipeline

In [6]:
# Define all models with their classes and parameters
MODELS_CONFIG = [
    {"name": "Qwen2-VL-2B", "enabled": True},
]

# Get list of images to process.
# os.listdir returns entries in arbitrary order, so sort before truncating;
# otherwise the subset kept by [:NUM_IMAGES] can differ between runs whenever
# the directory holds more than NUM_IMAGES images.
image_files = sorted(f for f in os.listdir(IMAGES_DIR) if f.endswith('.jpg'))
image_files = image_files[:NUM_IMAGES]

print(f"Ready to process {len(image_files)} images")
print(f"Expected time per image: 2-3 seconds with Qwen")
print(f"Total estimated time: {len(image_files) * 2.5 / 60:.1f} minutes")

Ready to process 1000 images
Expected time per image: 2-3 seconds with Qwen
Total estimated time: 41.7 minutes


In [7]:
import warnings
warnings.filterwarnings('ignore')

# Caption every file in image_files with the Qwen model, checkpointing
# periodically, then write per-model and combined CSV results.
all_results = []

model_name = "Qwen2-VL-2B"
total_images = len(image_files)
CHECKPOINT_EVERY = 100  # save a partial CSV after this many images

print(f"\n{'='*80}")
print(f"Processing {total_images} images with: {model_name}")
print(f"{'='*80}")
print(f"Estimated time: {total_images * 2 / 60:.0f}-{total_images * 3 / 60:.0f} minutes")
print(f"Checkpoints will be saved every {CHECKPOINT_EVERY} images")
print(f"{'='*80}\n")

model_results = []
start_time = time.time()

for idx, img_file in enumerate(tqdm(image_files, desc=f"Processing with {model_name}")):
    try:
        img_path = os.path.join(IMAGES_DIR, img_file)
        # Close the file handle promptly; convert() returns an independent copy.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        img_id = os.path.splitext(img_file)[0]
        img_size = image.size

        start_inference = time.time()
        caption = qwen_model.generate_caption(image, max_tokens=128)
        inference_time = time.time() - start_inference

        result = {
            'image_id': img_id,
            'model_name': model_name,
            'caption': caption,
            'processing_time_sec': round(inference_time, 2),
            'image_width': img_size[0],
            'image_height': img_size[1],
            'timestamp': datetime.now().isoformat()
        }

        model_results.append(result)
        all_results.append(result)

        # Periodic checkpoint so a crash does not lose the whole run
        if (idx + 1) % CHECKPOINT_EVERY == 0:
            checkpoint_df = pd.DataFrame(model_results)
            checkpoint_path = os.path.join(RESULTS_DIR, f"checkpoint_{model_name}_{idx+1}.csv")
            checkpoint_df.to_csv(checkpoint_path, index=False)
            elapsed = (time.time() - start_time) / 60          # minutes so far
            avg_time = elapsed / (idx + 1)                      # minutes per image
            # Already in minutes: the earlier extra "/ 60" reported hours as minutes.
            remaining = avg_time * (total_images - idx - 1)
            tqdm.write(f"✓ Checkpoint {idx+1}/{total_images}: {elapsed:.1f}m elapsed, ~{remaining:.0f}m remaining")

    except Exception as e:
        # Best effort: report the failure and move on to the next image.
        tqdm.write(f"Error processing {img_file}: {str(e)[:50]}")
        continue

model_df = pd.DataFrame(model_results)
result_path = os.path.join(RESULTS_DIR, f"results_{model_name}.csv")
model_df.to_csv(result_path, index=False)

elapsed_time = (time.time() - start_time) / 60
processed = len(model_results)
# generate_caption reports failures as strings starting with "Error".
successful = sum(1 for r in model_results if not r['caption'].startswith('Error'))
# Guard the ratios so a run in which nothing was processed does not raise
# ZeroDivisionError before the model is unloaded.
success_pct = successful / processed * 100 if processed else 0.0
avg_seconds = elapsed_time * 60 / processed if processed else 0.0

print(f"\n{'='*80}")
print(f"COMPLETED {model_name}")
print(f"{'='*80}")
print(f"Processed: {processed}/{total_images} images")
print(f"Successful: {successful}/{processed} ({success_pct:.1f}%)")
print(f"Total time: {elapsed_time:.1f} minutes ({elapsed_time/60:.1f} hours)")
print(f"Avg per image: {avg_seconds:.2f}s")
print(f"Results saved to: {result_path}")
print(f"{'='*80}")

qwen_model.unload()

if all_results:
    combined_df = pd.DataFrame(all_results)
    combined_path = os.path.join(RESULTS_DIR, 'all_models_comparison.csv')
    combined_df.to_csv(combined_path, index=False)

    print(f"\nCombined results saved to: {combined_path}")
    print(f"Total results in database: {len(all_results)}")


Processing with: Qwen2-VL-2B


Processing with Qwen2-VL-2B:  15%|█▌        | 150/1000 [13:48<1:14:31,  5.26s/it]

Checkpoint 150/1000: 13.8m elapsed


Processing with Qwen2-VL-2B:  20%|██        | 200/1000 [18:07<1:06:59,  5.02s/it]

Checkpoint 200/1000: 18.1m elapsed


Processing with Qwen2-VL-2B:  25%|██▌       | 250/1000 [22:36<1:13:34,  5.89s/it]

Checkpoint 250/1000: 22.6m elapsed


Processing with Qwen2-VL-2B:  30%|███       | 300/1000 [27:02<1:02:42,  5.37s/it]

Checkpoint 300/1000: 27.0m elapsed


Processing with Qwen2-VL-2B:  35%|███▌      | 350/1000 [31:21<1:00:22,  5.57s/it]

Checkpoint 350/1000: 31.4m elapsed


Processing with Qwen2-VL-2B:  40%|████      | 400/1000 [35:29<55:09,  5.52s/it]  

Checkpoint 400/1000: 35.5m elapsed


Processing with Qwen2-VL-2B:  45%|████▌     | 450/1000 [39:53<44:00,  4.80s/it]

Checkpoint 450/1000: 39.9m elapsed


Processing with Qwen2-VL-2B:  50%|█████     | 500/1000 [44:14<43:38,  5.24s/it]

Checkpoint 500/1000: 44.2m elapsed


Processing with Qwen2-VL-2B:  55%|█████▌    | 550/1000 [48:18<31:25,  4.19s/it]

Checkpoint 550/1000: 48.3m elapsed


Processing with Qwen2-VL-2B:  60%|██████    | 600/1000 [52:44<37:20,  5.60s/it]

Checkpoint 600/1000: 52.7m elapsed


Processing with Qwen2-VL-2B:  65%|██████▌   | 650/1000 [57:06<26:11,  4.49s/it]

Checkpoint 650/1000: 57.1m elapsed


Processing with Qwen2-VL-2B:  70%|███████   | 700/1000 [1:01:30<27:12,  5.44s/it]

Checkpoint 700/1000: 61.5m elapsed


Processing with Qwen2-VL-2B:  75%|███████▌  | 750/1000 [1:05:43<20:00,  4.80s/it]

Checkpoint 750/1000: 65.7m elapsed


Processing with Qwen2-VL-2B:  80%|████████  | 800/1000 [1:09:49<14:54,  4.47s/it]

Checkpoint 800/1000: 69.8m elapsed


Processing with Qwen2-VL-2B:  85%|████████▌ | 850/1000 [1:14:07<14:23,  5.76s/it]

Checkpoint 850/1000: 74.1m elapsed


Processing with Qwen2-VL-2B:  90%|█████████ | 900/1000 [1:18:21<08:24,  5.05s/it]

Checkpoint 900/1000: 78.4m elapsed


Processing with Qwen2-VL-2B:  95%|█████████▌| 950/1000 [1:22:41<04:08,  4.97s/it]

Checkpoint 950/1000: 82.7m elapsed


Processing with Qwen2-VL-2B: 100%|██████████| 1000/1000 [1:26:55<00:00,  5.22s/it]

Checkpoint 1000/1000: 86.9m elapsed

Completed Qwen2-VL-2B:
  Processed: 1000/1000 images
  Successful: 1000/1000
  Time: 86.9 minutes
  Avg per image: 5.2s
  Saved to: results/results_Qwen2-VL-2B.csv

PROCESSING COMPLETE
Total results: 1000
Saved to: results/all_models_comparison.csv





## Results Analysis

In [8]:
# Summarise the combined results CSV per model and export a JSON copy.
combined_path = os.path.join(RESULTS_DIR, 'all_models_comparison.csv')
if os.path.exists(combined_path):
    results_df = pd.read_csv(combined_path)

    print(f"\n{'='*80}")
    print("RESULTS SUMMARY")
    print(f"{'='*80}")

    for model_name in results_df['model_name'].unique():
        model_data = results_df[results_df['model_name'] == model_name]
        # An empty caption is read back from CSV as NaN (a float); normalise
        # to strings so the string operations below cannot raise on it.
        captions = model_data['caption'].fillna('').astype(str)
        caption_lengths = captions.str.len()
        # Failures are stored as captions starting with "Error".
        successful = int((~captions.str.startswith('Error')).sum())

        print(f"\n{model_name}:")
        print(f"  Total images: {len(model_data)}")
        print(f"  Successful: {successful}")
        print(f"  Avg caption length: {caption_lengths.mean():.0f} characters")
        print(f"  Caption length range: {caption_lengths.min()}-{caption_lengths.max()}")
        print(f"  Avg inference time: {model_data['processing_time_sec'].mean():.2f}s per image")

    # Also save as JSON for flexibility
    json_path = os.path.join(RESULTS_DIR, 'all_models_comparison.json')
    results_df.to_json(json_path, orient='records', indent=2)
    print(f"\n{'='*80}")
    print(f"Results exported to:")
    print(f"  CSV: {combined_path}")
    print(f"  JSON: {json_path}")
    print(f"{'='*80}")
else:
    print("No results file found.")


RESULTS SUMMARY

Qwen2-VL-2B:
  Total images: 1000
  Successful: 1000
  Avg caption length: 1082 characters
  Caption length range: 437-1424
  Avg inference time: 5.21s per image


## OLD CODE - Kept for Reference (Commented Out)

In [9]:
# ============================================
# Old Gemini API approach - NO LONGER USED
# ============================================
# Switched to using open-source models instead
# Benefits: No API costs, no rate limiting, full local control
#
# NOTE: if this approach is ever revived, read the API key from the
# environment instead of pasting it into the notebook.

# import os
# import google.generativeai as genai
#
# GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
# genai.configure(api_key=GEMINI_API_KEY)
#
# def generate_gemini_caption(image):
#     # Returns the response text, or an "Error: ..." string on failure.
#     try:
#         gemini_model = genai.GenerativeModel('models/gemini-2.5-pro')
#         response = gemini_model.generate_content([
#             "Describe this image in detail.",
#             image
#         ])
#         return response.text
#     except Exception as e:
#         return f"Error: {str(e)}"
#
# ============================================
# Old manual patch extraction - NOT NEEDED
# ============================================
# Modern VLMs handle this internally
#
# def extract_patches(image, patch_size=224, stride=112):
#     # Sliding-window crops of patch_size x patch_size, stepping by stride.
#     patches = []
#     width, height = image.size
#     for y in range(0, height - patch_size + 1, stride):
#         for x in range(0, width - patch_size + 1, stride):
#             patch = image.crop((x, y, x + patch_size, y + patch_size))
#             patches.append(patch)
#     return patches

# The only live statements in this cell: they tell the reader the code above is disabled.
print("Old code is kept here for reference only")
print("We are using open-source VLM models for better control and no API costs")

Old code is kept here for reference only
We are using open-source VLM models for better control and no API costs
