# Medical Image Captioning with Integrated Evaluation Metrics

This notebook generates captions for medical images from the ROCOv2-radiology dataset and evaluates them against original captions using standard NLP metrics (BLEU, METEOR, ROUGE).

## Features
- Loads images from ROCOv2-radiology dataset
- Generates captions using Qwen2-VL-2B-Instruct model
- Evaluates generated captions with multiple metrics
- Saves detailed results with metrics
- Provides aggregate statistics and summaries

## 1. Configuration Setup

Configure the parameters for image processing and evaluation.

In [1]:
import os
import gc
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
import warnings

from medical_image_metrics import MetricsAggregator

# Suppress every Python warning from here on (note: this also hides deprecation notices)
warnings.filterwarnings('ignore')

# Visible confirmation that the import cell ran to completion
print("Imports successful!")

  from .autonotebook import tqdm as notebook_tqdm


Imports successful!


In [None]:
class MedicalConfig:
    """Central configuration for the medical image captioning run.

    Every setting is a class attribute, so the class is used directly and
    never instantiated. The two filesystem locations can be overridden with
    the environment variables ``ROCO_DATASET_PATH`` and ``ROCO_RESULTS_DIR``;
    when those are unset the original defaults below are used.
    """

    # Dataset paths (environment overrides keep the notebook usable on other machines)
    DATASET_PATH = os.environ.get(
        'ROCO_DATASET_PATH',
        '/home/vortex/CSE 468 AFE/Datasets/ROCOv2-radiology'
    )
    RESULTS_DIR = os.environ.get(
        'ROCO_RESULTS_DIR',
        '/home/vortex/CSE 468 AFE/Project/results_medical'
    )

    # Processing settings
    NUM_IMAGES = 1000  # Change for full dataset (59962 train, 9904 val, 9927 test)
    SPLIT = 'train'    # 'train', 'validation', or 'test'
    VALID_SPLITS = ('train', 'validation', 'test')

    # Device and model settings
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

    # Model selection - optimized for medical imaging
    MODEL_NAME = 'Qwen/Qwen2-VL-2B-Instruct'  # Supports medical image understanding

    # Evaluation settings
    COMPUTE_BERT_SCORE = False  # Set to True for semantic similarity (slower)

    # GPU optimization settings
    ENABLE_GPU_CACHE = True      # Enable VRAM pre-loading of images
    BATCH_SIZE = 8               # Images per batch (4, 8, or 16)
    GPU_CACHE_VRAM_LIMIT = 9.0   # Safety limit in GB before stopping cache

    @classmethod
    def validate(cls):
        """Validate the configuration and create the results directory.

        Raises:
            ValueError: if SPLIT is not one of VALID_SPLITS, or NUM_IMAGES /
                BATCH_SIZE is not a positive integer.
            FileNotFoundError: if DATASET_PATH does not exist.
        """
        # Fail fast on settings that would otherwise only break deep inside the run
        if cls.SPLIT not in cls.VALID_SPLITS:
            raise ValueError(
                f"Invalid SPLIT {cls.SPLIT!r}; expected one of {cls.VALID_SPLITS}"
            )
        for setting in ('NUM_IMAGES', 'BATCH_SIZE'):
            value = getattr(cls, setting)
            if not isinstance(value, int) or value < 1:
                raise ValueError(f"{setting} must be a positive integer, got {value!r}")

        # Check dataset exists
        if not os.path.exists(cls.DATASET_PATH):
            raise FileNotFoundError(f"Dataset not found at {cls.DATASET_PATH}")

        # Create results directory
        os.makedirs(cls.RESULTS_DIR, exist_ok=True)

        # Print device info
        if torch.cuda.is_available():
            print(f"Device: {torch.cuda.get_device_name(0)}")
            total_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
            print(f"Total VRAM: {total_mem:.1f} GB")
        else:
            print("Device: CPU")
# Check the configuration up front, then echo the settings that drive this run
MedicalConfig.validate()

print("\nConfiguration:")
for label, value in (
    ("Dataset", MedicalConfig.DATASET_PATH),
    ("Split", MedicalConfig.SPLIT),
    ("Num Images", MedicalConfig.NUM_IMAGES),
    ("Model", MedicalConfig.MODEL_NAME),
    ("Results Dir", MedicalConfig.RESULTS_DIR),
    ("GPU Cache Enabled", MedicalConfig.ENABLE_GPU_CACHE),
    ("Batch Size", MedicalConfig.BATCH_SIZE),
):
    print(f"  {label}: {value}")

## 2. Model Initialization

Load the Qwen2-VL model for medical image understanding.

In [None]:
class MedicalVLMCaptioner:
    """Medical image captioning using Vision-Language Models.

    Wraps a Qwen2-VL processor/model pair and exposes single-image and
    batched caption generation. Generation failures are reported as caption
    strings starting with "Error:" rather than raised.
    """

    # Defaults used when a caller does not supply its own prompt / token budget
    DEFAULT_PROMPT = "Describe this medical image in detail."
    MAX_NEW_TOKENS = 256

    def __init__(self, model_name):
        """Load the processor and model identified by ``model_name``.

        Args:
            model_name: Hugging Face model identifier or local path.
        """
        print(f"\nLoading model: {model_name}")
        self.model_name = model_name
        self.device = MedicalConfig.DEVICE

        # Load processor and model
        self.processor = AutoProcessor.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # Use the correct model class for Qwen2-VL
        from transformers import Qwen2VLForConditionalGeneration
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            device_map='auto',
            dtype=MedicalConfig.DTYPE,
            trust_remote_code=True
        )
        self.model.eval()
        print(f"Model loaded successfully on {self.device}")

    def generate_caption(self, image, prompt=None, max_new_tokens=None):
        """
        Generate caption for a single medical image.

        Args:
            image: PIL Image object
            prompt: Instruction text sent with the image; defaults to DEFAULT_PROMPT
            max_new_tokens: Generation budget; defaults to MAX_NEW_TOKENS

        Returns:
            caption: Generated text caption, or "Error: <message>" on failure
            processing_time: Time taken in seconds
        """
        import time
        # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments
        start_time = time.perf_counter()
        prompt = self.DEFAULT_PROMPT if prompt is None else prompt
        max_new_tokens = self.MAX_NEW_TOKENS if max_new_tokens is None else max_new_tokens

        try:
            # Prepare input for medical image understanding
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]

            # Process text and image
            text = self.processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )

            # Get image info from messages
            image_inputs, video_inputs = process_vision_info(messages)

            # Pass the vision inputs through as returned, without re-wrapping the
            # image list in another list or discarding the video inputs
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors='pt'
            )

            # With device_map='auto' the model decides its own placement, so follow it
            inputs = inputs.to(self.model.device)

            # Greedy decoding keeps the captions deterministic
            with torch.no_grad():
                output_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # Decode output - get only the new tokens generated
            prompt_length = inputs['input_ids'].shape[1]
            generated_ids = output_ids[:, prompt_length:]
            response = self.processor.decode(
                generated_ids[0],
                skip_special_tokens=True
            )

            processing_time = time.perf_counter() - start_time
            return response.strip(), processing_time

        except Exception as e:
            print(f"Error generating caption: {e}")
            processing_time = time.perf_counter() - start_time
            return f"Error: {str(e)}", processing_time

    def generate_caption_batch(self, batch_inputs, max_new_tokens=None):
        """
        Generate captions for a batch of images (GPU pre-loaded).

        Args:
            batch_inputs: Dictionary with batched tensors already on GPU;
                must contain 'input_ids'
            max_new_tokens: Generation budget; defaults to MAX_NEW_TOKENS

        Returns:
            captions: List of generated caption strings ("Error: <message>"
                for every item if generation fails)
            proc_times: List of per-image processing times (batch time divided
                evenly; 0.0 for every item on failure)
        """
        import time
        start_time = time.perf_counter()
        max_new_tokens = self.MAX_NEW_TOKENS if max_new_tokens is None else max_new_tokens

        # Read the batch size once, outside the try, so the error path never has to
        # touch batch_inputs again
        batch_size = batch_inputs['input_ids'].shape[0]

        try:
            # Generate captions for entire batch
            with torch.no_grad():
                output_ids = self.model.generate(
                    **batch_inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False
                )

            # Strip the shared prompt prefix from every row.
            # NOTE(review): this presumes the cached prompts are left-padded to a
            # common length - confirm against the GPU cache implementation.
            prompt_length = batch_inputs['input_ids'].shape[1]
            captions = [
                self.processor.decode(
                    output_ids[i, prompt_length:],
                    skip_special_tokens=True
                ).strip()
                for i in range(batch_size)
            ]

            processing_time = time.perf_counter() - start_time
            per_image_time = processing_time / batch_size if batch_size > 0 else 0

            return captions, [per_image_time] * batch_size

        except Exception as e:
            print(f"Error in batch caption generation: {e}")
            return [f"Error: {str(e)}"] * batch_size, [0.0] * batch_size

    def cleanup(self):
        """Release the model and processor; safe to call more than once."""
        # Re-running the cleanup cell must not fail because the attributes are already gone
        for attr in ('model', 'processor'):
            if hasattr(self, attr):
                delattr(self, attr)
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("Model cleaned up")

# Instantiate the captioner; the constructor loads the processor and model
# weights named by MedicalConfig.MODEL_NAME
captioner = MedicalVLMCaptioner(MedicalConfig.MODEL_NAME)

## 3. Dataset Loading

Load the ROCOv2-radiology dataset.

In [4]:
class ROCOv2Processor:
    """Handle ROCOv2 dataset loading and processing."""

    def __init__(self, split=None):
        """Initialize dataset processor.

        Args:
            split: Name of the split to load; defaults to MedicalConfig.SPLIT.
        """
        self.dataset = None
        self.split = MedicalConfig.SPLIT if split is None else split

    def load_split(self):
        """Load the configured split from the dataset's parquet shards.

        Returns:
            The loaded dataset, which is also stored on ``self.dataset``.
        """
        print(f"\nLoading {self.split} split from ROCOv2...")
        # Shards are matched by the glob <DATASET_PATH>/data/<split>-*.parquet
        self.dataset = load_dataset(
            'parquet',
            data_files={
                self.split: os.path.join(
                    MedicalConfig.DATASET_PATH,
                    'data',
                    f'{self.split}-*.parquet'
                )
            }
        )[self.split]

        print(f"Loaded {len(self.dataset)} images from {self.split} split")
        return self.dataset

    def get_batch(self, num_images=None):
        """Return the first ``num_images`` samples of the loaded split.

        Args:
            num_images: Number of samples wanted; defaults to
                MedicalConfig.NUM_IMAGES. Clamped to the range
                [0, len(dataset)].

        Returns:
            A dataset view containing the selected samples.
        """
        # Load on demand so calling get_batch() first does not fail on len(None)
        if self.dataset is None:
            self.load_split()

        if num_images is None:
            num_images = MedicalConfig.NUM_IMAGES

        num_images = max(0, min(num_images, len(self.dataset)))
        return self.dataset.select(range(num_images))

# Load the configured split and take the first NUM_IMAGES samples as the working batch
processor = ROCOv2Processor()
dataset = processor.load_split()
batch = processor.get_batch()

# Quick sanity check on what was loaded
print("\nBatch info:")
print("  Total images in batch: {}".format(len(batch)))
print("  Sample keys: {}".format(list(batch[0].keys())))


Loading train split from ROCOv2...
Loaded 59962 images from train split

Batch info:
  Total images in batch: 1000
  Sample keys: ['image', 'image_id', 'caption', 'cui']


## 4. Caption Generation

Generate a caption for every image in the batch, periodically writing a checkpoint CSV of the results collected so far to the results directory.

# Initialize results list and metrics aggregator
results = []
metrics_aggregator = MetricsAggregator()

# Number of processed images between checkpoint CSV snapshots
CHECKPOINT_EVERY = 100


def _make_result(image_id, original_caption, generated_caption, proc_time, batch_size):
    """Build one results row so the batched and sequential paths share one schema."""
    return {
        'image_id': image_id,
        'original_caption': original_caption,
        'generated_caption': generated_caption,
        'processing_time_sec': proc_time,
        'model': MedicalConfig.MODEL_NAME,
        'timestamp': datetime.utcnow().isoformat(),
        'split': MedicalConfig.SPLIT,
        'batch_size': batch_size
    }


def _save_checkpoint(rows, images_done):
    """Write every row collected so far to checkpoint_<images_done>.csv."""
    checkpoint_file = os.path.join(
        MedicalConfig.RESULTS_DIR,
        f'checkpoint_{images_done}.csv'
    )
    pd.DataFrame(rows).to_csv(checkpoint_file, index=False)
    print(f"Saved checkpoint at {images_done} images")


# Initialize GPU cache if enabled
gpu_cache = None
num_cached = 0
if MedicalConfig.ENABLE_GPU_CACHE:
    from gpu_image_cache import GPUImageCache

    print("\n" + "=" * 100)
    print("GPU Image Cache Enabled - Pre-loading images to VRAM")
    print("=" * 100)

    gpu_cache = GPUImageCache(
        captioner.processor,
        MedicalConfig.DEVICE,
        MedicalConfig.DTYPE,
        vram_limit_gb=MedicalConfig.GPU_CACHE_VRAM_LIMIT
    )
    num_cached = gpu_cache.preprocess_and_cache(
        batch,
        max_images=MedicalConfig.NUM_IMAGES
    )

    print(f"\nSuccessfully cached {num_cached} images")
    vram_status = gpu_cache.get_vram_status()
    print(f"VRAM Status: {vram_status['current_gb']:.2f} GB / {vram_status['total_gb']:.2f} GB ({vram_status['percent_used']:.1f}%)")

print(f"\nProcessing {len(batch)} medical images...")
print("-" * 100)

# Images [0, cached_count) take the batched GPU path; anything beyond that is
# captioned one image at a time so nothing is silently dropped.
cached_count = 0
if gpu_cache is not None:
    # NOTE(review): assumes preprocess_and_cache() returns how many leading images
    # of `batch` it cached - confirm against gpu_image_cache. If the value is not
    # a usable count, fall back to treating the whole batch as cached.
    try:
        cached_count = max(0, min(int(num_cached), len(batch)))
    except (TypeError, ValueError):
        cached_count = len(batch)

    num_batches = (cached_count + MedicalConfig.BATCH_SIZE - 1) // MedicalConfig.BATCH_SIZE
    print(f"Processing in batches of {MedicalConfig.BATCH_SIZE}...")

    try:
        for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
            start_idx = batch_idx * MedicalConfig.BATCH_SIZE
            end_idx = min(start_idx + MedicalConfig.BATCH_SIZE, cached_count)
            try:
                indices = list(range(start_idx, end_idx))

                # Get batch from GPU cache
                batch_inputs = gpu_cache.get_batch(indices)

                # Generate captions for entire batch
                captions, proc_times = captioner.generate_caption_batch(batch_inputs)

                # Get metadata for batch
                batch_metadata = gpu_cache.get_metadata(indices)

                # Build all rows first so a failure part-way through records nothing
                batch_rows = [
                    _make_result(
                        batch_metadata[i]['image_id'],
                        batch_metadata[i]['original_caption'],
                        captions[i],
                        proc_times[i],
                        MedicalConfig.BATCH_SIZE
                    )
                    for i in range(len(indices))
                ]
                results.extend(batch_rows)

                # Checkpoint whenever this batch crosses a CHECKPOINT_EVERY boundary;
                # an exact `end_idx % 100 == 0` test is skipped for most batch sizes
                if end_idx // CHECKPOINT_EVERY > start_idx // CHECKPOINT_EVERY:
                    _save_checkpoint(results, end_idx)

            except Exception as e:
                print(f"Error processing batch {batch_idx}: {e}")
    finally:
        # Release the cached tensors even if the loop is interrupted
        gpu_cache.clear_cache()

if cached_count < len(batch):
    if gpu_cache is None:
        # Sequential processing (original behavior)
        print("Processing sequentially...")
    else:
        print(f"{len(batch) - cached_count} images were not cached; processing them sequentially...")

    for idx in tqdm(range(cached_count, len(batch)), desc="Generating captions"):
        try:
            sample = batch[idx]
            image = sample['image']
            image_id = sample['image_id']
            original_caption = sample.get('caption', '')

            # Generate caption
            caption, proc_time = captioner.generate_caption(image)

            # Record result
            results.append(_make_result(image_id, original_caption, caption, proc_time, 1))

            # Save checkpoint every CHECKPOINT_EVERY images
            if (idx + 1) % CHECKPOINT_EVERY == 0:
                _save_checkpoint(results, idx + 1)

        except Exception as e:
            print(f"Error processing image {idx}: {e}")

print(f"\nCaption generation complete!")
print(f"Total images processed: {len(results)}")

In [5]:
# Start from empty accumulators for a sequential captioning pass.
# NOTE(review): this rebinds `results`, so rows collected by any earlier cell are discarded.
results = []
metrics_aggregator = MetricsAggregator()

print(f"\nProcessing {len(batch)} medical images...")
print("-" * 100)

# Caption the images one at a time; a failure on one image is logged and the loop moves on
for idx, sample in enumerate(tqdm(batch, desc="Generating captions")):
    images_done = idx + 1
    try:
        image = sample['image']
        row = {
            'image_id': sample['image_id'],
            'original_caption': sample.get('caption', ''),
        }

        # Run the model, then complete the row with its output and run metadata
        caption, proc_time = captioner.generate_caption(image)
        row.update({
            'generated_caption': caption,
            'processing_time_sec': proc_time,
            'model': MedicalConfig.MODEL_NAME,
            'timestamp': datetime.utcnow().isoformat(),
            'split': MedicalConfig.SPLIT,
        })
        results.append(row)

        # Snapshot everything collected so far after each block of 100 images
        if images_done % 100 == 0:
            pd.DataFrame(results).to_csv(
                os.path.join(MedicalConfig.RESULTS_DIR, f'checkpoint_{images_done}.csv'),
                index=False
            )
            print(f"Saved checkpoint at {images_done} images")

    except Exception as e:
        print(f"Error processing image {idx}: {e}")

print(f"\nCaption generation complete!")
print(f"Total images processed: {len(results)}")


Processing 1000 medical images...
----------------------------------------------------------------------------------------------------


Generating captions:   0%|          | 0/1000 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating captions:   2%|▎         | 25/1000 [02:28<1:37:43,  6.01s/it]

Saved checkpoint at 25 images


Generating captions:   5%|▌         | 50/1000 [04:53<1:35:11,  6.01s/it]

Saved checkpoint at 50 images


Generating captions:   8%|▊         | 75/1000 [07:21<1:31:21,  5.93s/it]

Saved checkpoint at 75 images


Generating captions:  10%|█         | 100/1000 [09:49<1:28:46,  5.92s/it]

Saved checkpoint at 100 images


Generating captions:  12%|█▎        | 125/1000 [12:17<1:25:32,  5.87s/it]

Saved checkpoint at 125 images


Generating captions:  15%|█▌        | 150/1000 [14:36<1:22:23,  5.82s/it]

Saved checkpoint at 150 images


Generating captions:  18%|█▊        | 175/1000 [17:03<1:18:10,  5.69s/it]

Saved checkpoint at 175 images


Generating captions:  20%|██        | 200/1000 [19:28<1:15:53,  5.69s/it]

Saved checkpoint at 200 images


Generating captions:  22%|██▎       | 225/1000 [21:54<1:15:11,  5.82s/it]

Saved checkpoint at 225 images


Generating captions:  25%|██▌       | 250/1000 [24:18<1:10:55,  5.67s/it]

Saved checkpoint at 250 images


Generating captions:  28%|██▊       | 275/1000 [26:42<1:11:01,  5.88s/it]

Saved checkpoint at 275 images


Generating captions:  30%|███       | 300/1000 [29:10<1:08:51,  5.90s/it]

Saved checkpoint at 300 images


Generating captions:  32%|███▎      | 325/1000 [31:36<1:06:05,  5.87s/it]

Saved checkpoint at 325 images


Generating captions:  35%|███▌      | 350/1000 [34:01<1:03:05,  5.82s/it]

Saved checkpoint at 350 images


Generating captions:  38%|███▊      | 375/1000 [36:27<1:04:38,  6.21s/it]

Saved checkpoint at 375 images


Generating captions:  40%|████      | 400/1000 [38:55<58:59,  5.90s/it]  

Saved checkpoint at 400 images


Generating captions:  42%|████▎     | 425/1000 [41:16<55:08,  5.75s/it]

Saved checkpoint at 425 images


Generating captions:  45%|████▌     | 450/1000 [43:41<53:24,  5.83s/it]

Saved checkpoint at 450 images


Generating captions:  48%|████▊     | 475/1000 [46:09<51:07,  5.84s/it]

Saved checkpoint at 475 images


Generating captions:  50%|█████     | 500/1000 [48:34<46:45,  5.61s/it]

Saved checkpoint at 500 images


Generating captions:  52%|█████▎    | 525/1000 [51:01<47:44,  6.03s/it]

Saved checkpoint at 525 images


Generating captions:  55%|█████▌    | 550/1000 [53:25<44:20,  5.91s/it]

Saved checkpoint at 550 images


Generating captions:  57%|█████▊    | 575/1000 [55:50<40:33,  5.73s/it]

Saved checkpoint at 575 images


Generating captions:  60%|██████    | 600/1000 [58:22<40:05,  6.01s/it]

Saved checkpoint at 600 images


Generating captions:  62%|██████▎   | 625/1000 [1:00:50<38:00,  6.08s/it]

Saved checkpoint at 625 images


Generating captions:  65%|██████▌   | 650/1000 [1:03:20<34:39,  5.94s/it]

Saved checkpoint at 650 images


Generating captions:  68%|██████▊   | 675/1000 [1:05:51<34:26,  6.36s/it]

Saved checkpoint at 675 images


Generating captions:  70%|███████   | 700/1000 [1:08:24<29:51,  5.97s/it]

Saved checkpoint at 700 images


Generating captions:  72%|███████▎  | 725/1000 [1:10:51<28:04,  6.13s/it]

Saved checkpoint at 725 images


Generating captions:  75%|███████▌  | 750/1000 [1:13:20<24:58,  5.99s/it]

Saved checkpoint at 750 images


Generating captions:  78%|███████▊  | 775/1000 [1:15:44<21:37,  5.77s/it]

Saved checkpoint at 775 images


Generating captions:  80%|████████  | 800/1000 [1:18:11<19:48,  5.94s/it]

Saved checkpoint at 800 images


Generating captions:  82%|████████▎ | 825/1000 [1:20:36<16:04,  5.51s/it]

Saved checkpoint at 825 images


Generating captions:  85%|████████▌ | 850/1000 [1:23:05<14:44,  5.90s/it]

Saved checkpoint at 850 images


Generating captions:  88%|████████▊ | 875/1000 [1:25:32<12:20,  5.92s/it]

Saved checkpoint at 875 images


Generating captions:  90%|█████████ | 900/1000 [1:27:55<09:00,  5.40s/it]

Saved checkpoint at 900 images


Generating captions:  92%|█████████▎| 925/1000 [1:30:23<07:26,  5.96s/it]

Saved checkpoint at 925 images


Generating captions:  95%|█████████▌| 950/1000 [1:32:47<04:51,  5.83s/it]

Saved checkpoint at 950 images


Generating captions:  98%|█████████▊| 975/1000 [1:35:11<02:26,  5.85s/it]

Saved checkpoint at 975 images


Generating captions: 100%|██████████| 1000/1000 [1:37:40<00:00,  5.86s/it]

Saved checkpoint at 1000 images

Caption generation complete!
Total images processed: 1000





## 5. Save Raw Results

Save the generated captions to CSV.

In [6]:
# Persist every generated caption (one row per image) before any evaluation happens
results_df = pd.DataFrame(results)
output_file = os.path.join(
    MedicalConfig.RESULTS_DIR,
    'medical_captions_{}.csv'.format(MedicalConfig.SPLIT)
)
results_df.to_csv(output_file, index=False)
print(f"Saved results to: {output_file}")

# Preview the first few rows, limited to the columns that matter when eyeballing output
print("\nSample results (first 3 rows):")
print(
    results_df.head(3)[
        ['image_id', 'original_caption', 'generated_caption', 'processing_time_sec']
    ]
)

Saved results to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_captions_train.csv

Sample results (first 3 rows):
                   image_id  \
0  ROCOv2_2023_train_000001   
1  ROCOv2_2023_train_000002   
2  ROCOv2_2023_train_000003   

                                    original_caption  \
0            Head CT demonstrating left parotiditis.   
1  Acquired renal cysts in end-stage renal failur...   
2  Computed tomography of the chest showing the r...   

                                   generated_caption  processing_time_sec  
0  This medical image is a computed tomography (C...             6.440045  
1  The medical image appears to be an ultrasound ...             6.124556  
2  This medical image is a computed tomography (C...             5.952285  


## 6. Model Cleanup

Free up GPU memory after caption generation.

In [7]:
# Drop the captioner's model and processor and empty the CUDA cache; after this
# cell `captioner` can no longer generate captions
captioner.cleanup()
print("GPU memory freed")

Model cleaned up
GPU memory freed


## 7. Evaluate Captions

Compute evaluation metrics for the generated captions.

In [8]:
print("\n" + "=" * 100)
print("Evaluating Generated Captions")
print("=" * 100)

# Filter out error captions for evaluation.
# The captioner reports failures as captions beginning with the sentinel "Error:";
# matching the exact sentinel avoids discarding a genuine caption that merely starts
# with the word "Error". na=True treats a missing caption as a failure as well.
is_error = results_df['generated_caption'].str.startswith('Error:', na=True)
valid_results = results_df[~is_error].copy()

print(f"\nTotal captions: {len(results_df)}")
print(f"Valid captions for evaluation: {len(valid_results)}")
print(f"Error captions: {len(results_df) - len(valid_results)}")


Evaluating Generated Captions

Total captions: 1000
Valid captions for evaluation: 1000
Error captions: 0


In [9]:
# Metric columns that the ranking and visualization cells further down look for
EXPECTED_METRICS = ('BLEU', 'METEOR', 'ROUGE-L')

if len(valid_results) > 0:
    # Compute metrics
    print(f"\nComputing evaluation metrics...")
    detailed_metrics, aggregate_stats = metrics_aggregator.evaluate_batch(
        valid_results['original_caption'].tolist(),
        valid_results['generated_caption'].tolist(),
        include_bert=MedicalConfig.COMPUTE_BERT_SCORE
    )

    # Merge metrics with results (row order matches the lists passed above)
    results_with_metrics = valid_results.copy()
    for col in detailed_metrics.columns:
        if col != 'image_index':
            results_with_metrics[col] = detailed_metrics[col].values

    # Report what was actually produced: a metric whose dependency is missing only
    # logs an error and is left out, which is otherwise easy to overlook
    computed_metrics = [col for col in detailed_metrics.columns if col != 'image_index']
    missing_metrics = [name for name in EXPECTED_METRICS if name not in detailed_metrics.columns]
    print(f"Metrics computed: {', '.join(computed_metrics) if computed_metrics else 'none'}")
    if missing_metrics:
        print(
            f"WARNING: expected metric columns are missing: {', '.join(missing_metrics)}. "
            "Check the error messages above (for example, a missing dependency)."
        )
    else:
        print(f"Metrics computed successfully!")
else:
    print("No valid captions to evaluate (all captions had errors)")


Computing evaluation metrics...
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Error computing METEOR: No module named 'nltk'
Error computing BLEU: No module named 'nltk'
Er

## 8. Save Evaluation Results

Save detailed metrics and aggregate statistics.

In [10]:
# Nothing was evaluated when every caption failed, so only write files if there are rows
if len(valid_results):
    # Per-image results with the metric columns appended
    metrics_output_file = os.path.join(
        MedicalConfig.RESULTS_DIR,
        'medical_captions_{}_with_metrics.csv'.format(MedicalConfig.SPLIT)
    )
    results_with_metrics.to_csv(metrics_output_file, index=False)
    print(f"Saved results with metrics to: {metrics_output_file}")

    # Hand the metric tables to the aggregator, which writes its own report files
    # into the results directory under the given filename prefix
    output_prefix = 'medical_{}'.format(MedicalConfig.SPLIT)
    metrics_aggregator.save_results(
        detailed_metrics,
        aggregate_stats,
        MedicalConfig.RESULTS_DIR,
        output_prefix
    )

Saved results with metrics to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_captions_train_with_metrics.csv
Saved detailed metrics to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_train_detailed.csv
Saved aggregate statistics to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_train_aggregate.json
Saved summary report to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_train_summary.txt


## 9. Results Summary

Display evaluation metrics summary.

In [11]:
# The aggregate statistics only exist when at least one caption was evaluated
if len(valid_results):
    metrics_aggregator.print_summary(
        aggregate_stats,
        "Medical Image Captioning Evaluation - {} split".format(MedicalConfig.SPLIT)
    )


Medical Image Captioning Evaluation - train split

Length_Ratio:
  Mean: 15.6755 ± 16.3375
  Range: [1.5645, 215.0000]



## 10. Performance Analysis

Summarize the run: how many captions were produced, how many were valid for evaluation, and the average and total processing time.

In [12]:
banner = "=" * 100
print("\n" + banner)
print("Processing Complete!")
print(banner)
print("Total images processed: {}".format(len(results)))
print("Valid captions for evaluation: {}".format(len(valid_results)))

# Timing figures come from the per-image durations recorded during generation
seconds_per_image = results_df['processing_time_sec']
print(f"Average processing time per image: {seconds_per_image.mean():.2f} seconds")
total_seconds = seconds_per_image.sum()
print(f"Total processing time: {total_seconds:.2f} seconds ({total_seconds/60:.2f} minutes)")
print(f"\nResults saved to: {output_file}")


Processing Complete!
Total images processed: 1000
Valid captions for evaluation: 1000
Average processing time per image: 5.85 seconds
Total processing time: 5853.17 seconds (97.55 minutes)

Results saved to: /home/vortex/CSE 468 AFE/Project/results_medical/medical_captions_train.csv


## 11. Detailed Results Inspection

Inspect detailed metrics for individual captions.

In [13]:
if len(valid_results) > 0:
    # Show detailed metrics
    print("\nDetailed Metrics Summary:")
    print(detailed_metrics.describe())

    # Show top performing captions
    print("\n" + "=" * 100)
    print("Top 5 Best Performing Captions (by BLEU score)")
    print("=" * 100)
    if 'BLEU' in detailed_metrics.columns:
        top_rows = detailed_metrics.nlargest(5, 'BLEU')
        # Take the score from the ranked row itself rather than a second positional
        # lookup; image_index is used as the row position of the caption in valid_results
        for rank, (image_index, bleu) in enumerate(
            zip(top_rows['image_index'], top_rows['BLEU']), 1
        ):
            sample = valid_results.iloc[int(image_index)]
            print(f"\n{rank}. Image ID: {sample['image_id']}")
            print(f"   Original: {sample['original_caption'][:100]}...")
            print(f"   Generated: {sample['generated_caption'][:100]}...")
            print(f"   BLEU: {bleu:.4f}")
    else:
        # Say so explicitly instead of printing a header with nothing under it
        print("BLEU scores are not available (no 'BLEU' column was computed); nothing to rank.")


Detailed Metrics Summary:
       Length_Ratio  image_index
count   1000.000000  1000.000000
mean      15.675468   499.500000
std       16.337478   288.819436
min        1.564516     0.000000
25%        7.173929   249.750000
50%       11.193750   499.500000
75%       17.931818   749.250000
max      215.000000   999.000000

Top 5 Best Performing Captions (by BLEU score)


In [14]:
if len(valid_results) > 0:
    print("\n" + "=" * 100)
    print("Bottom 5 Lowest Performing Captions (by BLEU score)")
    print("=" * 100)
    if 'BLEU' in detailed_metrics.columns:
        bottom_rows = detailed_metrics.nsmallest(5, 'BLEU')
        # Take the score from the ranked row itself rather than a second positional
        # lookup; image_index is used as the row position of the caption in valid_results
        for rank, (image_index, bleu) in enumerate(
            zip(bottom_rows['image_index'], bottom_rows['BLEU']), 1
        ):
            sample = valid_results.iloc[int(image_index)]
            print(f"\n{rank}. Image ID: {sample['image_id']}")
            print(f"   Original: {sample['original_caption'][:100]}...")
            print(f"   Generated: {sample['generated_caption'][:100]}...")
            print(f"   BLEU: {bleu:.4f}")
    else:
        # Say so explicitly instead of printing a header with nothing under it
        print("BLEU scores are not available (no 'BLEU' column was computed); nothing to rank.")


Bottom 5 Lowest Performing Captions (by BLEU score)


## 12. Visualizations (Optional)

Create visualizations for metrics analysis.

In [15]:
# Optional: Import visualization libraries
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    if len(valid_results) > 0 and 'BLEU' in detailed_metrics.columns:
        # Create visualization
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        
        # BLEU distribution
        axes[0, 0].hist(detailed_metrics['BLEU'], bins=30, edgecolor='black', alpha=0.7)
        axes[0, 0].set_title('BLEU Score Distribution')
        axes[0, 0].set_xlabel('BLEU Score')
        axes[0, 0].set_ylabel('Frequency')
        
        # METEOR distribution
        if 'METEOR' in detailed_metrics.columns:
            axes[0, 1].hist(detailed_metrics['METEOR'], bins=30, edgecolor='black', alpha=0.7, color='orange')
            axes[0, 1].set_title('METEOR Score Distribution')
            axes[0, 1].set_xlabel('METEOR Score')
            axes[0, 1].set_ylabel('Frequency')
        
        # ROUGE-L distribution
        if 'ROUGE-L' in detailed_metrics.columns:
            axes[1, 0].hist(detailed_metrics['ROUGE-L'], bins=30, edgecolor='black', alpha=0.7, color='green')
            axes[1, 0].set_title('ROUGE-L Score Distribution')
            axes[1, 0].set_xlabel('ROUGE-L Score')
            axes[1, 0].set_ylabel('Frequency')
        
        # Processing time distribution
        axes[1, 1].hist(valid_results['processing_time_sec'], bins=30, edgecolor='black', alpha=0.7, color='red')
        axes[1, 1].set_title('Processing Time Distribution')
        axes[1, 1].set_xlabel('Processing Time (seconds)')
        axes[1, 1].set_ylabel('Frequency')
        
        plt.tight_layout()
        plt.savefig(os.path.join(MedicalConfig.RESULTS_DIR, 'metrics_visualization.png'), dpi=100, bbox_inches='tight')
        plt.show()
        print(f"\nVisualization saved to: {os.path.join(MedicalConfig.RESULTS_DIR, 'metrics_visualization.png')}")
except ImportError:
    print("Matplotlib/Seaborn not installed. Skipping visualizations.")
except Exception as e:
    print(f"Error creating visualizations: {e}")

Matplotlib/Seaborn not installed. Skipping visualizations.
