In [1]:

from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor
from utils import MultiModalDataset, MultiModalCollator
import torch
from torch.utils.data import Dataset
from PIL import Image
import av
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Any
import os


from trainer import MoETrainer, MoE3Arguments

from transformers import TrainingArguments
from evaluation import run_full_evaluation

from datasets import load_from_disk
from transformers import TrainingArguments, Trainer

    
from torch import nn



from MoE3LoRA import apply_peft

cache_dir = "/ocean/projects/cis250258p/mkowsher/hf_cache"
model_name = "/ocean/projects/cis250258p/mkowsher/hf_cache/llava-onevision-qwen2-7b-ov-hf"
save_csv="evaluation_results_lora.csv"
data_root="/ocean/projects/cis250258p/mkowsher/dataset/MVBench"
dataset_name="hf_mvbench_updated"

# Load processor
processor = LlavaOnevisionProcessor.from_pretrained(model_name, cache_dir=cache_dir)

# ‚≠ê CRITICAL: Set fixed resolution BEFORE using processor
processor.image_processor.image_grid_pinpoints = [[384, 384]]
processor.image_processor.size = {"height": 384, "width": 384}
processor.tokenizer.padding_side = "left" 

if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Load model
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir=cache_dir,
)


# ‚≠ê CRITICAL: Update model config to match
model.config.image_grid_pinpoints = [[384, 384]]

# ‚≠ê VERIFY the settings
print("="*50)
print("VERIFICATION:")
print("="*50)
print(f"Processor image_grid_pinpoints: {processor.image_processor.image_grid_pinpoints}")
print(f"Model config image_grid_pinpoints: {model.config.image_grid_pinpoints}")
print(f"Vision config image_size: {model.config.vision_config.image_size}")
print("="*50)

model.gradient_checkpointing_enable()

# Load dataset
dataset = load_from_disk(dataset_name)

train_dataset = MultiModalDataset(
    dataset=dataset['train'],
    processor=processor,
    data_root=data_root,
    num_video_frames=8,
    max_length=2048,
)
collator = MultiModalCollator(
    processor=processor,
    max_length=2048,
)

# ============ TEST FIRST ============
print("\n" + "="*50)
print("TESTING SINGLE SAMPLES:")
print("="*50)

# Test one of each modality
for i in range(min(50, len(train_dataset))):
    sample = train_dataset[i]
    src_type = sample.get('source_type', 'unknown')
    print(f"\nSample {i} ({src_type}):")
    print(f"  input_ids: {sample['input_ids'].shape}")
    if 'pixel_values' in sample:
        print(f"  pixel_values: {sample['pixel_values'].shape}")
    if 'image_sizes' in sample:
        print(f"  image_sizes: {sample['image_sizes']}")
    if 'pixel_values_videos' in sample:
        print(f"  pixel_values_videos: {sample['pixel_values_videos'].shape}")
    
    # Stop after finding one of each
    if i > 10:
        break

# Test collator with small batch
print("\n" + "="*50)
print("TESTING COLLATOR:")
print("="*50)

test_samples = [train_dataset[i] for i in range(4)]
test_batch = collator(test_samples)
print("Batch keys:", list(test_batch.keys()))
for k, v in test_batch.items():
    if isinstance(v, torch.Tensor):
        print(f"  {k}: {v.shape}")



import torch

# Collect exactly 20 samples from the PROCESSED dataset
samples = []
for i in range(20):
    samples.append(train_dataset[i])  # ‚úÖ Use train_dataset, not dataset['train']

# Track modalities before collating
modalities = [s['source_type'] for s in samples]

# Run through collator
batch = collator(samples)

# Inspect tensor shapes
print("="*70)
print("BATCH TENSOR SHAPES:")
print("="*70)
for k, v in batch.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: {v.shape}, dtype={v.dtype}")
    elif isinstance(v, list):
        print(f"{k}: list of {len(v)} items")

# Check modalities
print("\n" + "="*70)
print("SAMPLE MODALITIES:")
print("="*70)
for i, mod in enumerate(modalities):
    print(f"Sample {i}: {mod}")

# Count by modality
from collections import Counter
mod_counts = Counter(modalities)
print(f"\nModality distribution: {dict(mod_counts)}")

# Check visual data
print("\n" + "="*70)
print("VISUAL DATA INFO:")
print("="*70)
if 'pixel_values' in batch:
    print(f"pixel_values (images): {batch['pixel_values'].shape}")
    print(f"  - Number of image samples in batch: {modalities.count('image')}")
else:
    print("No images in this batch")

if 'pixel_values_videos' in batch:
    print(f"pixel_values_videos: {batch['pixel_values_videos'].shape}")
    print(f"  - Number of video samples in batch: {modalities.count('video')}")
else:
    print("No videos in this batch")

if 'image_sizes' in batch:
    print(f"image_sizes: {batch['image_sizes'].shape}")

# Decode and inspect
print("\n" + "="*70)
print("DECODED TEXT (last 150 tokens of each sample):")
print("="*70)

for i in range(min(10, len(samples))):  # First 10 for readability
    modality = modalities[i]
    
    # Get last 150 tokens
    last_tokens = batch["input_ids"][i][-150:]
    decoded = processor.tokenizer.decode(last_tokens, skip_special_tokens=False)
    
    # Get labels
    labels = batch["labels"][i]
    label_tokens = labels[labels != -100]
    label_text = processor.tokenizer.decode(label_tokens, skip_special_tokens=False) if len(label_tokens) > 0 else "[NO LABELS]"
    
    print(f"\n{'='*70}")
    print(f"SAMPLE {i} | Modality: {modality}")
    print(f"{'='*70}")
    print(f"Last 150 tokens decoded:\n{decoded}")
    print(f"\nüìù LABELS (what model learns to predict):\n{label_text}")
    print(f"Label token count: {len(label_tokens)}")

# Verify token-feature alignment for images
print("\n" + "="*70)
print("TOKEN-FEATURE ALIGNMENT CHECK:")
print("="*70)
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
video_token_id = processor.tokenizer.convert_tokens_to_ids("<video>")

total_image_tokens = (batch['input_ids'] == image_token_id).sum().item()
total_video_tokens = (batch['input_ids'] == video_token_id).sum().item()

print(f"Total <image> tokens in batch: {total_image_tokens}")
print(f"Total <video> tokens in batch: {total_video_tokens}")

if 'pixel_values' in batch:
    # For fixed 384x384, each image = 1 patch = 729 features (27x27) after pooling
    # But depends on model config
    print(f"pixel_values shape: {batch['pixel_values'].shape}")
    
if 'pixel_values_videos' in batch:
    print(f"pixel_values_videos shape: {batch['pixel_values_videos'].shape}")
    
    


targets = ["q_proj", "k_proj", "v_proj", "o_proj", "out_proj"]


def lm_only_targets(path, module):
    # Only wrap modules under model.language_model
    if not path.startswith("model.language_model."):
        return False

    # Expect paths like: model.language_model.layers.<idx>.*
    if ".layers." not in path:
        return False

    try:
        layer_id = int(path.split(".layers.")[1].split(".")[0])
    except (IndexError, ValueError):
        return False


    return any(path.endswith(name) for name in targets)

model = apply_peft(
    model,
    targets=targets,
    num_experts=4,
    rank=2,
    use_shared_moe3=False,
    n_gram=2,
    top_k=1,
    rep_mode="token",
    jitter_noise=0.1,
    tokenizer=processor.tokenizer,
    temperature=0.5,
    gamma_routing = 0.7, 
    auto_topk=True, 
    auto_topk_threshold=0.5, 
    peft_dtype=torch.float32,   # A, B in float32
    moe_dtype=torch.float32,    # moe3s, gamma in float32

)


# 1) Count params that require grad
trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
total = sum(p.numel() for _, p in model.named_parameters())
trainable_num = sum(p.numel() for _, p in trainable)
print(f"trainable params: {trainable_num:,} / {total:,}")




#model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
training_args = MoE3Arguments(
    output_dir="./llava-lora-finetuned",
    per_device_train_batch_size=5,
    gradient_accumulation_steps=4,  
    save_total_limit=2,
    save_steps=500000,
    num_train_epochs=2,
    bf16=True,  
    logging_dir="./logs",
    logging_steps=100,
    remove_unused_columns=False, 

    eval_steps=100,
    save_strategy="steps",
    optim="adamw_bnb_8bit",
    learning_rate=2e-4,
    warmup_ratio=0.03,
    weight_decay=0.01,
    report_to="none",
    disable_tqdm=False,          # makes it print log lines instead of tqdm bar behavior
    #log_level="info",
    #logging_first_step=True,
    moe_lr=1e-3,          # For propulsions, gamma (float32)
    peft_lr=2e-4,         # For LoRA A/B (float32)
    importance_coef=0.1,
    kl_coef=0.01,
  
)

# Also make sure model doesn't have it enabled
model.gradient_checkpointing_disable()  # ‚≠ê Call this explicitly



# Example instantiation:
trainer = MoETrainer(
    model=model,
    args=training_args,                  # your HF TrainingArguments
    train_dataset=train_dataset,

    tokenizer=processor.tokenizer,
    data_collator=collator,  # ‚úÖ Custom collator dynamically pads batch sequences


)
trainer.train() 



model.eval()
    
# Full evaluation
results = run_full_evaluation(
    model=model,
    processor=processor,
    dataset=dataset,
    data_root=data_root,
    num_samples_per_split=1000,
    batch_size=6,
    max_new_tokens=50, 
    save_csv=save_csv,
)






  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [03:38<00:00, 54.53s/it]


VERIFICATION:
Processor image_grid_pinpoints: [[384, 384]]
Model config image_grid_pinpoints: [[384, 384]]
Vision config image_size: 384

TESTING SINGLE SAMPLES:

Sample 0 (image):
  input_ids: torch.Size([1509])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 1 (text):
  input_ids: torch.Size([59])

Sample 2 (image):
  input_ids: torch.Size([1508])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 3 (video):
  input_ids: torch.Size([1614])
  pixel_values_videos: torch.Size([8, 3, 384, 384])

Sample 4 (image):
  input_ids: torch.Size([1525])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 5 (text):
  input_ids: torch.Size([50])

Sample 6 (image):
  input_ids: torch.Size([1522])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([32, 32])

Sample 7 (text):
  input_ids: torch.Size([83])

Sample 8 (text):
  input_ids: torch.Size([48])

Sample 9 (text):
  input

  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


trainable params: 3,137,536 / 8,033,945,120
Parameter Groups
[MOE] 216 tensors, 1,396,736 params, lr=1.00e-03
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.moe3s
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.moe3s
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.moe3s
[PEFT] 432 tensors, 1,740,800 params, lr=2.00e-04
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.A
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.B
  - model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.A


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,5.3601
200,0.259
300,0.1976
400,0.1977
500,0.1904
600,0.1768
700,0.1708
800,0.1811
900,0.1717
1000,0.173



EVALUATION: 47 splits, 1000 samples each, batch_size=6

Evaluating: image_test_chartqa (1000 samples, batch_size=6)
Task type: classification


image_test_chartqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:55<00:00,  1.05s/it]



üìä image_test_chartqa: 596/1000 = 59.60%
   ‚ùå True: 15.84 | Pred: 15.52 | Raw: '15.52'
   ‚úÖ True: 146 | Pred: 146 | Raw: '146'
   ‚ùå True: 10.37 | Pred: 11.06 | Raw: '11.06'
   ‚ùå True: 14722.84 | Pred: 14738.34 | Raw: '14738.34'
   ‚úÖ True: 75.82 | Pred: 75.82 | Raw: '75.82'

Evaluating: image_test_okvqa (841 samples, batch_size=6)
Task type: multiple_choice


image_test_okvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 141/141 [01:57<00:00,  1.20it/s]



üìä image_test_okvqa: 387/841 = 46.02%
   ‚úÖ True: river | Pred: river | Raw: 'river'
   ‚ùå True: carlo collodi | Pred: jeanne | Raw: 'jeanne disi'
   ‚ùå True: 1936 | Pred: 1938 | Raw: '1938'
   ‚úÖ True: airplane | Pred: airplane | Raw: 'airplane'
   ‚úÖ True: recreational | Pred: recreational | Raw: 'recreational'

Evaluating: image_test_scienceqa (518 samples, batch_size=6)
Task type: classification


image_test_scienceqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [01:00<00:00,  1.45it/s]



üìä image_test_scienceqa: 502/518 = 96.91%
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: image_test_seed_bench (500 samples, batch_size=6)
Task type: classification


image_test_seed_bench: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:59<00:00,  1.42it/s]



üìä image_test_seed_bench: 387/500 = 77.40%
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: d | Pred: d | Raw: 'D'
   ‚ùå True: b | Pred: d | Raw: 'D'
   ‚úÖ True: d | Pred: d | Raw: 'D'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: image_test_text_recognition (1000 samples, batch_size=6)
Task type: classification


image_test_text_recognition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:22<00:00,  1.17it/s]



üìä image_test_text_recognition: 900/1000 = 90.00%
   ‚úÖ True: exhibicionismus | Pred: exhibicionismus | Raw: 'EXHIBICIONISMUS'
   ‚úÖ True: delikatesem | Pred: delikatesem | Raw: 'Delikatesem'
   ‚úÖ True: vydojme | Pred: vydojme | Raw: 'Vydojme'
   ‚úÖ True: odlepena | Pred: odlepena | Raw: 'Odlepena'
   ‚úÖ True: normuj | Pred: normuj | Raw: 'normuj'

Evaluating: image_test_textvqa (1000 samples, batch_size=6)
Task type: multiple_choice


image_test_textvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:38<00:00,  1.05it/s]



üìä image_test_textvqa: 465/1000 = 46.50%
   ‚ùå True: 32 | Pred: 22 | Raw: '22'
   ‚ùå True: sopko | Pred: supko | Raw: 'supko'
   ‚úÖ True: 1970 | Pred: 1970 | Raw: '1970'
   ‚ùå True: over burning witches | Pred: over | Raw: 'over burning witches'
   ‚úÖ True: go | Pred: go | Raw: 'go'

Evaluating: image_test_vizwiz_vqa (417 samples, batch_size=6)
Task type: multiple_choice


image_test_vizwiz_vqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [01:03<00:00,  1.10it/s]



üìä image_test_vizwiz_vqa: 170/417 = 40.77%
   ‚ùå True: silver | Pred: grey | Raw: 'grey'
   ‚úÖ True: pink | Pred: pink | Raw: 'pink'
   ‚úÖ True: street | Pred: street | Raw: 'street'
   ‚ùå True: minnie riperton | Pred: minnie | Raw: 'minnie riperton'
   ‚ùå True: andes mints | Pred: cd | Raw: 'cd case'

Evaluating: image_test_vqa_rad (200 samples, batch_size=6)
Task type: multiple_choice


image_test_vqa_rad: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 34/34 [00:23<00:00,  1.46it/s]



üìä image_test_vqa_rad: 166/200 = 83.00%
   ‚úÖ True: false | Pred: false | Raw: 'no'
   ‚úÖ True: false | Pred: false | Raw: 'no'
   ‚ùå True: true | Pred: false | Raw: 'no'
   ‚úÖ True: true | Pred: true | Raw: 'yes'
   ‚ùå True: false | Pred: true | Raw: 'yes'

Evaluating: image_test_caltech101 (500 samples, batch_size=6)
Task type: classification


image_test_caltech101: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:07<00:00,  1.25it/s]



üìä image_test_caltech101: 389/500 = 77.80%
   ‚úÖ True: trilobite | Pred: trilobite | Raw: 'trilobite'
   ‚ùå True: inline skate | Pred: inline | Raw: 'inline skate'
   ‚úÖ True: emu | Pred: emu | Raw: 'emu'
   ‚ùå True: wild cat | Pred: wild | Raw: 'wild cat'
   ‚ùå True: inline skate | Pred: inline | Raw: 'inline skate'

Evaluating: image_test_eurosat (500 samples, batch_size=6)
Task type: classification


image_test_eurosat: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:56<00:00,  1.49it/s]



üìä image_test_eurosat: 294/500 = 58.80%
   ‚úÖ True: forest | Pred: forest | Raw: 'Forest'
   ‚úÖ True: forest | Pred: forest | Raw: 'Forest'
   ‚úÖ True: forest | Pred: forest | Raw: 'Forest'
   ‚úÖ True: forest | Pred: forest | Raw: 'Forest'
   ‚úÖ True: forest | Pred: forest | Raw: 'Forest'

Evaluating: image_test_flowers102 (500 samples, batch_size=6)
Task type: classification


image_test_flowers102: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:15<00:00,  1.12it/s]



üìä image_test_flowers102: 190/500 = 38.00%
   ‚úÖ True: petunia | Pred: petunia | Raw: 'petunia'
   ‚úÖ True: petunia | Pred: petunia | Raw: 'petunia'
   ‚ùå True: passion flower | Pred: passion | Raw: 'passion flower'
   ‚ùå True: passion flower | Pred: passion | Raw: 'passion flower'
   ‚úÖ True: petunia | Pred: petunia | Raw: 'petunia'

Evaluating: image_test_pets (500 samples, batch_size=6)
Task type: classification


image_test_pets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:11<00:00,  1.17it/s]



üìä image_test_pets: 235/500 = 47.00%
   ‚úÖ True: leonberger | Pred: leonberger | Raw: 'leonberger'
   ‚úÖ True: leonberger | Pred: leonberger | Raw: 'leonberger'
   ‚úÖ True: leonberger | Pred: leonberger | Raw: 'leonberger'
   ‚úÖ True: leonberger | Pred: leonberger | Raw: 'leonberger'
   ‚úÖ True: leonberger | Pred: leonberger | Raw: 'leonberger'

Evaluating: image_test_svhn (500 samples, batch_size=6)
Task type: classification


image_test_svhn: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:50<00:00,  1.65it/s]



üìä image_test_svhn: 484/500 = 96.80%
   ‚úÖ True: 1 | Pred: 1 | Raw: '1'
   ‚úÖ True: 1 | Pred: 1 | Raw: '1'
   ‚úÖ True: 1 | Pred: 1 | Raw: '1'
   ‚úÖ True: 1 | Pred: 1 | Raw: '1'
   ‚úÖ True: 1 | Pred: 1 | Raw: '1'

Evaluating: image_test_camelyon (500 samples, batch_size=6)
Task type: classification


image_test_camelyon: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:52<00:00,  1.59it/s]



üìä image_test_camelyon: 446/500 = 89.20%
   ‚úÖ True: tumor | Pred: tumor | Raw: 'tumor'
   ‚úÖ True: tumor | Pred: tumor | Raw: 'tumor'
   ‚úÖ True: tumor | Pred: tumor | Raw: 'tumor'
   ‚úÖ True: tumor | Pred: tumor | Raw: 'tumor'
   ‚ùå True: tumor | Pred: normal | Raw: 'normal'

Evaluating: text_test_arc_challenge (500 samples, batch_size=6)
Task type: classification


text_test_arc_challenge: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.50it/s]



üìä text_test_arc_challenge: 436/500 = 87.20%
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚ùå True: c | Pred: b | Raw: 'B'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚ùå True: c | Pred: b | Raw: 'B'

Evaluating: text_test_arc_easy (500 samples, batch_size=6)
Task type: classification


text_test_arc_easy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.59it/s]



üìä text_test_arc_easy: 473/500 = 94.60%
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: text_test_boolq (1000 samples, batch_size=6)
Task type: classification


text_test_boolq: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:18<00:00,  8.79it/s]



üìä text_test_boolq: 731/1000 = 73.10%
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚ùå True: a | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: text_test_hellaswag (1000 samples, batch_size=6)
Task type: classification


text_test_hellaswag: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.55it/s]



üìä text_test_hellaswag: 890/1000 = 89.00%
   ‚ùå True: d | Pred: b | Raw: 'B'
   ‚úÖ True: d | Pred: d | Raw: 'D'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: b | Pred: b | Raw: 'B'

Evaluating: text_test_openbookqa (500 samples, batch_size=6)
Task type: classification


text_test_openbookqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.67it/s]



üìä text_test_openbookqa: 451/500 = 90.20%
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚ùå True: c | Pred: b | Raw: 'B'

Evaluating: text_test_piqa (1000 samples, batch_size=6)
Task type: classification


text_test_piqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.57it/s]



üìä text_test_piqa: 873/1000 = 87.30%
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚ùå True: b | Pred: a | Raw: 'A'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: text_test_social_i_qa (1000 samples, batch_size=6)
Task type: classification


text_test_social_i_qa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.72it/s]



üìä text_test_social_i_qa: 815/1000 = 81.50%
   ‚úÖ True: c | Pred: c | Raw: 'C'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚ùå True: b | Pred: c | Raw: 'C'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: c | Pred: c | Raw: 'C'

Evaluating: text_test_winogrande (1000 samples, batch_size=6)
Task type: classification


text_test_winogrande: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.71it/s]



üìä text_test_winogrande: 853/1000 = 85.30%
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: b | Pred: b | Raw: 'B'
   ‚úÖ True: a | Pred: a | Raw: 'A'
   ‚úÖ True: a | Pred: a | Raw: 'A'

Evaluating: glue_test_sst2 (872 samples, batch_size=6)
Task type: classification


glue_test_sst2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 146/146 [00:16<00:00,  8.69it/s]



üìä glue_test_sst2: 834/872 = 95.64%
   ‚úÖ True: positive | Pred: positive | Raw: 'positive'
   ‚úÖ True: negative | Pred: negative | Raw: 'negative'
   ‚úÖ True: positive | Pred: positive | Raw: 'positive'
   ‚úÖ True: positive | Pred: positive | Raw: 'positive'
   ‚úÖ True: negative | Pred: negative | Raw: 'negative'

Evaluating: glue_test_qnli (1000 samples, batch_size=6)
Task type: classification


glue_test_qnli: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:46<00:00,  3.61it/s]



üìä glue_test_qnli: 925/1000 = 92.50%
   ‚úÖ True: entailment | Pred: entailment | Raw: 'entailment'
   ‚úÖ True: not_entailment | Pred: not_entailment | Raw: 'not_entailment'
   ‚ùå True: not_entailment | Pred: entailment | Raw: 'entailment'
   ‚úÖ True: entailment | Pred: entailment | Raw: 'entailment'
   ‚úÖ True: not_entailment | Pred: not_entailment | Raw: 'not_entailment'

Evaluating: glue_test_qqp (1000 samples, batch_size=6)
Task type: classification


glue_test_qqp: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:28<00:00,  5.93it/s]



üìä glue_test_qqp: 882/1000 = 88.20%
   ‚úÖ True: not_duplicate | Pred: not_duplicate | Raw: 'not_duplicate'
   ‚úÖ True: not_duplicate | Pred: not_duplicate | Raw: 'not_duplicate'
   ‚úÖ True: duplicate | Pred: duplicate | Raw: 'duplicate'
   ‚úÖ True: not_duplicate | Pred: not_duplicate | Raw: 'not_duplicate'
   ‚úÖ True: not_duplicate | Pred: not_duplicate | Raw: 'not_duplicate'

Evaluating: glue_test_cola (1000 samples, batch_size=6)
Task type: classification


glue_test_cola: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:26<00:00,  6.20it/s]



üìä glue_test_cola: 852/1000 = 85.20%
   ‚úÖ True: acceptable | Pred: acceptable | Raw: 'acceptable'
   ‚úÖ True: acceptable | Pred: acceptable | Raw: 'acceptable'
   ‚úÖ True: acceptable | Pred: acceptable | Raw: 'acceptable'
   ‚úÖ True: acceptable | Pred: acceptable | Raw: 'acceptable'
   ‚ùå True: unacceptable | Pred: acceptable | Raw: 'acceptable'

Evaluating: glue_test_mrpc (408 samples, batch_size=6)
Task type: classification


glue_test_mrpc: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 68/68 [00:14<00:00,  4.61it/s]



üìä glue_test_mrpc: 363/408 = 88.97%
   ‚úÖ True: equivalent | Pred: equivalent | Raw: 'equivalent'
   ‚úÖ True: not_equivalent | Pred: not_equivalent | Raw: 'not_equivalent'
   ‚úÖ True: not_equivalent | Pred: not_equivalent | Raw: 'not_equivalent'
   ‚úÖ True: equivalent | Pred: equivalent | Raw: 'equivalent'
   ‚úÖ True: not_equivalent | Pred: not_equivalent | Raw: 'not_equivalent'

Evaluating: glue_test_stsb (1000 samples, batch_size=6)
Task type: regression


glue_test_stsb: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:39<00:00,  1.05it/s]



üìä glue_test_stsb:
   Pearson:  0.9260
   Spearman: 0.9264
   Close (¬±0.5): 627/1000 = 62.70%
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 4.75 | Pred: 4.00 | Raw: '4.0'
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 2.40 | Pred: 3.00 | Raw: '3.0'
   ‚úÖ True: 2.75 | Pred: 2.80 | Raw: '2.799999952316284'

Evaluating: video_test_action_sequence (300 samples, batch_size=6)
Task type: classification


video_test_action_sequence:  16%|‚ñà‚ñå        | 8/50 [00:22<01:57,  2.81s/it]


KeyboardInterrupt: 

In [None]:
!nvidia-smi 

In [2]:
from evaluation1 import run_full_evaluation
results = run_full_evaluation(
    model=model,
    processor=processor,
    dataset=dataset,
    data_root=data_root,
    num_samples_per_split=1000,
    batch_size=6,
    max_new_tokens=50, 
    save_csv=save_csv,
)



EVALUATION: 47 splits, 1000 samples each, batch_size=6
Numeric tolerance: 0.5

Evaluating: image_test_chartqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_chartqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:10<00:00,  1.28it/s]



üìä image_test_chartqa: 724/1000 = 72.40%
   Match types: {'numeric': 66, 'exact': 657, 'no_match': 276, 'contains': 1}
   ‚úÖ True: '15.84' | Pred: '15.52' | Raw: '15.52' [numeric]
   ‚úÖ True: '146' | Pred: '146' | Raw: '146' [exact]
   ‚ùå True: '10.37' | Pred: '11.06' | Raw: '11.06' 
   ‚ùå True: '14722.84' | Pred: '14738.34' | Raw: '14738.34' 
   ‚úÖ True: '75.82' | Pred: '75.82' | Raw: '75.82' [exact]

Evaluating: image_test_okvqa (841 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_okvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 141/141 [01:28<00:00,  1.59it/s]



üìä image_test_okvqa: 476/841 = 56.60%
   Match types: {'exact': 453, 'no_match': 365, 'contains': 21, 'numeric': 2}
   ‚úÖ True: 'river' | Pred: 'river' | Raw: 'river' [exact]
   ‚ùå True: 'carlo collodi' | Pred: 'jeanne disi' | Raw: 'jeanne disi' 
   ‚ùå True: '1936' | Pred: '1938' | Raw: '1938' 
   ‚úÖ True: 'airplane' | Pred: 'airplane' | Raw: 'airplane' [exact]
   ‚úÖ True: 'recreational' | Pred: 'recreational' | Raw: 'recreational' [exact]

Evaluating: image_test_scienceqa (518 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_scienceqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [00:45<00:00,  1.92it/s]



üìä image_test_scienceqa: 502/518 = 96.91%
   Match types: {'exact': 502, 'no_match': 16}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_seed_bench (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_seed_bench: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:43<00:00,  1.92it/s]



üìä image_test_seed_bench: 387/500 = 77.40%
   Match types: {'exact': 387, 'no_match': 113}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_text_recognition (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_text_recognition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [01:55<00:00,  1.45it/s]



üìä image_test_text_recognition: 901/1000 = 90.10%
   Match types: {'exact': 897, 'no_match': 99, 'contains': 4}
   ‚úÖ True: 'exhibicionismus' | Pred: 'exhibicionismus' | Raw: 'EXHIBICIONISMUS' [exact]
   ‚úÖ True: 'delikatesem' | Pred: 'delikatesem' | Raw: 'Delikatesem' [exact]
   ‚úÖ True: 'vydojme' | Pred: 'vydojme' | Raw: 'Vydojme' [exact]
   ‚úÖ True: 'odlepena' | Pred: 'odlepena' | Raw: 'Odlepena' [exact]
   ‚ùå True: 'normuj' | Pred: 'false' | Raw: 'normuj' 

Evaluating: image_test_textvqa (1000 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_textvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:08<00:00,  1.30it/s]



üìä image_test_textvqa: 695/1000 = 69.50%
   Match types: {'no_match': 305, 'exact': 626, 'numeric': 19, 'contains': 50}
   ‚ùå True: '32' | Pred: '22' | Raw: '22' 
   ‚ùå True: 'sopko' | Pred: 'supko' | Raw: 'supko' 
   ‚úÖ True: '1970' | Pred: '1970' | Raw: '1970' [exact]
   ‚úÖ True: 'over burning witches' | Pred: 'over burning witches' | Raw: 'over burning witches' [exact]
   ‚úÖ True: 'go' | Pred: 'go' | Raw: 'go' [exact]

Evaluating: image_test_vizwiz_vqa (417 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vizwiz_vqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [00:50<00:00,  1.40it/s]



üìä image_test_vizwiz_vqa: 273/417 = 65.47%
   Match types: {'no_match': 144, 'exact': 249, 'numeric': 1, 'contains': 23}
   ‚ùå True: 'silver' | Pred: 'grey' | Raw: 'grey' 
   ‚úÖ True: 'pink' | Pred: 'pink' | Raw: 'pink' [exact]
   ‚úÖ True: 'street' | Pred: 'street' | Raw: 'street' [exact]
   ‚úÖ True: 'minnie riperton' | Pred: 'minnie riperton' | Raw: 'minnie riperton' [exact]
   ‚ùå True: 'andes mints' | Pred: 'cd case' | Raw: 'cd case' 

Evaluating: image_test_vqa_rad (200 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vqa_rad: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 34/34 [00:18<00:00,  1.88it/s]



üìä image_test_vqa_rad: 166/200 = 83.00%
   Match types: {'exact': 166, 'no_match': 34}
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚ùå True: 'true' | Pred: 'false' | Raw: 'no' 
   ‚úÖ True: 'true' | Pred: 'true' | Raw: 'yes' [exact]
   ‚ùå True: 'false' | Pred: 'true' | Raw: 'yes' 

Evaluating: image_test_caltech101 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_caltech101: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:51<00:00,  1.64it/s]



üìä image_test_caltech101: 479/500 = 95.80%
   Match types: {'exact': 472, 'contains': 7, 'no_match': 21}
   ‚úÖ True: 'trilobite' | Pred: 'trilobite' | Raw: 'trilobite' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]
   ‚úÖ True: 'emu' | Pred: 'emu' | Raw: 'emu' [exact]
   ‚úÖ True: 'wild cat' | Pred: 'wild cat' | Raw: 'wild cat' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]

Evaluating: image_test_eurosat (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_eurosat: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:45<00:00,  1.85it/s]



üìä image_test_eurosat: 479/500 = 95.80%
   Match types: {'exact': 479, 'no_match': 21}
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]

Evaluating: image_test_flowers102 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_flowers102: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:59<00:00,  1.40it/s]



üìä image_test_flowers102: 482/500 = 96.40%
   Match types: {'exact': 482, 'no_match': 18}
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]

Evaluating: image_test_pets (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_pets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:57<00:00,  1.46it/s]



üìä image_test_pets: 482/500 = 96.40%
   Match types: {'exact': 482, 'no_match': 18}
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]

Evaluating: image_test_svhn (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_svhn: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:37<00:00,  2.26it/s]



üìä image_test_svhn: 484/500 = 96.80%
   Match types: {'exact': 484, 'no_match': 16}
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]

Evaluating: image_test_camelyon (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_camelyon: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:41<00:00,  2.04it/s]



üìä image_test_camelyon: 216/500 = 43.20%
   Match types: {'exact': 216, 'no_match': 284}
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚ùå True: 'tumor' | Pred: 'false' | Raw: 'normal' 

Evaluating: text_test_arc_challenge (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_challenge: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.65it/s]



üìä text_test_arc_challenge: 436/500 = 87.20%
   Match types: {'exact': 436, 'no_match': 64}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 

Evaluating: text_test_arc_easy (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_easy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.67it/s]



üìä text_test_arc_easy: 473/500 = 94.60%
   Match types: {'exact': 473, 'no_match': 27}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_boolq (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_boolq: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.79it/s]



üìä text_test_boolq: 731/1000 = 73.10%
   Match types: {'exact': 731, 'no_match': 269}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'a' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_hellaswag (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_hellaswag: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.73it/s]



üìä text_test_hellaswag: 890/1000 = 89.00%
   Match types: {'no_match': 110, 'exact': 890}
   ‚ùå True: 'd' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: text_test_openbookqa (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_openbookqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.80it/s]



üìä text_test_openbookqa: 451/500 = 90.20%
   Match types: {'exact': 451, 'no_match': 49}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 

Evaluating: text_test_piqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_piqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.53it/s]



üìä text_test_piqa: 873/1000 = 87.30%
   Match types: {'exact': 873, 'no_match': 127}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'b' | Pred: 'a' | Raw: 'A' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_social_i_qa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_social_i_qa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.77it/s]



üìä text_test_social_i_qa: 815/1000 = 81.50%
   Match types: {'exact': 815, 'no_match': 185}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: text_test_winogrande (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_winogrande: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:18<00:00,  8.79it/s]



üìä text_test_winogrande: 853/1000 = 85.30%
   Match types: {'exact': 853, 'no_match': 147}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: glue_test_sst2 (872 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_sst2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 146/146 [00:16<00:00,  8.74it/s]



üìä glue_test_sst2: 834/872 = 95.64%
   Match types: {'exact': 834, 'no_match': 38}
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]

Evaluating: glue_test_qnli (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qnli: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:46<00:00,  3.63it/s]



üìä glue_test_qnli: 492/1000 = 49.20%
   Match types: {'exact': 445, 'no_match': 508, 'contains': 47}
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚ùå True: 'not_entailment' | Pred: 'false' | Raw: 'not_entailment' 
   ‚úÖ True: 'not_entailment' | Pred: 'entailment' | Raw: 'entailment' [contains]
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚ùå True: 'not_entailment' | Pred: 'false' | Raw: 'not_entailment' 

Evaluating: glue_test_qqp (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qqp: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:28<00:00,  5.95it/s]



üìä glue_test_qqp: 339/1000 = 33.90%
   Match types: {'no_match': 661, 'exact': 274, 'contains': 65}
   ‚ùå True: 'not_duplicate' | Pred: 'false' | Raw: 'not_duplicate' 
   ‚ùå True: 'not_duplicate' | Pred: 'false' | Raw: 'not_duplicate' 
   ‚úÖ True: 'duplicate' | Pred: 'duplicate' | Raw: 'duplicate' [exact]
   ‚ùå True: 'not_duplicate' | Pred: 'false' | Raw: 'not_duplicate' 
   ‚ùå True: 'not_duplicate' | Pred: 'false' | Raw: 'not_duplicate' 

Evaluating: glue_test_cola (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_cola: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:26<00:00,  6.25it/s]



üìä glue_test_cola: 1000/1000 = 100.00%
   Match types: {'exact': 917, 'contains': 83}
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'unacceptable' | Pred: 'acceptable' | Raw: 'acceptable' [contains]

Evaluating: glue_test_mrpc (408 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_mrpc: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 68/68 [00:14<00:00,  4.63it/s]



üìä glue_test_mrpc: 288/408 = 70.59%
   Match types: {'exact': 261, 'no_match': 120, 'contains': 27}
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚ùå True: 'not_equivalent' | Pred: 'false' | Raw: 'not_equivalent' 
   ‚ùå True: 'not_equivalent' | Pred: 'false' | Raw: 'not_equivalent' 
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚ùå True: 'not_equivalent' | Pred: 'false' | Raw: 'not_equivalent' 

Evaluating: glue_test_stsb (1000 samples, batch_size=6)
Task type: regression, Numeric tolerance: 0.5


glue_test_stsb: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:38<00:00,  1.05it/s]



üìä glue_test_stsb:
   Pearson:  0.9260
   Spearman: 0.9264
   Close (¬±0.5): 636/1000 = 63.60%
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 4.75 | Pred: 4.00 | Raw: '4.0'
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 2.40 | Pred: 3.00 | Raw: '3.0'
   ‚úÖ True: 2.75 | Pred: 2.80 | Raw: '2.799999952316284'

Evaluating: video_test_action_sequence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_sequence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:38<00:00,  1.96s/it]



üìä video_test_action_sequence: 111/300 = 37.00%
   Match types: {'no_match': 189, 'exact': 111}
   ‚ùå True: 'd' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'e' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'b' | Raw: 'B' 

Evaluating: video_test_action_prediction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_prediction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:47<00:00,  2.16s/it]



üìä video_test_action_prediction: 96/300 = 32.00%
   Match types: {'no_match': 204, 'exact': 96}
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: video_test_action_antonym (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_antonym: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:52<00:00,  1.04s/it]



üìä video_test_action_antonym: 223/300 = 74.33%
   Match types: {'exact': 223, 'no_match': 77}
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

Evaluating: video_test_fine_grained_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_fine_grained_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:06<00:00,  1.33s/it]



üìä video_test_fine_grained_action: 225/300 = 75.00%
   Match types: {'exact': 225, 'no_match': 75}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: video_test_unexpected_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_unexpected_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [07:12<00:00,  8.64s/it]



üìä video_test_unexpected_action: 142/300 = 47.33%
   Match types: {'no_match': 158, 'exact': 142}
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: video_test_object_existence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_existence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:58<00:00,  1.16s/it]



üìä video_test_object_existence: 270/300 = 90.00%
   Match types: {'exact': 270, 'no_match': 30}
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_object_interaction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_interaction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:47<00:00,  2.14s/it]



üìä video_test_object_interaction: 95/300 = 31.67%
   Match types: {'no_match': 205, 'exact': 95}
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'c' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_object_shuffle (300 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


video_test_object_shuffle:  12%|‚ñà‚ñè        | 6/50 [01:27<10:40, 14.56s/it]


KeyboardInterrupt: 

In [None]:
# ============================================
# Run this AFTER training, BEFORE evaluation
# ============================================

import gc
import torch



# 2. Clear gradients from model (important!)
model.zero_grad(set_to_none=True)

# 3. Make sure model is in eval mode (disables dropout, etc.)
model.eval()

# 4. Clear any cached gradient computation graphs
for param in model.parameters():
    param.grad = None

# 5. Garbage collect and clear CUDA cache
gc.collect()
torch.cuda.empty_cache()

# 6. Check memory
print(f"GPU allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"GPU reserved:  {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

GPU allocated: 4.55 GB
GPU reserved:  4.67 GB


In [4]:
from evaluation2 import run_full_evaluation
results = run_full_evaluation(
    model=model,
    processor=processor,
    dataset=dataset,
    data_root=data_root,
    num_samples_per_split=1000,
    batch_size=6,
    max_new_tokens=50, 
    save_csv=save_csv,
)



EVALUATION: 47 splits, 1000 samples each, batch_size=6
Numeric tolerance: 0.5

Evaluating: image_test_chartqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_chartqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:14<00:00,  1.24it/s]



üìä image_test_chartqa: 727/1000 = 72.70%
   Match types: {'numeric': 66, 'exact': 657, 'no_match': 273, 'contains': 4}
   ‚úÖ True: '15.84' | Pred: '15.52' | Raw: '15.52' [numeric]
   ‚úÖ True: '146' | Pred: '146' | Raw: '146' [exact]
   ‚ùå True: '10.37' | Pred: '11.06' | Raw: '11.06' 
   ‚ùå True: '14722.84' | Pred: '14738.34' | Raw: '14738.34' 
   ‚úÖ True: '75.82' | Pred: '75.82' | Raw: '75.82' [exact]

Evaluating: image_test_okvqa (841 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_okvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 141/141 [01:28<00:00,  1.59it/s]



üìä image_test_okvqa: 479/841 = 56.96%
   Match types: {'exact': 453, 'no_match': 362, 'contains': 24, 'numeric': 2}
   ‚úÖ True: 'river' | Pred: 'river' | Raw: 'river' [exact]
   ‚ùå True: 'carlo collodi' | Pred: 'jeanne' | Raw: 'jeanne disi' 
   ‚ùå True: '1936' | Pred: '1938' | Raw: '1938' 
   ‚úÖ True: 'airplane' | Pred: 'airplane' | Raw: 'airplane' [exact]
   ‚úÖ True: 'recreational' | Pred: 'recreational' | Raw: 'recreational' [exact]

Evaluating: image_test_scienceqa (518 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_scienceqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 87/87 [00:45<00:00,  1.89it/s]



üìä image_test_scienceqa: 502/518 = 96.91%
   Match types: {'exact': 502, 'no_match': 16}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_seed_bench (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_seed_bench: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:43<00:00,  1.91it/s]



üìä image_test_seed_bench: 387/500 = 77.40%
   Match types: {'exact': 387, 'no_match': 113}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_text_recognition (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_text_recognition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [01:56<00:00,  1.43it/s]



üìä image_test_text_recognition: 907/1000 = 90.70%
   Match types: {'exact': 903, 'no_match': 93, 'contains': 4}
   ‚úÖ True: 'exhibicionismus' | Pred: 'exhibicionismus' | Raw: 'EXHIBICIONISMUS' [exact]
   ‚úÖ True: 'delikatesem' | Pred: 'delikatesem' | Raw: 'Delikatesem' [exact]
   ‚úÖ True: 'vydojme' | Pred: 'vydojme' | Raw: 'Vydojme' [exact]
   ‚úÖ True: 'odlepena' | Pred: 'odlepena' | Raw: 'Odlepena' [exact]
   ‚úÖ True: 'normuj' | Pred: 'normuj' | Raw: 'normuj' [exact]

Evaluating: image_test_textvqa (1000 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_textvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:09<00:00,  1.29it/s]



üìä image_test_textvqa: 732/1000 = 73.20%
   Match types: {'no_match': 268, 'exact': 630, 'contains': 83, 'numeric': 19}
   ‚ùå True: '32' | Pred: '22' | Raw: '22' 
   ‚ùå True: 'sopko' | Pred: 'supko' | Raw: 'supko' 
   ‚úÖ True: '1970' | Pred: '1970' | Raw: '1970' [exact]
   ‚úÖ True: 'over burning witches' | Pred: 'over burning witches' | Raw: 'over burning witches' [exact]
   ‚úÖ True: 'go' | Pred: 'go' | Raw: 'go' [exact]

Evaluating: image_test_vizwiz_vqa (417 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vizwiz_vqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [00:49<00:00,  1.40it/s]



üìä image_test_vizwiz_vqa: 291/417 = 69.78%
   Match types: {'no_match': 126, 'exact': 250, 'contains': 40, 'numeric': 1}
   ‚ùå True: 'silver' | Pred: 'grey' | Raw: 'grey' 
   ‚úÖ True: 'pink' | Pred: 'pink' | Raw: 'pink' [exact]
   ‚úÖ True: 'street' | Pred: 'street' | Raw: 'street' [exact]
   ‚úÖ True: 'minnie riperton' | Pred: 'minnie riperton' | Raw: 'minnie riperton' [exact]
   ‚ùå True: 'andes mints' | Pred: 'cd' | Raw: 'cd case' 

Evaluating: image_test_vqa_rad (200 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vqa_rad: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 34/34 [00:17<00:00,  1.90it/s]



üìä image_test_vqa_rad: 166/200 = 83.00%
   Match types: {'exact': 166, 'no_match': 34}
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚ùå True: 'true' | Pred: 'false' | Raw: 'no' 
   ‚úÖ True: 'true' | Pred: 'true' | Raw: 'yes' [exact]
   ‚ùå True: 'false' | Pred: 'true' | Raw: 'yes' 

Evaluating: image_test_caltech101 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_caltech101: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:51<00:00,  1.64it/s]



üìä image_test_caltech101: 479/500 = 95.80%
   Match types: {'exact': 472, 'contains': 7, 'no_match': 21}
   ‚úÖ True: 'trilobite' | Pred: 'trilobite' | Raw: 'trilobite' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]
   ‚úÖ True: 'emu' | Pred: 'emu' | Raw: 'emu' [exact]
   ‚úÖ True: 'wild cat' | Pred: 'wild cat' | Raw: 'wild cat' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]

Evaluating: image_test_eurosat (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_eurosat: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:45<00:00,  1.85it/s]



üìä image_test_eurosat: 479/500 = 95.80%
   Match types: {'exact': 479, 'no_match': 21}
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]

Evaluating: image_test_flowers102 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_flowers102: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:00<00:00,  1.39it/s]



üìä image_test_flowers102: 482/500 = 96.40%
   Match types: {'exact': 482, 'no_match': 18}
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]

Evaluating: image_test_pets (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_pets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:58<00:00,  1.43it/s]



üìä image_test_pets: 483/500 = 96.60%
   Match types: {'exact': 482, 'no_match': 17, 'contains': 1}
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]

Evaluating: image_test_svhn (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_svhn: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:38<00:00,  2.20it/s]



üìä image_test_svhn: 484/500 = 96.80%
   Match types: {'exact': 484, 'no_match': 16}
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]

Evaluating: image_test_camelyon (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_camelyon: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:42<00:00,  1.98it/s]



üìä image_test_camelyon: 446/500 = 89.20%
   Match types: {'exact': 446, 'no_match': 54}
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚ùå True: 'tumor' | Pred: 'normal' | Raw: 'normal' 

Evaluating: text_test_arc_challenge (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_challenge: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.63it/s]



üìä text_test_arc_challenge: 436/500 = 87.20%
   Match types: {'exact': 436, 'no_match': 64}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 

Evaluating: text_test_arc_easy (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_easy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.71it/s]



üìä text_test_arc_easy: 473/500 = 94.60%
   Match types: {'exact': 473, 'no_match': 27}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_boolq (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_boolq: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:18<00:00,  8.81it/s]



üìä text_test_boolq: 731/1000 = 73.10%
   Match types: {'exact': 731, 'no_match': 269}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'a' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_hellaswag (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_hellaswag: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.52it/s]



üìä text_test_hellaswag: 890/1000 = 89.00%
   Match types: {'no_match': 110, 'exact': 890}
   ‚ùå True: 'd' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: text_test_openbookqa (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_openbookqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:09<00:00,  8.59it/s]



üìä text_test_openbookqa: 451/500 = 90.20%
   Match types: {'exact': 451, 'no_match': 49}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 

Evaluating: text_test_piqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_piqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.51it/s]



üìä text_test_piqa: 873/1000 = 87.30%
   Match types: {'exact': 873, 'no_match': 127}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'b' | Pred: 'a' | Raw: 'A' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_social_i_qa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_social_i_qa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.74it/s]



üìä text_test_social_i_qa: 815/1000 = 81.50%
   Match types: {'exact': 815, 'no_match': 185}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: text_test_winogrande (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_winogrande: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:19<00:00,  8.72it/s]



üìä text_test_winogrande: 853/1000 = 85.30%
   Match types: {'exact': 853, 'no_match': 147}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: glue_test_sst2 (872 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_sst2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 146/146 [00:16<00:00,  8.67it/s]



üìä glue_test_sst2: 834/872 = 95.64%
   Match types: {'exact': 834, 'no_match': 38}
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]

Evaluating: glue_test_qnli (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qnli: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:46<00:00,  3.58it/s]



üìä glue_test_qnli: 953/1000 = 95.30%
   Match types: {'exact': 953, 'no_match': 47}
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚úÖ True: 'not_entailment' | Pred: 'not_entailment' | Raw: 'not_entailment' [exact]
   ‚ùå True: 'not_entailment' | Pred: 'entailment' | Raw: 'entailment' 
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚úÖ True: 'not_entailment' | Pred: 'not_entailment' | Raw: 'not_entailment' [exact]

Evaluating: glue_test_qqp (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qqp: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:28<00:00,  5.88it/s]



üìä glue_test_qqp: 882/1000 = 88.20%
   Match types: {'exact': 882, 'no_match': 118}
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'duplicate' | Pred: 'duplicate' | Raw: 'duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]

Evaluating: glue_test_cola (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_cola: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:26<00:00,  6.19it/s]



üìä glue_test_cola: 852/1000 = 85.20%
   Match types: {'exact': 852, 'no_match': 148}
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚ùå True: 'unacceptable' | Pred: 'acceptable' | Raw: 'acceptable' 

Evaluating: glue_test_mrpc (408 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_mrpc: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 68/68 [00:14<00:00,  4.60it/s]



üìä glue_test_mrpc: 363/408 = 88.97%
   Match types: {'exact': 363, 'no_match': 45}
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]

Evaluating: glue_test_stsb (1000 samples, batch_size=6)
Task type: regression, Numeric tolerance: 0.5


glue_test_stsb: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:37<00:00,  1.06it/s]



üìä glue_test_stsb:
   Pearson:  0.9260
   Spearman: 0.9264
   Close (¬±0.5): 636/1000 = 63.60%
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 4.75 | Pred: 4.00 | Raw: '4.0'
   ‚úÖ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ‚ùå True: 2.40 | Pred: 3.00 | Raw: '3.0'
   ‚úÖ True: 2.75 | Pred: 2.80 | Raw: '2.799999952316284'

Evaluating: video_test_action_sequence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_sequence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:18<00:00,  1.56s/it]



üìä video_test_action_sequence: 111/300 = 37.00%
   Match types: {'no_match': 189, 'exact': 111}
   ‚ùå True: 'd' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'e' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'b' | Raw: 'B' 

Evaluating: video_test_action_prediction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_prediction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:22<00:00,  1.65s/it]



üìä video_test_action_prediction: 96/300 = 32.00%
   Match types: {'no_match': 204, 'exact': 96}
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: video_test_action_antonym (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_antonym: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:37<00:00,  1.33it/s]



üìä video_test_action_antonym: 223/300 = 74.33%
   Match types: {'exact': 223, 'no_match': 77}
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

Evaluating: video_test_fine_grained_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_fine_grained_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:46<00:00,  1.08it/s]



üìä video_test_fine_grained_action: 225/300 = 75.00%
   Match types: {'exact': 225, 'no_match': 75}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: video_test_unexpected_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_unexpected_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [06:03<00:00,  7.27s/it]



üìä video_test_unexpected_action: 142/300 = 47.33%
   Match types: {'no_match': 158, 'exact': 142}
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: video_test_object_existence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_existence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:42<00:00,  1.17it/s]



üìä video_test_object_existence: 270/300 = 90.00%
   Match types: {'exact': 270, 'no_match': 30}
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_object_interaction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_interaction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:16<00:00,  1.53s/it]



üìä video_test_object_interaction: 95/300 = 31.67%
   Match types: {'no_match': 205, 'exact': 95}
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'c' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_object_shuffle (300 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


video_test_object_shuffle: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [09:53<00:00, 11.86s/it]



üìä video_test_object_shuffle: 99/300 = 33.00%
   Match types: {'no_match': 201, 'exact': 99}
   ‚ùå True: 'f' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_moving_direction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_direction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:01<00:00,  1.23s/it]



üìä video_test_moving_direction: 264/300 = 88.00%
   Match types: {'exact': 264, 'no_match': 36}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: video_test_action_localization (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_localization: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [03:44<00:00,  4.48s/it]



üìä video_test_action_localization: 108/300 = 36.00%
   Match types: {'no_match': 192, 'exact': 108}
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'd' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_scene_transition (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_scene_transition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:26<00:00,  1.74s/it]



üìä video_test_scene_transition: 79/300 = 26.33%
   Match types: {'no_match': 221, 'exact': 79}
   ‚ùå True: 'a' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'e' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'a' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_action_count (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_count: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [08:40<00:00, 10.42s/it]



üìä video_test_action_count: 101/300 = 33.67%
   Match types: {'exact': 101, 'no_match': 199}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'd' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: video_test_moving_count (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_count: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:58<00:00,  1.16s/it]



üìä video_test_moving_count: 261/300 = 87.00%
   Match types: {'exact': 261, 'no_match': 39}
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

Evaluating: video_test_moving_attribute (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_attribute: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:58<00:00,  1.17s/it]



üìä video_test_moving_attribute: 268/300 = 89.33%
   Match types: {'exact': 268, 'no_match': 32}
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]

Evaluating: video_test_state_change (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_state_change: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:05<00:00, 12.10s/it]



üìä video_test_state_change: 117/300 = 39.00%
   Match types: {'exact': 117, 'no_match': 183}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'f' | Pred: 'c' | Raw: 'C' 

Evaluating: video_test_character_order (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_character_order: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:13<00:00, 12.26s/it]



üìä video_test_character_order: 95/300 = 31.67%
   Match types: {'exact': 95, 'no_match': 205}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'c' | Pred: 'a' | Raw: 'A' 

Evaluating: video_test_egocentric_navigation (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_egocentric_navigation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:54<00:00,  1.08s/it]



üìä video_test_egocentric_navigation: 205/300 = 68.33%
   Match types: {'exact': 205, 'no_match': 95}
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: video_test_episodic_reasoning (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_episodic_reasoning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:35<00:00,  1.92s/it]



üìä video_test_episodic_reasoning: 86/300 = 28.67%
   Match types: {'no_match': 214, 'exact': 86}
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'f' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'd' | Pred: 'b' | Raw: 'B' 

Evaluating: video_test_counterfactual_inference (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_counterfactual_inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:01<00:00,  1.23s/it]


üìä video_test_counterfactual_inference: 261/300 = 87.00%
   Match types: {'exact': 261, 'no_match': 39}
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

SUMMARY

üìÅ GLUE_TEST:
   Classification: 3884/4280 = 90.75%
     - glue_test_sst2: 95.64%
     - glue_test_qnli: 95.30%
     - glue_test_qqp: 88.20%
     - glue_test_cola: 85.20%
     - glue_test_mrpc: 88.97%
   Regression: Pearson=0.9260, Spearman=0.9264
     - glue_test_stsb: Pearson=0.9260, Spearman=0.9264

üìÅ IMAGE_TEST:
   Classification: 7044/8476 = 83.11%
     - image_test_chartqa: 72.70%
     - image_test_okvqa: 56.96%
     - image_test_scienceqa: 96.91%
     - image_test_seed_bench: 77.40%
     - image_test_text_recognition: 90.70%
     - image_test_textvqa: 73.20%
     - image_test_vizwiz_vqa: 69.78%
     - image_test_vqa_rad: 83.00%
  




In [5]:
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint()'))
print("‚úÖ Notebook saved!")

<IPython.core.display.Javascript object>

‚úÖ Notebook saved!
