In [1]:

from transformers import LlavaOnevisionForConditionalGeneration, LlavaOnevisionProcessor
from utils import MultiModalDataset, MultiModalCollator
import torch
from torch.utils.data import Dataset
from PIL import Image
import av
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Any
import os


from trainer import print_LiME_summary, LiMEArguments, LiMETrainer

from transformers import TrainingArguments
from evaluation import run_full_evaluation

from datasets import load_from_disk
from transformers import TrainingArguments, Trainer

    
from torch import nn



from LiMELoRAFA import apply_peft

# ---------------------------------------------------------------------------
# Run configuration: local Hugging Face cache, model checkpoint directory,
# output CSV for the evaluation cell, and dataset locations.
# ---------------------------------------------------------------------------
cache_dir = "/ocean/projects/cis250258p/mkowsher/hf_cache"
model_name = "/ocean/projects/cis250258p/mkowsher/hf_cache/llava-onevision-qwen2-7b-ov-hf"
save_csv = "evaluation_results_lorafa.csv"
data_root = "/ocean/projects/cis250258p/mkowsher/dataset/MVBench"
dataset_name = "hf_mvbench_updated"

# ---------------------------------------------------------------------------
# Processor
# ---------------------------------------------------------------------------
processor = LlavaOnevisionProcessor.from_pretrained(model_name, cache_dir=cache_dir)

# CRITICAL: pin a single fixed 384x384 resolution BEFORE the processor is used,
# so every image is preprocessed against the same grid.
_image_processor = processor.image_processor
_image_processor.image_grid_pinpoints = [[384, 384]]
_image_processor.size = {"height": 384, "width": 384}

_tokenizer = processor.tokenizer
_tokenizer.padding_side = "left"
# Fall back to EOS as the padding token when the tokenizer defines none.
if _tokenizer.pad_token is None:
    _tokenizer.pad_token = _tokenizer.eos_token

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir=cache_dir,
)

# CRITICAL: keep the model config in sync with the processor's grid setting.
model.config.image_grid_pinpoints = [[384, 384]]

# Print the effective settings so a mismatch is visible before training.
_rule = "=" * 50
print(_rule)
print("VERIFICATION:")
print(_rule)
print(f"Processor image_grid_pinpoints: {_image_processor.image_grid_pinpoints}")
print(f"Model config image_grid_pinpoints: {model.config.image_grid_pinpoints}")
print(f"Vision config image_size: {model.config.vision_config.image_size}")
print(_rule)

# NOTE(review): gradient checkpointing is enabled here but explicitly disabled
# again later in this cell, right before the trainer is built - confirm which
# setting is actually intended.
model.gradient_checkpointing_enable()

# ---------------------------------------------------------------------------
# Dataset and collator (both share the same maximum sequence length)
# ---------------------------------------------------------------------------
dataset = load_from_disk(dataset_name)

_max_length = 2048
train_dataset = MultiModalDataset(
    dataset=dataset["train"],
    processor=processor,
    data_root=data_root,
    num_video_frames=8,
    max_length=_max_length,
)
collator = MultiModalCollator(processor=processor, max_length=_max_length)

# ============ TEST FIRST ============
print("\n" + "="*50)
print("TESTING SINGLE SAMPLES:")
print("="*50)

# Print tensor shapes for the first 12 samples (fewer if the dataset is
# smaller). The original loop iterated up to 50 but always broke out after
# index 11, so the bound is stated directly here. The loop does NOT stop once
# every modality has been seen; it simply shows a fixed-size prefix.
for i in range(min(12, len(train_dataset))):
    sample = train_dataset[i]
    src_type = sample.get('source_type', 'unknown')
    print(f"\nSample {i} ({src_type}):")
    print(f"  input_ids: {sample['input_ids'].shape}")
    # Visual tensors are only present for the matching modality.
    if 'pixel_values' in sample:
        print(f"  pixel_values: {sample['pixel_values'].shape}")
    if 'image_sizes' in sample:
        print(f"  image_sizes: {sample['image_sizes']}")
    if 'pixel_values_videos' in sample:
        print(f"  pixel_values_videos: {sample['pixel_values_videos'].shape}")

# Test collator with a small batch
print("\n" + "="*50)
print("TESTING COLLATOR:")
print("="*50)

# Clamp to the dataset size so a tiny dataset does not raise IndexError.
test_samples = [train_dataset[i] for i in range(min(4, len(train_dataset)))]
test_batch = collator(test_samples)
print("Batch keys:", list(test_batch.keys()))
for k, v in test_batch.items():
    if isinstance(v, torch.Tensor):
        print(f"  {k}: {v.shape}")



# Inspect one collated batch: tensor shapes, modality mix, decoded text/labels
# and the number of <image>/<video> placeholder tokens.
from collections import Counter

# Collect up to 20 samples from the PROCESSED dataset (train_dataset, not the
# raw dataset['train']); clamp so a small dataset does not raise IndexError.
samples = [train_dataset[i] for i in range(min(20, len(train_dataset)))]

# Track modalities before collating. Uses the same 'unknown' fallback as the
# single-sample check so a missing key does not abort the inspection.
modalities = [s.get('source_type', 'unknown') for s in samples]

# Run through collator
batch = collator(samples)

# Inspect tensor shapes
print("="*70)
print("BATCH TENSOR SHAPES:")
print("="*70)
for k, v in batch.items():
    if isinstance(v, torch.Tensor):
        print(f"{k}: {v.shape}, dtype={v.dtype}")
    elif isinstance(v, list):
        print(f"{k}: list of {len(v)} items")

# Check modalities
print("\n" + "="*70)
print("SAMPLE MODALITIES:")
print("="*70)
for i, mod in enumerate(modalities):
    print(f"Sample {i}: {mod}")

# Count by modality
mod_counts = Counter(modalities)
print(f"\nModality distribution: {dict(mod_counts)}")

# Check visual data
print("\n" + "="*70)
print("VISUAL DATA INFO:")
print("="*70)
if 'pixel_values' in batch:
    print(f"pixel_values (images): {batch['pixel_values'].shape}")
    print(f"  - Number of image samples in batch: {modalities.count('image')}")
else:
    print("No images in this batch")

if 'pixel_values_videos' in batch:
    print(f"pixel_values_videos: {batch['pixel_values_videos'].shape}")
    print(f"  - Number of video samples in batch: {modalities.count('video')}")
else:
    print("No videos in this batch")

if 'image_sizes' in batch:
    print(f"image_sizes: {batch['image_sizes'].shape}")

# Decode and inspect
print("\n" + "="*70)
print("DECODED TEXT (last 150 tokens of each sample):")
print("="*70)

# Only the first 10 samples are decoded, for readability.
for i, modality in enumerate(modalities[:10]):
    # Tail of the sequence (the last 150 positions of the padded row)
    last_tokens = batch["input_ids"][i][-150:]
    decoded = processor.tokenizer.decode(last_tokens, skip_special_tokens=False)

    # Positions labelled -100 are excluded; what remains is the supervised target.
    labels = batch["labels"][i]
    label_tokens = labels[labels != -100]
    label_text = processor.tokenizer.decode(label_tokens, skip_special_tokens=False) if len(label_tokens) > 0 else "[NO LABELS]"

    print(f"\n{'='*70}")
    print(f"SAMPLE {i} | Modality: {modality}")
    print(f"{'='*70}")
    print(f"Last 150 tokens decoded:\n{decoded}")
    print(f"\n📝 LABELS (what model learns to predict):\n{label_text}")
    print(f"Label token count: {len(label_tokens)}")

# Count placeholder tokens so they can be compared against the visual tensors
print("\n" + "="*70)
print("TOKEN-FEATURE ALIGNMENT CHECK:")
print("="*70)
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
video_token_id = processor.tokenizer.convert_tokens_to_ids("<video>")

total_image_tokens = (batch['input_ids'] == image_token_id).sum().item()
total_video_tokens = (batch['input_ids'] == video_token_id).sum().item()

print(f"Total <image> tokens in batch: {total_image_tokens}")
print(f"Total <video> tokens in batch: {total_video_tokens}")

if 'pixel_values' in batch:
    # The single-sample check reports pixel_values of shape [2, 3, 384, 384]
    # per image sample, i.e. two 384x384 crops per image rather than one
    # (presumably a base view plus one grid tile - TODO confirm). The number
    # of features per crop depends on the model config.
    print(f"pixel_values shape: {batch['pixel_values'].shape}")

if 'pixel_values_videos' in batch:
    print(f"pixel_values_videos shape: {batch['pixel_values_videos'].shape}")
    
    


# Names of the projection sub-modules that should receive adapters.
targets = ["q_proj", "k_proj", "v_proj", "o_proj", "out_proj"]


def lm_only_targets(path, module):
    """Return True if ``path`` names a target projection inside a language-model layer.

    A path is accepted only when all of the following hold:

    * it starts with ``model.language_model.``;
    * it contains ``.layers.<idx>.`` where ``<idx>`` parses as an integer;
    * its final dotted component is exactly one of the names in ``targets``.

    The final component is compared exactly rather than by suffix, so a module
    called e.g. ``fake_q_proj`` is not mistaken for ``q_proj``.

    Args:
        path: Dotted module path, e.g.
            ``model.language_model.layers.3.self_attn.q_proj``.
        module: The module object at ``path``. Unused; kept so the function
            matches a ``(path, module)`` filter signature.

    Returns:
        bool: whether the module at ``path`` should be wrapped.

    NOTE(review): in the visible notebook cell this filter is defined but not
    passed to ``apply_peft`` (only ``targets`` is) - confirm whether it should be.
    """
    # Only wrap modules under model.language_model
    if not path.startswith("model.language_model."):
        return False

    # Expect paths like: model.language_model.layers.<idx>.*
    if ".layers." not in path:
        return False

    # The token right after ".layers." must be an integer layer index.
    layer_token = path.split(".layers.", 1)[1].split(".", 1)[0]
    try:
        int(layer_token)
    except ValueError:
        return False

    return path.rsplit(".", 1)[-1] in targets

# Adapter / routing hyper-parameters handed to apply_peft. Their exact
# semantics are defined in LiMELoRAFA.apply_peft.
_peft_kwargs = dict(
    targets=targets,
    num_experts=4,
    rank=2,
    use_shared_LiME=True,
    n_gram=2,
    top_k=1,
    rep_mode="token",
    jitter_noise=0.1,
    tokenizer=processor.tokenizer,
    temperature=0.5,
    gamma_routing=0.7,
    auto_topk=True,
    auto_topk_threshold=0.5,
    peft_dtype=torch.float32,  # adapter matrices A and B kept in float32
    moe_dtype=torch.float32,   # routing-side parameters (incl. gamma) in float32 - presumably; confirm
)
model = apply_peft(model, **_peft_kwargs)

print_LiME_summary(model)

# Walk the parameters once: accumulate the overall element count and collect
# the (name, parameter) pairs that will receive gradients.
total = 0
trainable = []
for _name, _param in model.named_parameters():
    total += _param.numel()
    if _param.requires_grad:
        trainable.append((_name, _param))
trainable_num = sum(_param.numel() for _, _param in trainable)
print(f"trainable params: {trainable_num:,} / {total:,}")




# Training configuration and launch for the LiME fine-tuning run.
# The commented-out call below is an alternative (non-reentrant) way to enable
# gradient checkpointing; it is inactive, and checkpointing is explicitly
# disabled further down before the trainer is created.
#model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
training_args = LiMEArguments(
    output_dir="./llava-lora-finetuned",
    per_device_train_batch_size=5,
    gradient_accumulation_steps=4,  # 5 * 4 = 20 samples per optimizer step per device
    save_total_limit=2,
    # NOTE(review): the recorded run finished at global_step=15862, so with
    # save_steps=500000 no intermediate checkpoint appears to be written - confirm intended.
    save_steps=500000,
    num_train_epochs=2,
    bf16=True,  
    logging_dir="./logs",
    logging_steps=100,
    remove_unused_columns=False,  # keep all columns produced by the dataset for the custom collator

    # NOTE(review): no eval dataset or evaluation strategy is given in this cell,
    # so eval_steps presumably has no effect - confirm.
    eval_steps=100,
    save_strategy="steps",
    optim="adamw_bnb_8bit",
    learning_rate=2e-4,
    warmup_ratio=0.03,
    weight_decay=0.01,
    report_to="none",
    disable_tqdm=False,          # False keeps the tqdm progress bar; set True for plain log lines instead
    #log_level="info",
    #logging_first_step=True,
    moe_lr=1e-3,          # learning rate for the MoE/routing parameters, incl. gamma (float32) - per LiMEArguments; confirm
    peft_lr=4e-4,         # learning rate for the LoRA A/B matrices (float32)
    importance_coef=0.1,  # auxiliary-loss weight defined by LiMEArguments - semantics not visible here
    kl_coef=0.01,         # auxiliary-loss weight defined by LiMEArguments - semantics not visible here
  
)

# Make sure gradient checkpointing is off for training (it was enabled right
# after the model was loaded).
model.gradient_checkpointing_disable()  # called explicitly on purpose



# Build the trainer; no eval dataset is passed, so only training runs here.
trainer = LiMETrainer(
    model=model,
    args=training_args,                  # the LiMEArguments instance built above
    train_dataset=train_dataset,

    tokenizer=processor.tokenizer,
    data_collator=collator,  # custom multimodal collator; pads the sequences of each batch


)
trainer.train() 



    





  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.73s/it]


VERIFICATION:
Processor image_grid_pinpoints: [[384, 384]]
Model config image_grid_pinpoints: [[384, 384]]
Vision config image_size: 384

TESTING SINGLE SAMPLES:

Sample 0 (image):
  input_ids: torch.Size([1509])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 1 (text):
  input_ids: torch.Size([59])

Sample 2 (image):
  input_ids: torch.Size([1508])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 3 (video):
  input_ids: torch.Size([1614])
  pixel_values_videos: torch.Size([8, 3, 384, 384])

Sample 4 (image):
  input_ids: torch.Size([1525])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([512, 512])

Sample 5 (text):
  input_ids: torch.Size([50])

Sample 6 (image):
  input_ids: torch.Size([1522])
  pixel_values: torch.Size([2, 3, 384, 384])
  image_sizes: tensor([32, 32])

Sample 7 (text):
  input_ids: torch.Size([83])

Sample 8 (text):
  input_ids: torch.Size([48])

Sample 9 (text):
  input

  super().__init__(*args, **kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


BATCH TENSOR SHAPES:
input_ids: torch.Size([20, 1614]), dtype=torch.int64
attention_mask: torch.Size([20, 1614]), dtype=torch.int64
labels: torch.Size([20, 1614]), dtype=torch.int64
pixel_values: torch.Size([10, 3, 384, 384]), dtype=torch.float32
image_sizes: torch.Size([5, 2]), dtype=torch.int64
pixel_values_videos: torch.Size([3, 8, 3, 384, 384]), dtype=torch.float32

SAMPLE MODALITIES:
Sample 0: image
Sample 1: text
Sample 2: image
Sample 3: video
Sample 4: image
Sample 5: text
Sample 6: image
Sample 7: text
Sample 8: text
Sample 9: text
Sample 10: text
Sample 11: video
Sample 12: text
Sample 13: text
Sample 14: text
Sample 15: text
Sample 16: video
Sample 17: text
Sample 18: image
Sample 19: text

Modality distribution: {'image': 5, 'text': 12, 'video': 3}

VISUAL DATA INFO:
pixel_values (images): torch.Size([10, 3, 384, 384])
  - Number of image samples in batch: 5
pixel_values_videos: torch.Size([3, 8, 3, 384, 384])
  - Number of video samples in batch: 3
image_sizes: torch.Size(

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,6.2625
200,0.4858
300,0.2113
400,0.207
500,0.1934
600,0.1792
700,0.1675
800,0.1829
900,0.1701
1000,0.1752


TrainOutput(global_step=15862, training_loss=0.15963918629758012, metrics={'train_runtime': 90320.8397, 'train_samples_per_second': 3.512, 'train_steps_per_second': 0.176, 'total_flos': 2.035490262486711e+19, 'train_loss': 0.15963918629758012, 'task_loss': 0.009864027611911297, 'balance_loss': 0.0005144942551851273, 'epoch': 2.0})

In [2]:
# Evaluate the fine-tuned, in-memory model with the project's evaluation helper.
# Presumably the per-split results are also written to `save_csv` - confirm in evaluation.py.
from evaluation import run_full_evaluation
results = run_full_evaluation(
    model=model,
    processor=processor,
    dataset=dataset,  # the full object returned by load_from_disk, not only dataset['train']
    data_root=data_root,
    num_samples_per_split=1000,  # upper bound per split; the recorded run evaluates smaller splits in full (e.g. 841)
    batch_size=6,
    max_new_tokens=50, 
    save_csv=save_csv,
)



EVALUATION: 47 splits, 1000 samples each, batch_size=6
Numeric tolerance: 0.5

Evaluating: image_test_chartqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_chartqa: 100%|██████████| 167/167 [03:20<00:00,  1.20s/it]



📊 image_test_chartqa: 750/1000 = 75.00%
   Match types: {'no_match': 250, 'exact': 688, 'numeric': 58, 'contains': 4}
   ❌ True: '15.84' | Pred: '15.09' | Raw: '15.09' 
   ✅ True: '146' | Pred: '146' | Raw: '146' [exact]
   ❌ True: '10.37' | Pred: '11.32' | Raw: '11.32' 
   ❌ True: '14722.84' | Pred: '14738.81' | Raw: '14738.81' 
   ✅ True: '75.82' | Pred: '75.82' | Raw: '75.82' [exact]

Evaluating: image_test_okvqa (841 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_okvqa: 100%|██████████| 141/141 [02:20<00:00,  1.01it/s]



📊 image_test_okvqa: 489/841 = 58.15%
   Match types: {'exact': 464, 'no_match': 352, 'contains': 24, 'numeric': 1}
   ✅ True: 'river' | Pred: 'river' | Raw: 'river' [exact]
   ❌ True: 'carlo collodi' | Pred: 'gustave' | Raw: 'gustave flaubert' 
   ❌ True: '1936' | Pred: '1938' | Raw: '1938' 
   ✅ True: 'airplane' | Pred: 'airplane' | Raw: 'airplane' [exact]
   ✅ True: 'recreational' | Pred: 'recreational' | Raw: 'recreational' [exact]

Evaluating: image_test_scienceqa (518 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_scienceqa: 100%|██████████| 87/87 [01:05<00:00,  1.33it/s]



📊 image_test_scienceqa: 509/518 = 98.26%
   Match types: {'exact': 509, 'no_match': 9}
   ✅ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ✅ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ✅ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ✅ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ✅ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_seed_bench (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_seed_bench: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:02<00:00,  1.35it/s]



üìä image_test_seed_bench: 382/500 = 76.40%
   Match types: {'exact': 382, 'no_match': 118}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: image_test_text_recognition (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_text_recognition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:35<00:00,  1.07it/s]



üìä image_test_text_recognition: 882/1000 = 88.20%
   Match types: {'no_match': 118, 'exact': 874, 'contains': 8}
   ‚ùå True: 'exhibicionismus' | Pred: 'exhibicionisms' | Raw: 'EXHIBICIONISMS' 
   ‚úÖ True: 'delikatesem' | Pred: 'delikatesem' | Raw: 'Delikatesem' [exact]
   ‚úÖ True: 'vydojme' | Pred: 'vydojme' | Raw: 'Vydojme' [exact]
   ‚úÖ True: 'odlepena' | Pred: 'odlepena' | Raw: 'Odlepena' [exact]
   ‚úÖ True: 'normuj' | Pred: 'normuj' | Raw: 'normuj' [exact]

Evaluating: image_test_textvqa (1000 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_textvqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [02:57<00:00,  1.06s/it]



üìä image_test_textvqa: 721/1000 = 72.10%
   Match types: {'no_match': 279, 'exact': 628, 'contains': 75, 'numeric': 18}
   ‚ùå True: '32' | Pred: '22' | Raw: '22' 
   ‚úÖ True: 'sopko' | Pred: 'sopko' | Raw: 'sopko' [exact]
   ‚úÖ True: '1970' | Pred: '1970' | Raw: '1970' [exact]
   ‚úÖ True: 'over burning witches' | Pred: 'over burning witches' | Raw: 'over burning witches' [exact]
   ‚úÖ True: 'go' | Pred: 'go' | Raw: 'go' [exact]

Evaluating: image_test_vizwiz_vqa (417 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vizwiz_vqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 70/70 [01:15<00:00,  1.08s/it]



üìä image_test_vizwiz_vqa: 287/417 = 68.82%
   Match types: {'exact': 239, 'no_match': 130, 'contains': 46, 'numeric': 2}
   ‚úÖ True: 'silver' | Pred: 'silver' | Raw: 'silver' [exact]
   ‚úÖ True: 'pink' | Pred: 'pink' | Raw: 'pink' [exact]
   ‚úÖ True: 'street' | Pred: 'street' | Raw: 'street' [exact]
   ‚ùå True: 'minnie riperton' | Pred: 'minette' | Raw: 'minette reperion' 
   ‚ùå True: 'andes mints' | Pred: 'cd' | Raw: 'cd case' 

Evaluating: image_test_vqa_rad (200 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


image_test_vqa_rad: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 34/34 [00:24<00:00,  1.37it/s]



üìä image_test_vqa_rad: 166/200 = 83.00%
   Match types: {'exact': 166, 'no_match': 34}
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚úÖ True: 'false' | Pred: 'false' | Raw: 'no' [exact]
   ‚ùå True: 'true' | Pred: 'false' | Raw: 'no' 
   ‚úÖ True: 'true' | Pred: 'true' | Raw: 'yes' [exact]
   ‚ùå True: 'false' | Pred: 'true' | Raw: 'yes' 

Evaluating: image_test_caltech101 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_caltech101: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:09<00:00,  1.21it/s]



üìä image_test_caltech101: 481/500 = 96.20%
   Match types: {'exact': 473, 'contains': 8, 'no_match': 19}
   ‚úÖ True: 'trilobite' | Pred: 'trilobite' | Raw: 'trilobite' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]
   ‚úÖ True: 'emu' | Pred: 'emu' | Raw: 'emu' [exact]
   ‚úÖ True: 'wild cat' | Pred: 'wild cat' | Raw: 'wild cat' [exact]
   ‚úÖ True: 'inline skate' | Pred: 'inline skate' | Raw: 'inline skate' [exact]

Evaluating: image_test_eurosat (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_eurosat: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:04<00:00,  1.31it/s]



üìä image_test_eurosat: 479/500 = 95.80%
   Match types: {'exact': 479, 'no_match': 21}
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]
   ‚úÖ True: 'forest' | Pred: 'forest' | Raw: 'Forest' [exact]

Evaluating: image_test_flowers102 (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_flowers102: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:22<00:00,  1.02it/s]



üìä image_test_flowers102: 477/500 = 95.40%
   Match types: {'exact': 477, 'no_match': 23}
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'passion flower' | Pred: 'passion flower' | Raw: 'passion flower' [exact]
   ‚úÖ True: 'petunia' | Pred: 'petunia' | Raw: 'petunia' [exact]

Evaluating: image_test_pets (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_pets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [01:21<00:00,  1.03it/s]



üìä image_test_pets: 483/500 = 96.60%
   Match types: {'exact': 481, 'no_match': 17, 'contains': 2}
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]
   ‚úÖ True: 'leonberger' | Pred: 'leonberger' | Raw: 'leonberger' [exact]

Evaluating: image_test_svhn (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_svhn: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:54<00:00,  1.55it/s]



üìä image_test_svhn: 475/500 = 95.00%
   Match types: {'exact': 475, 'no_match': 25}
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]
   ‚úÖ True: '1' | Pred: '1' | Raw: '1' [exact]

Evaluating: image_test_camelyon (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


image_test_camelyon: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:57<00:00,  1.47it/s]



üìä image_test_camelyon: 438/500 = 87.60%
   Match types: {'exact': 438, 'no_match': 62}
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚úÖ True: 'tumor' | Pred: 'tumor' | Raw: 'tumor' [exact]
   ‚ùå True: 'tumor' | Pred: 'normal' | Raw: 'normal' 

Evaluating: text_test_arc_challenge (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_challenge: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:11<00:00,  7.05it/s]



üìä text_test_arc_challenge: 443/500 = 88.60%
   Match types: {'exact': 443, 'no_match': 57}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'b' | Raw: 'B' 

Evaluating: text_test_arc_easy (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_arc_easy: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:11<00:00,  7.25it/s]



üìä text_test_arc_easy: 471/500 = 94.20%
   Match types: {'exact': 471, 'no_match': 29}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_boolq (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_boolq: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:22<00:00,  7.41it/s]



üìä text_test_boolq: 721/1000 = 72.10%
   Match types: {'no_match': 279, 'exact': 721}
   ‚ùå True: 'b' | Pred: 'a' | Raw: 'A' 
   ‚ùå True: 'a' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'a' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_hellaswag (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_hellaswag: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:23<00:00,  7.25it/s]



üìä text_test_hellaswag: 881/1000 = 88.10%
   Match types: {'no_match': 119, 'exact': 881}
   ‚ùå True: 'd' | Pred: 'b' | Raw: 'B' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'c' | Pred: 'a' | Raw: 'A' 
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: text_test_openbookqa (500 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_openbookqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 84/84 [00:11<00:00,  7.28it/s]



üìä text_test_openbookqa: 448/500 = 89.60%
   Match types: {'exact': 448, 'no_match': 52}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: text_test_piqa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_piqa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:22<00:00,  7.28it/s]



üìä text_test_piqa: 874/1000 = 87.40%
   Match types: {'exact': 874, 'no_match': 126}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: text_test_social_i_qa (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_social_i_qa: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:22<00:00,  7.39it/s]



üìä text_test_social_i_qa: 807/1000 = 80.70%
   Match types: {'exact': 807, 'no_match': 193}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'b' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: text_test_winogrande (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


text_test_winogrande: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:22<00:00,  7.42it/s]



üìä text_test_winogrande: 821/1000 = 82.10%
   Match types: {'exact': 821, 'no_match': 179}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'a' | Pred: 'b' | Raw: 'B' 

Evaluating: glue_test_sst2 (872 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_sst2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 146/146 [00:19<00:00,  7.34it/s]



üìä glue_test_sst2: 827/872 = 94.84%
   Match types: {'exact': 827, 'no_match': 45}
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'positive' | Pred: 'positive' | Raw: 'positive' [exact]
   ‚úÖ True: 'negative' | Pred: 'negative' | Raw: 'negative' [exact]

Evaluating: glue_test_qnli (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qnli: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:55<00:00,  3.01it/s]



üìä glue_test_qnli: 946/1000 = 94.60%
   Match types: {'exact': 946, 'no_match': 54}
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚úÖ True: 'not_entailment' | Pred: 'not_entailment' | Raw: 'not_entailment' [exact]
   ‚ùå True: 'not_entailment' | Pred: 'entailment' | Raw: 'entailment' 
   ‚úÖ True: 'entailment' | Pred: 'entailment' | Raw: 'entailment' [exact]
   ‚úÖ True: 'not_entailment' | Pred: 'not_entailment' | Raw: 'not_entailment' [exact]

Evaluating: glue_test_qqp (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_qqp: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:33<00:00,  4.98it/s]



üìä glue_test_qqp: 861/1000 = 86.10%
   Match types: {'exact': 861, 'no_match': 139}
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'duplicate' | Pred: 'duplicate' | Raw: 'duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]
   ‚úÖ True: 'not_duplicate' | Pred: 'not_duplicate' | Raw: 'not_duplicate' [exact]

Evaluating: glue_test_cola (1000 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_cola: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 167/167 [00:22<00:00,  7.37it/s]



üìä glue_test_cola: 691/1000 = 69.10%
   Match types: {'exact': 691, 'no_match': 309}
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚úÖ True: 'acceptable' | Pred: 'acceptable' | Raw: 'acceptable' [exact]
   ‚ùå True: 'unacceptable' | Pred: 'acceptable' | Raw: 'acceptable' 

Evaluating: glue_test_mrpc (408 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


glue_test_mrpc: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 68/68 [00:17<00:00,  3.86it/s]



üìä glue_test_mrpc: 359/408 = 87.99%
   Match types: {'exact': 359, 'no_match': 49}
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]
   ‚úÖ True: 'equivalent' | Pred: 'equivalent' | Raw: 'equivalent' [exact]
   ‚úÖ True: 'not_equivalent' | Pred: 'not_equivalent' | Raw: 'not_equivalent' [exact]

Evaluating: glue_test_stsb (1000 samples, batch_size=6)
Task type: regression, Numeric tolerance: 0.5


glue_test_stsb: 100%|██████████| 167/167 [03:11<00:00,  1.15s/it]



📊 glue_test_stsb:
   Pearson:  0.9196
   Spearman: 0.9213
   Close (±0.5): 610/1000 = 61.00%
   ✅ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ❌ True: 4.75 | Pred: 4.00 | Raw: '4.0'
   ✅ True: 5.00 | Pred: 5.00 | Raw: '5.0'
   ✅ True: 2.40 | Pred: 2.00 | Raw: '2.0'
   ❌ True: 2.75 | Pred: 2.20 | Raw: '2.200000047683716'

Evaluating: video_test_action_sequence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_sequence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:57<00:00,  2.36s/it]



üìä video_test_action_sequence: 107/300 = 35.67%
   Match types: {'exact': 107, 'no_match': 193}
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'e' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'e' | Pred: 'c' | Raw: 'C' 
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]

Evaluating: video_test_action_prediction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_prediction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:59<00:00,  2.39s/it]



üìä video_test_action_prediction: 103/300 = 34.33%
   Match types: {'no_match': 197, 'exact': 103}
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: video_test_action_antonym (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_antonym: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:04<00:00,  1.28s/it]



üìä video_test_action_antonym: 212/300 = 70.67%
   Match types: {'no_match': 88, 'exact': 212}
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

Evaluating: video_test_fine_grained_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_fine_grained_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:19<00:00,  1.59s/it]



üìä video_test_fine_grained_action: 217/300 = 72.33%
   Match types: {'exact': 217, 'no_match': 83}
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚ùå True: 'c' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'c' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_unexpected_action (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_unexpected_action: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [07:41<00:00,  9.22s/it]



üìä video_test_unexpected_action: 145/300 = 48.33%
   Match types: {'no_match': 155, 'exact': 145}
   ‚ùå True: 'b' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]

Evaluating: video_test_object_existence (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_existence: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:04<00:00,  1.29s/it]



üìä video_test_object_existence: 268/300 = 89.33%
   Match types: {'no_match': 32, 'exact': 268}
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_object_interaction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_object_interaction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:44<00:00,  2.09s/it]



üìä video_test_object_interaction: 91/300 = 30.33%
   Match types: {'no_match': 209, 'exact': 91}
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'b' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'f' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_object_shuffle (300 samples, batch_size=6)
Task type: multiple_choice, Numeric tolerance: 0.5


video_test_object_shuffle: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:32<00:00, 12.65s/it]



üìä video_test_object_shuffle: 99/300 = 33.00%
   Match types: {'no_match': 201, 'exact': 99}
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_moving_direction (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_direction: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:05<00:00,  1.30s/it]



üìä video_test_moving_direction: 266/300 = 88.67%
   Match types: {'exact': 266, 'no_match': 34}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: video_test_action_localization (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_localization: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [04:06<00:00,  4.94s/it]



üìä video_test_action_localization: 104/300 = 34.67%
   Match types: {'exact': 104, 'no_match': 196}
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'd' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'e' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_scene_transition (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_scene_transition: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:35<00:00,  1.91s/it]



üìä video_test_scene_transition: 75/300 = 25.00%
   Match types: {'no_match': 225, 'exact': 75}
   ‚ùå True: 'a' | Pred: 'c' | Raw: 'C' 
   ‚ùå True: 'f' | Pred: 'a' | Raw: 'A' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'a' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_action_count (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_action_count: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [09:34<00:00, 11.48s/it]



üìä video_test_action_count: 101/300 = 33.67%
   Match types: {'exact': 101, 'no_match': 199}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]

Evaluating: video_test_moving_count (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_count: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:04<00:00,  1.29s/it]



üìä video_test_moving_count: 261/300 = 87.00%
   Match types: {'exact': 261, 'no_match': 39}
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

Evaluating: video_test_moving_attribute (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_moving_attribute: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:03<00:00,  1.28s/it]



üìä video_test_moving_attribute: 264/300 = 88.00%
   Match types: {'exact': 264, 'no_match': 36}
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]

Evaluating: video_test_state_change (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_state_change: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:19<00:00, 12.39s/it]



üìä video_test_state_change: 108/300 = 36.00%
   Match types: {'exact': 108, 'no_match': 192}
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'c' | Pred: 'c' | Raw: 'C' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'c' | Pred: 'd' | Raw: 'D' 
   ‚ùå True: 'f' | Pred: 'd' | Raw: 'D' 

Evaluating: video_test_character_order (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_character_order: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [10:23<00:00, 12.48s/it]



üìä video_test_character_order: 96/300 = 32.00%
   Match types: {'no_match': 204, 'exact': 96}
   ‚ùå True: 'b' | Pred: 'e' | Raw: 'E' 
   ‚ùå True: 'b' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'c' | Pred: 'a' | Raw: 'A' 

Evaluating: video_test_egocentric_navigation (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_egocentric_navigation: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:57<00:00,  1.16s/it]



üìä video_test_egocentric_navigation: 187/300 = 62.33%
   Match types: {'no_match': 113, 'exact': 187}
   ‚ùå True: 'e' | Pred: 'd' | Raw: 'D' 
   ‚úÖ True: 'e' | Pred: 'e' | Raw: 'E' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]
   ‚úÖ True: 'b' | Pred: 'b' | Raw: 'B' [exact]

Evaluating: video_test_episodic_reasoning (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_episodic_reasoning: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:41<00:00,  2.04s/it]



üìä video_test_episodic_reasoning: 93/300 = 31.00%
   Match types: {'no_match': 207, 'exact': 93}
   ‚ùå True: 'c' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚úÖ True: 'a' | Pred: 'a' | Raw: 'A' [exact]
   ‚ùå True: 'f' | Pred: 'b' | Raw: 'B' 
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 

Evaluating: video_test_counterfactual_inference (300 samples, batch_size=6)
Task type: classification, Numeric tolerance: 0.5


video_test_counterfactual_inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:03<00:00,  1.27s/it]


üìä video_test_counterfactual_inference: 261/300 = 87.00%
   Match types: {'exact': 261, 'no_match': 39}
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚úÖ True: 'd' | Pred: 'd' | Raw: 'D' [exact]
   ‚ùå True: 'd' | Pred: 'e' | Raw: 'E' 
   ‚úÖ True: 'f' | Pred: 'f' | Raw: 'F' [exact]

SUMMARY

üìÅ GLUE_TEST:
   Classification: 3684/4280 = 86.07%
     - glue_test_sst2: 94.84%
     - glue_test_qnli: 94.60%
     - glue_test_qqp: 86.10%
     - glue_test_cola: 69.10%
     - glue_test_mrpc: 87.99%
   Regression: Pearson=0.9196, Spearman=0.9213
     - glue_test_stsb: Pearson=0.9196, Spearman=0.9213

üìÅ IMAGE_TEST:
   Classification: 7019/8476 = 82.81%
     - image_test_chartqa: 75.00%
     - image_test_okvqa: 58.15%
     - image_test_scienceqa: 98.26%
     - image_test_seed_bench: 76.40%
     - image_test_text_recognition: 88.20%
     - image_test_textvqa: 72.10%
     - image_test_vizwiz_vqa: 68.82%
     - image_test_vqa_rad: 83.00%
  


