In [2]:
from torcheval.metrics.text import BLEUScore, Perplexity, WordErrorRate, WordInformationLost, WordInformationPreserved
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset
import os

2025-03-22 14:31:55.555239: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
data_dir = '/data2/juve/dataset/youdescribe/npz_datasets/YD3_8_frames/'
test_data_dir = os.path.join(data_dir, 'test')
device = "cuda"

pretrained_model = '/home/922201615/caelen/training/vatex/checkpoint_20/'
model = VisionEncoderDecoderModel.from_pretrained(pretrained_model).to(device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Config of the encoder: <class 'transformers.models.timesformer.modeling_timesformer.TimesformerModel'> is overwritten by shared encoder config: TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-k600",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "abseiling",
    "1": "acting in play",
    "2": "adjusting glasses",
    "3": "air drumming",
    "4": "alligator wrestling",
    "5": "answering questions",
    "6": "applauding",
    "7": "applying cream",
    "8": "archaeological excavation",
    "9": "archery",
    "10": "arguing",
    "11": "arm wrestling",
    "12": "arranging flowers",
    "13": "assembling bicycle",
    "14": "assembling computer",
    "15": "attending conference",
    "16": "auctioning",
    "17": "backflip (human)",
   

In [None]:
class NPZDataset(Dataset):
    def __init__(self, data_dir, num_captions):
        self.data_dir = data_dir
        self.file_names = os.listdir(data_dir)
        self.total_captions = len(self.file_names) * num_captions
        self.num_caption = num_captions

    def __len__(self):
        return self.total_captions

    def __getitem__(self, idx):
        filename_index = idx // self.num_caption
        labels_offset = idx % self.num_caption  
    
        file_path = os.path.join(self.data_dir, self.file_names[filename_index])
        data = np.load(file_path)

        # Each .npz file contains 'arr_o' and 'arr_1', images and captions
        sample = {'filenames': self.file_names[filename_index],
                  'pixel_values': torch.from_numpy(data['arr_0']), 
                  'labels': torch.from_numpy(data['arr_1'][labels_offset])}
        return sample

In [36]:
batch_size = 4

In [41]:
test_dataset_full = NPZDataset(test_data_dir, 11)
test_dataloader_full = DataLoader(test_dataset_full, batch_size=batch_size, shuffle=True)
test_subset_indices = range(0, int(len(test_dataloader_full) * 0.01))
test_subset = Subset(test_dataset_full, test_subset_indices)
test_dataloader = DataLoader(test_subset, batch_size=batch_size)

In [42]:
len(test_dataloader)

92

In [43]:
model.eval()

VisionEncoderDecoderModel(
  (encoder): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense): Linear(in

In [44]:
total_test_loss = 0
predicted_captions = []
predicted_tokens = []
ground_truth_captions = []
ground_truth_tokens = []
all_filenames = []

bleu1_metric = BLEUScore(n_gram=1)
bleu2_metric = BLEUScore(n_gram=2)
bleu3_metric = BLEUScore(n_gram=3)
bleu4_metric = BLEUScore(n_gram=4)
perplexity_metric = Perplexity().to(device)
word_error_rate_metric = WordErrorRate()
word_info_lost_metric = WordInformationLost()
word_info_preserved_metric = WordInformationPreserved()
cider_metric = Cider()
meteor_metric = Meteor()
rouge_metric = Rouge()
spice_metric = Spice()

gen_kwargs = {
    "min_length": 50,
    "max_length": 500,
    "num_beams": 4,
    "no_repeat_ngram_size": 3,
}

for batch in reversed(list(test_dataloader)):
    # batch = {k: v.to(device) for k, v in batch.items()}
    inputs = {}
    for idx, values in batch.items():
        if idx in ['pixel_values', 'labels']:
            inputs[idx] = values.to(device)

    with torch.autocast(device_type='cuda', dtype=torch.float16):        
        with torch.no_grad():
            outputs = model(**inputs)
        loss = outputs.loss
        total_test_loss += loss.item()
        
        perplexity_metric.update(outputs.logits, inputs['labels'])
        tokens = model.generate(**inputs, **gen_kwargs)
        predicted_tokens.extend(tokens)
        
        decoded_predicted_caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)
        predicted_captions.extend(decoded_predicted_caption)
        
        ground_truth_caption = inputs['labels'].squeeze()
        ground_truth_tokens.extend(ground_truth_caption)
        
        decoded_ground_truth_caption = tokenizer.batch_decode(ground_truth_caption, skip_special_tokens=True)
        ground_truth_captions.extend(decoded_ground_truth_caption)
        if inputs['labels'].shape[0] < batch_size:
            print(inputs["labels"].shape)
        print(f"filenames: {batch['filenames']}")
        print(f"inputs['labels'].shape: {inputs['labels'].shape}")
        print(f"(inputs['labels'].squeeze()).shape: {(inputs['labels'].squeeze()).shape}")
        print(f"decoded_ground_truth_caption: {decoded_ground_truth_caption}")
        all_filenames.extend(batch['filenames'])

print("DEBUG ground_truth_captions:", ground_truth_captions)
print("DEBUG predicted_captions:", predicted_captions)
metrics_dict = {}       
metrics_dict["avg_test_loss"] = total_test_loss / len(test_dataloader)

ground_truth_captions_flattened = [[x] for x in ground_truth_captions]
predicted_captions_flattened = [[x] for x in predicted_captions]
ground_truth_captions_dict = dict(zip(all_filenames, ground_truth_captions_flattened))
predicted_captions_dict = dict((zip(all_filenames, predicted_captions_flattened)))

metrics_dict["blue1_score"] = bleu1_metric.update(predicted_captions_flattened, ground_truth_captions_flattened).compute().item()
metrics_dict["blue2_score"] = bleu2_metric.update(predicted_captions_flattened, ground_truth_captions_flattened).compute().item()
metrics_dict["blue3_score"] = bleu3_metric.update(predicted_captions_flattened, ground_truth_captions_flattened).compute().item()
metrics_dict["blue4_score"] = bleu4_metric.update(predicted_captions_flattened, ground_truth_captions_flattened).compute().item()
metrics_dict["perplexity_score"] = perplexity_metric.compute().item()
metrics_dict["word_error_rate_score"] = word_error_rate_metric.update(predicted_captions, ground_truth_captions).compute().item()
metrics_dict["word_info_lost_score"] = word_info_lost_metric.update(predicted_captions, ground_truth_captions).compute().item()
metrics_dict["word_info_preserved_score"] = word_info_preserved_metric.update(predicted_captions, ground_truth_captions).compute().item()

metrics_dict["cider_score"], _ = Cider().compute_score(ground_truth_captions_dict, predicted_captions_dict)
metrics_dict["meteor_score"], _ = Meteor().compute_score(ground_truth_captions_dict, predicted_captions_dict)
metrics_dict["rouge_score"], _ = Rouge().compute_score(ground_truth_captions_dict, predicted_captions_dict)
metrics_dict["spice_score"], metrics_dict['spice_scores'] = Spice().compute_score(ground_truth_captions_dict, predicted_captions_dict)

print(metrics_dict)

torch.Size([1, 1024])
filenames: ['aJoaGdfERo4_9809_19809_67002da3ef7ae40028c6616d.npz']
inputs['labels'].shape: torch.Size([1, 1024])
(inputs['labels'].squeeze()).shape: torch.Size([1024])
decoded_ground_truth_caption: ['A', ' pair', ' of', ' hands', ' arr', 'anges', ' several', ' small', ' pink', ' heart', '-', 'shaped', ' objects', ' on', ' a', ' white', ' surface', ',', ' then', ' picks', ' up', ' and', ' examines', ' a', ' rectangle', ' of', ' red', ' paper', ',', ' considering', ' placement', '.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',

KeyboardInterrupt: 

In [34]:
ground_truth_captions

['Ethnic map of the Middle East showing geographical location of Arab, Turk, Persian, Kurd, and Azeri.',
 'A man speaks energetically about the ethnic composition of the Middle East. The video intersperses shots of him with a map displaying the geographical distribution of Arabs, Turks, Persians, Kurds, and Azeris.',
 'The video alternates between a speaker explaining and a map showing the ethnic territories of Arabs, Turks, Persians, Kurds, and Azeris in the Middle East.',
 'An animated presenter passionately discusses the distribution of different ethnic groups in the Middle East, illustrated by a map with color-coded regions for Arabs, Turks, Persians, Kurds, and Azeris.',
 'at TP Pratt.',
 'A man stands in front of a brick wall, reading a tweet aloud. The text on the screen humorously suggests sending lifehacks to help him be a good president. He appears amused and then thoughtful.',
 "A man in a suit is standing against a brick wall, reading a tweet. The tweet humorously suggests 

In [35]:
ground_truth_captions_flattened

[['Ethnic map of the Middle East showing geographical location of Arab, Turk, Persian, Kurd, and Azeri.'],
 ['A man speaks energetically about the ethnic composition of the Middle East. The video intersperses shots of him with a map displaying the geographical distribution of Arabs, Turks, Persians, Kurds, and Azeris.'],
 ['The video alternates between a speaker explaining and a map showing the ethnic territories of Arabs, Turks, Persians, Kurds, and Azeris in the Middle East.'],
 ['An animated presenter passionately discusses the distribution of different ethnic groups in the Middle East, illustrated by a map with color-coded regions for Arabs, Turks, Persians, Kurds, and Azeris.'],
 ['at TP Pratt.'],
 ['A man stands in front of a brick wall, reading a tweet aloud. The text on the screen humorously suggests sending lifehacks to help him be a good president. He appears amused and then thoughtful.'],
 ["A man in a suit is standing against a brick wall, reading a tweet. The tweet humorou

In [15]:
predicted_captions_flattened

[['A person is peeling an apple with a knife and using a potato peeler to peel the skin off of the apple with the peeler on the other side of the knife. The peeler is then used to peel a piece of apple.'],
 ['A person is peeling an apple with a knife and using a potato peeler to peel the skin off of the apple with the peeler on the other side of the knife. The peeler is then used to peel a piece of apple.'],
 ['A person is peeling an apple with a knife and using a potato peeler to peel the skin off of the apple with the peeler on the other side of the knife. The peeler is then used to peel a piece of apple.'],
 ['A person is peeling an apple with a knife and using a potato peeler to peel the skin off of the apple with the peeler on the other side of the knife. The peeler is then used to peel a piece of apple.'],
 ['A person is peeling an apple with a knife and using a potato peeler to peel the skin off of the apple with the peeler on the other side of the knife. The peeler is then used

In [None]:
metrics_dict

{'avg_test_loss': 0.11938322252697414,
 'blue1_score': 0.15477560414269276,
 'blue2_score': 0.06467707813660875,
 'blue3_score': 0.024697839526043132,
 'blue4_score': 0.009804994585602172,
 'perplexity_score': 1.1268012045165328,
 'word_error_rate_score': 1.685441017150879,
 'word_info_lost_score': 0.9858730704181948,
 'word_info_preserved_score': 0.014126929581805218,
 'cider_score': 0.015878108193094063,
 'meteor_score': 0.1114150270566105,
 'rouge_score': 0.1664737325360634,
 'spice_score': 0.10433947772657451,
 'spice_scores': [{'All': {'pr': 0.21428571428571427,
    're': 0.17647058823529413,
    'f': 0.1935483870967742,
    'fn': 14.0,
    'numImages': 1.0,
    'fp': 11.0,
    'tp': 3.0},
   'Relation': {'pr': 0.0,
    're': 0.0,
    'f': 0.0,
    'fn': 7.0,
    'numImages': 1.0,
    'fp': 6.0,
    'tp': 0.0},
   'Cardinality': {'pr': nan,
    're': nan,
    'f': nan,
    'fn': 0.0,
    'numImages': 1.0,
    'fp': 0.0,
    'tp': 0.0},
   'Attribute': {'pr': 0.0,
    're': 0.0,
  

In [None]:
print(outputs.logits)

tensor([[[-3.1500e+01, -2.6766e+01, -3.6719e+01,  ..., -4.1969e+01,
          -4.5188e+01, -2.7594e+01],
         [-1.1594e+01, -8.3750e+00, -1.5023e+01,  ..., -1.2242e+01,
          -1.5031e+01, -6.5195e+00],
         [-2.5957e+00, -3.3086e+00, -1.0531e+01,  ..., -1.2938e+01,
          -1.2234e+01, -3.4746e+00],
         ...,
         [-5.4180e+00, -1.1398e+01, -1.1742e+01,  ..., -2.2344e+01,
          -1.6641e+01,  2.3047e+01],
         [-5.4375e+00, -1.1445e+01, -1.1711e+01,  ..., -2.2328e+01,
          -1.6625e+01,  2.3094e+01],
         [-7.8398e+00, -1.4406e+01, -1.4555e+01,  ..., -2.5406e+01,
          -2.0078e+01,  2.0453e+01]],

        [[-3.0406e+01, -2.5062e+01, -3.5250e+01,  ..., -4.1188e+01,
          -4.3875e+01, -2.6438e+01],
         [-1.4375e+01, -1.1297e+01, -1.8062e+01,  ..., -2.2750e+01,
          -1.9469e+01, -1.0273e+01],
         [ 4.9062e+00,  6.1953e+00, -2.9312e-02,  ..., -3.0176e-01,
           1.3159e-01,  7.8906e+00],
         ...,
         [-8.1938e+01, -8

In [None]:
F.log_softmax(outputs.logits, dim=-1)

tensor([[[-14.9844, -10.2500, -20.2031,  ..., -25.4531, -28.6719, -11.0781],
         [-18.0625, -14.8438, -21.4844,  ..., -18.7031, -21.5000, -12.9844],
         [-10.0469, -10.7578, -17.9844,  ..., -20.3906, -19.6875, -10.9219],
         ...,
         [-28.4688, -34.4375, -34.7812,  ..., -45.3750, -39.6875,   0.0000],
         [-28.5312, -34.5312, -34.8125,  ..., -45.4375, -39.7188,   0.0000],
         [-28.2969, -34.8750, -35.0000,  ..., -45.8750, -40.5312,   0.0000]],

        [[-16.3438, -11.0078, -21.1875,  ..., -27.1250, -29.8125, -12.3828],
         [-16.4062, -13.3203, -20.0938,  ..., -24.7812, -21.5000, -12.2969],
         [-11.6719, -10.3828, -16.6094,  ..., -16.8750, -16.4531,  -8.6875],
         ...,
         [-39.7812, -38.7188, -50.2188,  ..., -67.2500, -63.1562,   0.0000],
         [-39.8438, -38.7188, -50.2188,  ..., -67.2500, -63.1562,   0.0000],
         [-39.4375, -38.0000, -50.3125,  ..., -67.2500, -63.0625,   0.0000]],

        [[-16.3438, -11.0078, -21.1875,  ...

.sum(dim=-1): sum along last dimension (Vocabulary)

In [None]:
outputs.logits.shape

torch.Size([4, 1024, 50257])

In [None]:
outputs.logits.sum(dim=-1)

tensor([[  -inf,   -inf,   -inf,  ...,   -inf,   -inf,   -inf],
        [  -inf,   -inf, -5884.,  ...,   -inf,   -inf,   -inf],
        [  -inf,   -inf,   -inf,  ...,   -inf,   -inf,   -inf],
        [  -inf,   -inf,   -inf,  ...,   -inf,   -inf,   -inf]],
       device='cuda:0', dtype=torch.float16)

In [None]:
F.softmax(outputs.logits, dim=-1).sum(dim=-1)

tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [0.9995, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [0.9995, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [0.9995, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       device='cuda:0', dtype=torch.float16)