In [1]:
import json
import os
import random

import decord
from decord import VideoReader, gpu
import numpy as np
import torch
from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoFeatureExtractor, AutoTokenizer

decord.bridge.set_bridge('torch')

In [2]:
class VatexDataset(torch.utils.data.Dataset):
    def __init__(self, videos_path, json_path, num_frames, transforms=None):
        self.video_names = []
        for _, _, files in os.walk(videos_path):
            for file in files:
                self.video_names.append(file)
        
        self.video_captions = {}
        with open(json_path) as json_file:
            json_data = json.load(json_file)            
            for data in json_data:
                video_id = data["videoID"]
                captions = data["enCap"]
                self.video_captions[video_id] = captions
                
        self.videos_path = videos_path
        self.num_frames = num_frames
        self.transforms = transforms
        

    def __len__(self):
        return len(self.video_names) * 10
    
    
    def __getitem__(self, idx):        
        video_idx = idx % len(self.video_names)
        caption_idx = idx // len(self.video_names)
        
        video_name = self.video_names[video_idx]
        video_id = video_name.split(".")[0]
        captions = self.video_captions[video_id]
        caption = captions[caption_idx]
        
        video_path = os.path.join(self.videos_path, video_name)
        videoreader = VideoReader(video_path, ctx=gpu(0))
        videoreader.seek(0)
        
        indices = np.sort(np.random.uniform(low=0, high=len(videoreader), size=self.num_frames).astype(np.int64))
        #indices = np.linspace(0, len(videoreader) - 1, num=self.num_frames).astype(np.int64)
        frames = videoreader.get_batch(indices)

        return (frames, caption)

In [3]:
num_frames = 16
data_train = VatexDataset("dataset/vatex_train_videos", "dataset/vatex_train_captions.json", num_frames)
data_val = VatexDataset("dataset/vatex_val_videos", "dataset/vatex_val_captions.json", num_frames)
print(len(data_train))
print(len(data_val))

229780
26500


In [4]:
# TimeSformer
# encoder = "facebook/timesformer-base-finetuned-k400"
# decoder = "gpt2"

# image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
# tokenizer = AutoTokenizer.from_pretrained(decoder)
# model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder)

# model.num_parameters()

# VideoMAE
encoder = "MCG-NJU/videomae-base"
decoder = "gpt2"

image_processor = AutoImageProcessor.from_pretrained(encoder)
tokenizer = AutoTokenizer.from_pretrained(decoder)
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder, decoder)

model.num_parameters()

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEModel: ['decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.2.attention.output.dense.weight', 'decoder.decoder_layers.2.intermediate.dense.weight', 'decoder.decoder_layers.0.attention.attention.key.weight', 'decoder.norm.weight', 'decoder.decoder_layers.3.attention.output.dense.weight', 'decoder.decoder_layers.2.attention.attention.v_bias', 'decoder.decoder_layers.3.intermediate.dense.weight', 'decoder.decoder_layers.0.layernorm_before.weight', 'decoder.head.bias', 'decoder.decoder_layers.2.layernorm_before.bias', 'decoder.decoder_layers.2.layernorm_after.weight', 'decoder.decoder_layers.1.attention.output.dense.weight', 'decoder.decoder_layers.3.attention.attention.key.weight', 'decoder.decod

239033856

In [12]:
frames, caption = data_train[2]
print(caption)

pixel_values = image_processor(list(frames), return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


A person is outside on a street doing skateboarding tricks next to a shore.

The following is a list of all the best players in the NBA.

The following
