In [3]:
import json
import multiprocessing
import os

import av
from datasets import load_dataset
import numpy as np
from transformers import AutoTokenizer, AutoImageProcessor

Load Captions

In [None]:
# Train and validation splits are read from local JSON files; the "chCap"
# column is removed from both right after loading.
dataset = load_dataset(
    "json",
    data_files={
        "train": "../dataset/vatex_train.json",
        "validation": "../dataset/vatex_val.json",
    },
    num_proc=os.cpu_count(),
).remove_columns("chCap")

# The test split is loaded in a separate call (without the column removal)
# and stored under the "test" key next to the other two splits.
dataset["test"] = load_dataset(
    "json",
    data_files={"test": "../dataset/vatex_test.json"},
    num_proc=os.cpu_count(),
)["test"]

# Last expression of the cell: display the resulting splits.
dataset

Save a {"videoID": [captions]} Mapping as JSON

In [None]:
def make_jsons(example, videoID_captions=None):
    """Record one example's English captions under its video ID.

    Args:
        example: Mapping that provides the "videoID" and "enCap" entries.
        videoID_captions: Mutable mapping that receives the
            ``videoID -> captions`` entry. The caller must supply it; the
            ``None`` default cannot be written to.

    Returns:
        The ``example`` object that was passed in, unmodified.
    """
    video_id = example["videoID"]
    caption_list = example["enCap"]
    videoID_captions[video_id] = caption_list
    return example

# A Manager-backed dict is handed to every map() call through fn_kwargs so
# that make_jsons can write into one shared mapping while map() runs with
# num_proc worker processes.
manager = multiprocessing.Manager()
videoID_captions = manager.dict()
dataset.map(
    make_jsons,
    fn_kwargs={"videoID_captions": videoID_captions},
    num_proc=os.cpu_count(),
)

# Copy the managed dict and write that copy out as a single JSON document.
with open("../dataset/videoID_captions.json", "w") as file:
    captions_json = json.dumps(videoID_captions.copy())
    file.write(captions_json)

Sample Video Frames, Tokenize Captions, and Save as HF Dataset

In [5]:
# Caption tokenizer. The EOS token doubles as the padding token
# (presumably because the "gpt2" tokenizer defines no pad token of its own;
# process() below pads with padding="max_length").
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Frame preprocessor used by process() to build pixel_values.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")

# create dataset with features: videoID, pixel_values (8, 3, 224, 224), labels (10, 1024)
# NOTE(review): the shapes above come from the original comment; they assume
# 8 sampled frames, 10 captions per video and a 1024-token max length -- confirm.
def process(example, videos_path="../dataset/videos", num_frames=8):
    """Turn one caption example into sampled video frames plus token ids.

    The video is looked up as ``<videos_path>/<videoID>.mp4`` and, if that
    file does not exist, as ``<videos_path>/<videoID>.webm``. ``num_frames``
    evenly spaced frames are decoded, passed through the module-level
    ``image_processor``, and the captions are tokenized with the module-level
    ``tokenizer``.

    Args:
        example: Mapping with a "videoID" entry and an "enCap" entry
            (the captions handed to the tokenizer).
        videos_path: Directory that holds the video files.
        num_frames: Number of frames to sample from the video.

    Returns:
        dict with keys "videoID", "pixel_values" (first element of the image
        processor's ``pixel_values``) and "labels" (the tokenizer's
        ``input_ids`` for the captions, padded with ``padding="max_length"``).

    Raises:
        ValueError: If the video stream yields no decodable frames.
    """
    videoID, captions = example["videoID"], example["enCap"]

    video_path = os.path.join(videos_path, "%s.mp4" % videoID)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % videoID)

    # The context manager closes the container even when decoding fails, so
    # file handles are not leaked across the many examples map() processes.
    with av.open(video_path) as container:
        # The frame count in the codec metadata is unreliable, so count the
        # frames manually by decoding the whole stream once.
        container.seek(0)
        frame_count = sum(1 for _ in container.decode(video=0))
        if frame_count == 0:
            raise ValueError("no decodable video frames in %s" % video_path)

        # Evenly spaced frame positions. For videos shorter than num_frames
        # the truncated positions repeat; the ordered list is kept so those
        # frames are duplicated below instead of silently dropped.
        indices = np.linspace(
            0, frame_count, num=num_frames, endpoint=False
        ).astype(np.int64).tolist()
        wanted = set(indices)

        sampled = {}
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i in wanted:
                sampled[i] = frame.to_ndarray(format="rgb24")
                # Nothing after the last wanted frame is needed.
                if len(sampled) == len(wanted):
                    break

    # Rebuild the frame list in index order. An index missing from `sampled`
    # (second pass yielded fewer frames than the first) is skipped rather
    # than raising, matching the previous best-effort behaviour.
    frames = [sampled[i] for i in indices if i in sampled]

    pixel_values = image_processor(frames).pixel_values[0]
    labels = tokenizer(captions, padding="max_length").input_ids
    return {"videoID": videoID, "pixel_values": pixel_values, "labels": labels}

# Run process() over every example with one worker per CPU and drop the
# "enCap" column from the result.
dataset = dataset.map(
    process,
    remove_columns=["enCap"],
    num_proc=os.cpu_count(),
)

# Write the processed dataset to disk, again with one worker per CPU.
dataset.save_to_disk(
    "../dataset/processed/vatex",
    num_proc=os.cpu_count(),
)