In [2]:
import json
import multiprocessing
import os

import av
from datasets import load_dataset, load_from_disk
import numpy as np
from transformers import AutoTokenizer, AutoImageProcessor

In [5]:
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [11]:
dataset = load_dataset("json", data_files={"train": "dataset/captions/vatex_train_captions.json", "validation": "dataset/captions/vatex_val_captions.json"}, num_proc=os.cpu_count()).remove_columns("chCap")
dataset["test"] = load_dataset("json", data_files={"test": "dataset/captions/vatex_test_captions.json"}, num_proc=os.cpu_count())["test"]
dataset

Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-f91978fa8d753f79/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-1f02c66f53760342/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['videoID', 'enCap'],
        num_rows: 22895
    })
    validation: Dataset({
        features: ['videoID', 'enCap'],
        num_rows: 2643
    })
    test: Dataset({
        features: ['videoID', 'enCap'],
        num_rows: 5297
    })
})

In [12]:
# save to file videoID_captions = {"videoID": [captions]}
def make_jsons(example, videoID_captions=None):
    videoID, captions = example["videoID"], example["enCap"]
    videoID_captions[videoID] = captions
    return example

manager = multiprocessing.Manager()
videoID_captions = manager.dict()
dataset.map(make_jsons, fn_kwargs={"videoID_captions": videoID_captions}, num_proc=os.cpu_count())

with open("dataset/videoID_captions.json", "w") as file:
    file.write(json.dumps(videoID_captions.copy()))

Map (num_proc=64):   0%|          | 0/22895 [00:00<?, ? examples/s]

Map (num_proc=64):   0%|          | 0/2643 [00:00<?, ? examples/s]

Map (num_proc=64):   0%|          | 0/5297 [00:00<?, ? examples/s]

In [3]:
# create dataset with features: videoID, pixel_values (8, 3, 224, 224), labels (10, 1024)
def process(example):
    videoID, captions = example["videoID"], example["enCap"]
    
    videos_path = "dataset/videos"
    video_path = os.path.join(videos_path, "%s.mp4" % videoID)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % videoID)
    container = av.open(video_path)
    
    # discrepancy between in codec metadata, manually get frame count
    container.seek(0)
    frame_count = 0
    for frame in container.decode(video=0):
        frame_count += 1
    
    indices = set(np.linspace(0, frame_count, num=8, endpoint=False).astype(np.int64))
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in indices:
            frames.append(frame.to_ndarray(format="rgb24"))   
            
    pixel_values = image_processor(frames).pixel_values[0]
    labels = tokenizer(captions, padding="max_length").input_ids
    return {"videoID": videoID, "pixel_values": pixel_values, "labels": labels}

In [None]:
dataset = dataset.map(process, remove_columns=["enCap"], num_proc=os.cpu_count())
dataset.save_to_disk("/data1/caelen/dataset/vatex", num_proc=os.cpu_count())