In [1]:
import av
import json
import os

from datasets import Dataset, DatasetDict, load_from_disk
import numpy as np
from transformers import AutoTokenizer, AutoImageProcessor

In [2]:
# Re-key the VATEX caption files by video: {"videoID": [captions, most words first]}.
# Captions are ordered by whitespace-separated word count, descending, so
# index 0 of every list is the caption with the most words.

data_path = "dataset"
for filename in ("vatex_train_captions", "vatex_val_captions"):
    source_path = os.path.join(data_path, "%s.json" % filename)
    target_path = os.path.join(data_path, "%s_videoID_keys.json" % filename)

    # JSON text is UTF-8; pass the encoding explicitly so reading does not
    # depend on the platform's default locale encoding.
    with open(source_path, encoding="utf-8") as file:
        data = json.load(file)

    # sorted() is stable, so captions with the same word count keep their
    # original relative order. If a videoID occurs twice, the last entry wins.
    output = {
        item["videoID"]: sorted(
            item["enCap"], key=lambda cap: len(cap.split()), reverse=True
        )
        for item in data
    }

    with open(target_path, "w", encoding="utf-8") as file:
        json.dump(output, file)

In [3]:
# Checkpoints providing the frame preprocessor and the caption tokenizer.
VIDEO_CHECKPOINT = "MCG-NJU/videomae-base"
TEXT_CHECKPOINT = "gpt2"

image_processor = AutoImageProcessor.from_pretrained(VIDEO_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(TEXT_CHECKPOINT)

# Use the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without one - TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [4]:
# Location of the already-processed dataset this notebook starts from.
PROCESSED_K600_PATH = "dataset/processed/k600"

dataset = load_from_disk(PROCESSED_K600_PATH)
# Have rows come back in the "torch" format when they are accessed.
dataset.set_format(type="torch")
dataset

DatasetDict({
    train: Dataset({
        features: ['videoID', 'pixel_values', 'labels'],
        num_rows: 1016
    })
    validation: Dataset({
        features: ['videoID', 'pixel_values', 'labels'],
        num_rows: 117
    })
})

In [5]:
# captions_data: merged content of the train and val caption files
#   (presumably videoID -> list of captions; process() reads entry [0]).
# out_data: caption -> videoID, filled in as a side effect of process().
# NOTE(review): these file names differ from the "*_videoID_keys.json" files
# written by the VATEX conversion cell - confirm they are the same data, renamed.
captions_data, out_data = {}, {}
for captions_file in (
    "dataset/train_videoID_captions.json",
    "dataset/val_videoID_captions.json",
):
    with open(captions_file) as file:
        # Later files override earlier ones on duplicate keys.
        captions_data.update(json.load(file))

def process(video_id, frames_per_video=8):
    """Build one example (sampled video frames + caption token ids) for a video.

    The first caption stored for ``video_id`` in the module-level
    ``captions_data`` is used, and the ``caption -> video_id`` pair is recorded
    in the module-level ``out_data`` dict as a side effect.

    Args:
        video_id: Key into ``captions_data``; also the file stem of the video
            under ``dataset/videos`` (``.mp4`` is tried first, then ``.webm``).
        frames_per_video: Number of evenly spaced frames to sample. When the
            video has fewer frames than this, frames are repeated so the clip
            still has ``frames_per_video`` entries.

    Returns:
        Tuple ``(pixel_values, labels)``: element 0 of
        ``image_processor(frames).pixel_values`` and the ``input_ids`` from
        ``tokenizer(caption, padding="max_length")``.

    Raises:
        KeyError: If ``video_id`` has no entry in ``captions_data``.
        ValueError: If ``frames_per_video`` is smaller than 1 or no frame can
            be decoded from the video.
    """
    if frames_per_video < 1:
        raise ValueError("frames_per_video must be at least 1, got %r" % (frames_per_video,))

    caption = captions_data[video_id][0]

    out_data[caption] = video_id

    videos_path = "dataset/videos"
    video_path = os.path.join(videos_path, "%s.mp4" % video_id)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % video_id)

    # The context manager closes the container even if decoding raises,
    # so no file handle is leaked per processed video.
    with av.open(video_path) as container:
        # The frame count is obtained by decoding the whole stream once
        # instead of trusting the count reported in the stream metadata.
        container.seek(0)
        frame_count = sum(1 for _ in container.decode(video=0))
        if frame_count == 0:
            raise ValueError("no video frames could be decoded from %s" % video_path)

        # Evenly spaced frame positions in [0, frame_count). The ordered list
        # is kept (not only a set) so repeated positions stay repeated.
        indices = (
            np.linspace(0, frame_count, num=frames_per_video, endpoint=False)
            .astype(np.int64)
            .tolist()
        )
        wanted = set(indices)
        last_wanted = indices[-1]

        picked = {}
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i in wanted:
                picked[i] = frame.to_ndarray(format="rgb24")
            if i >= last_wanted:
                # Nothing after the last sampled position is needed.
                break

    # Positions the second pass did not reach are skipped rather than raising.
    frames = [picked[i] for i in indices if i in picked]

    pixel_values = image_processor(frames).pixel_values
    labels = tokenizer(caption, padding="max_length").input_ids
    return (pixel_values[0], labels)

In [6]:
# Re-sample every video of the loaded dataset and save the result as a new dataset.
# Both splits use the same clip length so the saved "k600_16frames" dataset is uniform.
FRAMES_PER_VIDEO = 16

split_columns = {}
for split_name in ("train", "validation"):
    columns = {"video_id": [], "pixel_values": [], "labels": []}
    # Only the ids are needed from the source dataset; reading the column
    # avoids materializing the stored pixel_values of every row.
    for video_id in dataset[split_name]["videoID"]:
        pixel_values, labels = process(video_id, FRAMES_PER_VIDEO)
        columns["video_id"].append(video_id)
        columns["pixel_values"].append(pixel_values)
        columns["labels"].append(labels)
    split_columns[split_name] = columns

train_data = split_columns["train"]
val_data = split_columns["validation"]

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
# NOTE: this rebinds `dataset` to the new DatasetDict, whose id column is named
# "video_id" (not "videoID"); re-running this cell requires re-running the
# cell that loads "dataset/processed/k600" first.
dataset = DatasetDict({"train": train_dataset, "validation": val_dataset})
dataset.save_to_disk("dataset/processed/k600_16frames")
dataset

mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure
mmco: unref short failure


Saving the dataset (0/20 shards):   0%|          | 0/1016 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/117 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['video_id', 'pixel_values', 'labels'],
        num_rows: 1016
    })
    validation: Dataset({
        features: ['video_id', 'pixel_values', 'labels'],
        num_rows: 117
    })
})

In [7]:
# Persist the merged caption lookup (captions_data) and the
# caption -> videoID map collected by process() (out_data) as JSON files.
for target_path, payload in (
    ("dataset/videoID_captions.json", captions_data),
    ("dataset/longestCaption_videoID.json", out_data),
):
    serialized = json.dumps(payload)
    with open(target_path, "w") as file:
        file.write(serialized)