In [30]:
import os

import cv2
import numpy as np
from datasets import DatasetDict, load_dataset, load_from_disk
from datasets.combine import concatenate_datasets
from transformers import AutoImageProcessor, AutoTokenizer

# Number of frame indices sampled from each video in preprocess().
FRAMES_PER_VIDEO = 8

In [2]:
# Frame preprocessor loaded from the "MCG-NJU/videomae-base" checkpoint; used in
# preprocess() to turn the sampled frames into `pixel_values`.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
# Caption tokenizer loaded from the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" call in preprocess() has a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [3]:
def _resolve_video_path(video_id, videos_path="dataset/videos"):
    """Return the on-disk path for ``video_id``, preferring ``.mp4`` over ``.webm``.

    Raises:
        FileNotFoundError: if neither file exists under ``videos_path``.
    """
    for extension in ("mp4", "webm"):
        candidate = os.path.join(videos_path, "%s.%s" % (video_id, extension))
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(
        "no .mp4 or .webm file for video %r in %s" % (video_id, videos_path)
    )


def _count_frames(video_path):
    """Count the frames of ``video_path`` by reading it until a read fails."""
    video = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while True:
            ret, _ = video.read()
            if not ret:
                break
            frame_count += 1
    finally:
        # Release the capture even if reading raises.
        video.release()
    return frame_count


def _read_frames(video_path, indices):
    """Return the frames at the sorted positions ``indices`` as RGB arrays.

    ``indices`` may contain repeated values (this happens when the video has
    fewer frames than requested); a repeated position yields the same decoded
    frame once per occurrence, so the result always has ``len(indices)`` items.

    Raises:
        RuntimeError: if a requested frame cannot be read.
    """
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        position, cursor = 0, 0
        while cursor < len(indices):
            if position == indices[cursor]:
                ok, frame = video.read()
                if not ok:
                    raise RuntimeError(
                        "could not read frame %d of %s" % (position, video_path)
                    )
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Consume every occurrence of this position. Without this, a
                # duplicated index is never matched again and the loop never ends.
                while cursor < len(indices) and indices[cursor] == position:
                    frames.append(frame)
                    cursor += 1
            else:
                # Skip a frame that was not selected.
                video.grab()
            position += 1
    finally:
        video.release()
    return frames


def preprocess(example):
    """Turn one annotation record into model inputs.

    Args:
        example: mapping with a ``"videoID"`` entry (used to locate
            ``dataset/videos/<videoID>.mp4`` or ``.webm``) and an ``"enCap"``
            entry holding an iterable of caption strings.

    Returns:
        dict with ``"pixel_values"`` (first item of the image processor's
        ``pixel_values`` for the sampled frames) and ``"labels"`` (token ids of
        the longest caption).

    Raises:
        FileNotFoundError: if the video file does not exist.
        ValueError: if the video has no readable frames or ``enCap`` is empty.
        RuntimeError: if a sampled frame cannot be read.
    """
    video_path = _resolve_video_path(example["videoID"])
    captions = example["enCap"]

    frame_count = _count_frames(video_path)
    if frame_count == 0:
        raise ValueError("no readable frames in %s" % video_path)

    # Fixed frame sampling: FRAMES_PER_VIDEO evenly spaced positions in
    # [0, frame_count). Positions repeat when frame_count < FRAMES_PER_VIDEO.
    # (The original notebook also sketched random frame sampling as an alternative.)
    indices = np.linspace(0, frame_count, num=FRAMES_PER_VIDEO, endpoint=False).astype(np.int64)
    frames = _read_frames(video_path, indices)

    # Longest caption by space-separated word count; the first one wins on ties.
    # (The original notebook also sketched picking a random caption instead.)
    caption = max(captions, key=lambda cap: len(cap.split(" ")))

    pixel_values = image_processor(frames, return_tensors="pt").pixel_values
    # No max_length is given, so per the transformers padding docs this pads to
    # the tokenizer's maximum input length; captions are not truncated.
    labels = tokenizer(caption, padding="max_length").input_ids
    return {"pixel_values": pixel_values[0], "labels": labels}

In [4]:
# Load the caption annotations: one JSON file per split.

data_files = {
    "train": "dataset/vatex_train_captions.json",
    "validation": "dataset/vatex_val_captions.json",
}
dataset = load_dataset("json", data_files=data_files)

Using custom data configuration default-8e787070c0da13e4
Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-8e787070c0da13e4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# preprocess and save parts

parts_dir = "dataset/parts"
step = 1000  # number of examples preprocessed and saved per part


def save_split_parts(split, first_part_idx):
    """Preprocess ``dataset[split]`` in chunks of ``step`` rows and save each chunk.

    Every chunk is wrapped in a single-split ``DatasetDict``, mapped through
    ``preprocess`` (dropping the original columns), set to torch format and
    written to ``<parts_dir>/part<N>``.

    Args:
        split: name of the split in ``dataset`` to process.
        first_part_idx: number used for the first part written by this call.

    Returns:
        The part number following the last one written, so that numbering can
        continue across splits.
    """
    split_data = dataset[split]
    total = len(split_data)
    part_idx = first_part_idx
    for start in range(0, total, step):
        stop = min(start + step, total)
        ds = DatasetDict({split: split_data.select(range(start, stop))})
        ds = ds.map(function=preprocess, remove_columns=ds[split].column_names)
        ds.set_format(type="torch")
        ds.save_to_disk(os.path.join(parts_dir, "part%d" % part_idx))
        part_idx += 1
    return part_idx


# Part numbers run continuously: train parts first, then validation parts.
idx = 0
for split in ("train", "validation"):
    idx = save_split_parts(split, idx)

In [24]:
# merge parts and save combined

# Keep only "part<N>" entries and order them by N. os.listdir() returns names
# in arbitrary order and may include unrelated entries (e.g. checkpoint
# directories), which would make the merged row order irreproducible or make
# load_from_disk fail.
part_names = sorted(
    (
        name
        for name in os.listdir(parts_dir)
        if name.startswith("part") and name[len("part"):].isdecimal()
    ),
    key=lambda name: int(name[len("part"):]),
)

# Collect the pieces of each split, then concatenate once per split.
split_parts = {"train": [], "validation": []}
for part in part_names:
    ds = load_from_disk(os.path.join(parts_dir, part))
    for split in split_parts:
        if split in ds:
            split_parts[split].append(ds[split])

missing_splits = [split for split, parts in split_parts.items() if not parts]
if missing_splits:
    raise ValueError("no saved parts found in %s for split(s): %s" % (parts_dir, ", ".join(missing_splits)))

dataset = DatasetDict(
    {split: concatenate_datasets(parts) for split, parts in split_parts.items()}
)

dataset.save_to_disk("dataset/preprocessed")
dataset

DatasetDict({
    train: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 22895
    })
    validation: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 2643
    })
})