In [1]:
import os

import cv2
from datasets import DatasetDict, load_dataset, load_from_disk
from datasets.combine import concatenate_datasets
from matplotlib import pyplot as plt
import numpy as np
from transformers import AutoTokenizer

# Number of frames sampled from each video by preprocess().
FRAMES_PER_VIDEO = 16

In [None]:
# Load the pretrained "gpt2" tokenizer; preprocess() uses it to encode captions.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token. preprocess() tokenizes
# with padding="max_length", which needs a pad token to be defined
# (presumably the gpt2 tokenizer does not define one by default -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def preprocess(example):
    """Turn one caption record into sampled video frames and token labels.

    The video is looked up under ``dataset/videos`` as ``<videoID>.mp4`` and,
    if that file does not exist, as ``<videoID>.webm``. ``FRAMES_PER_VIDEO``
    evenly spaced frames are extracted and converted from BGR to RGB. The
    caption with the most space-separated words (first one wins on ties) is
    tokenized with the module-level ``tokenizer`` using ``padding="max_length"``.

    Args:
        example: mapping with a ``"videoID"`` entry and an ``"enCap"`` entry
            holding an iterable of caption strings.

    Returns:
        dict with ``"pixel_values"`` (list of ``FRAMES_PER_VIDEO`` RGB frames)
        and ``"labels"`` (the tokenizer's ``input_ids`` for the chosen caption).

    Raises:
        FileNotFoundError: neither the ``.mp4`` nor the ``.webm`` file exists.
        ValueError: the video yields no decodable frame, or ``enCap`` is empty.
        RuntimeError: a frame counted in the first pass cannot be read again.
    """
    video_id = example["videoID"]
    captions = example["enCap"]

    videos_path = "dataset/videos"
    video_path = os.path.join(videos_path, "%s.mp4" % video_id)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % video_id)
    if not os.path.isfile(video_path):
        raise FileNotFoundError("no .mp4 or .webm video found for %s in %s" % (video_id, videos_path))

    # First pass: count the frames that can actually be decoded.
    video = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while video.read()[0]:
            frame_count += 1
    finally:
        video.release()
    if frame_count == 0:
        raise ValueError("no decodable frames in %s" % video_path)

    # Fixed (evenly spaced) frame sampling. When the video has fewer than
    # FRAMES_PER_VIDEO frames, the truncated indices contain repeats.
    indices = np.linspace(0, frame_count, num=FRAMES_PER_VIDEO, endpoint=False).astype(np.int64)

    # Second pass: decode only the selected frames and skip over the others.
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        position, frame_idx = 0, 0
        while frame_idx < len(indices):
            if position == indices[frame_idx]:
                ret, frame = video.read()
                if not ret:
                    raise RuntimeError("failed to read frame %d of %s" % (position, video_path))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Consume every sample slot that points at this frame; without
                # this, a repeated index is never matched again and the loop
                # would spin forever on short videos.
                while frame_idx < len(indices) and indices[frame_idx] == position:
                    frames.append(frame)
                    frame_idx += 1
            else:
                # Advance past an unselected frame without retrieving it. A
                # failure here surfaces at the next read() above.
                video.grab()
            position += 1
    finally:
        video.release()

    # Longest caption by space-separated word count; max() returns the first
    # maximal element, so ties resolve to the earliest caption.
    if not captions:
        raise ValueError("no captions for video %s" % video_id)
    caption = max(captions, key=lambda cap: len(cap.split(" ")))

    labels = tokenizer(caption, padding="max_length").input_ids
    return {"pixel_values": frames, "labels": labels}

In [2]:
# Load the caption JSON files into a DatasetDict with "train" and "validation" splits.
data_files = {
    "train": "dataset/vatex_train_captions.json",
    "validation": "dataset/vatex_val_captions.json",
}
dataset = load_dataset("json", data_files=data_files)

# For a quick trial run, each split can be shrunk with
# dataset[split].select(np.arange(k)) before preprocessing.
# Preprocessing itself is done in chunks by the "preprocess and save parts" cell.
dataset

Using custom data configuration default-dc0812067ce11954
Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-dc0812067ce11954/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['videoID', 'enCap', 'chCap'],
        num_rows: 22895
    })
    validation: Dataset({
        features: ['videoID', 'enCap', 'chCap'],
        num_rows: 2643
    })
})

In [None]:
# preprocess and save parts
#
# Each split is processed in chunks of `step` rows. Every chunk is mapped through
# preprocess() and written to its own directory under `parts_dir`, so the full set
# of decoded frames is never held in one map() call.

parts_dir = "dataset/parts"
step = 500

# Part numbering runs continuously across splits: all "train" parts come first,
# followed by the "validation" parts.
idx = 0
for split in ("train", "validation"):
    end = len(dataset[split])
    for start in range(0, end, step):
        rows = range(start, min(start + step, end))
        ds = DatasetDict({split: dataset[split].select(rows)})
        ds = ds.map(function=preprocess, remove_columns=ds[split].column_names)
        ds.save_to_disk(os.path.join(parts_dir, "part%d" % idx))
        idx += 1

  0%|          | 0/500 [00:00<?, ?ex/s]

[h264 @ 0x23495680] mmco: unref short failure
[h264 @ 0x23495680] mmco: unref short failure
[h264 @ 0x23e29cc0] mmco: unref short failure
[h264 @ 0x23e29cc0] mmco: unref short failure


  0%|          | 0/500 [00:00<?, ?ex/s]

[h264 @ 0x92e5b040] mmco: unref short failure
[h264 @ 0x92e5b040] mmco: unref short failure
[h264 @ 0x92e5b040] mmco: unref short failure
[h264 @ 0x92e5b040] mmco: unref short failure
[h264 @ 0x1e223980] mmco: unref short failure
[h264 @ 0x1e223980] mmco: unref short failure
[h264 @ 0x23db8700] mmco: unref short failure
[h264 @ 0x12899500] mmco: unref short failure
[h264 @ 0x23e04c80] mmco: unref short failure
[h264 @ 0x1bab6bc0] mmco: unref short failure
[h264 @ 0x2f140300] mmco: unref short failure
[h264 @ 0x65497800] mmco: unref short failure
[h264 @ 0x476c2500] mmco: unref short failure
[h264 @ 0x91417a40] mmco: unref short failure


In [24]:
# merge parts and save combined
#
# NOTE: this rebinds `dataset` from the raw caption data to the preprocessed data.
# Re-run the loading cell before re-running the "preprocess and save parts" cell.

# os.listdir() returns entries in arbitrary order, so sort the part directories by
# their numeric suffix to restore the order in which they were written. Entries
# that are not named "part<N>" are ignored.
part_names = sorted(
    (name for name in os.listdir(parts_dir) if name.startswith("part") and name[4:].isdigit()),
    key=lambda name: int(name[4:]),
)

# Collect the pieces of each split, then concatenate once per split.
splits = {"train": [], "validation": []}
for part in part_names:
    ds = load_from_disk(os.path.join(parts_dir, part))
    for split, pieces in splits.items():
        if split in ds:
            pieces.append(ds[split])

# A split with no parts on disk is left out rather than stored as None.
dataset = DatasetDict(
    {split: concatenate_datasets(pieces) for split, pieces in splits.items() if pieces}
)

dataset.save_to_disk("dataset/preprocessed")
dataset

DatasetDict({
    train: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 22895
    })
    validation: Dataset({
        features: ['pixel_values', 'labels'],
        num_rows: 2643
    })
})