In [1]:
import os

import cv2
from datasets import load_dataset
import numpy as np
from transformers import AutoImageProcessor, AutoTokenizer

# Number of frames that preprocess() samples from each video.
FRAMES_PER_VIDEO = 8

In [2]:
# Image processor loaded from the "MCG-NJU/videomae-base" checkpoint; preprocess()
# calls it on the sampled frames to obtain pixel_values.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
# Tokenizer loaded from the "gpt2" checkpoint; preprocess() calls it on the caption
# to obtain the label ids.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Use the end-of-sequence token for padding as well (presumably because this tokenizer
# has no pad token of its own, which padding="max_length" in preprocess() needs -- confirm).
tokenizer.pad_token = tokenizer.eos_token

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [3]:
def _sample_frames(video_path, num_frames):
    """Decode ``num_frames`` evenly spaced RGB frames from ``video_path``.

    The video is decoded twice: once to count the frames that can actually be
    read, and once to pick the frames at the sampled positions.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of frames to return.

    Returns:
        A list of ``num_frames`` frames converted from BGR to RGB. When the
        video has fewer than ``num_frames`` decodable frames, the sampled
        indices repeat and the corresponding frames are repeated as well.

    Raises:
        ValueError: If no frame can be decoded (missing or corrupt file), or
            if a frame counted in the first pass cannot be read in the second.
    """
    # First pass: count number of frames by reading until the stream ends.
    video = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while True:
            ret, _ = video.read()
            if not ret:
                break
            frame_count += 1
    finally:
        video.release()

    if frame_count == 0:
        raise ValueError("no decodable frames in %s (missing or corrupt file?)" % video_path)

    # fixed frame sampling (random sampling over [0, frame_count) is a possible alternative)
    indices = np.linspace(0, frame_count, num=num_frames, endpoint=False).astype(np.int64)

    # Second pass: walk the stream once and keep only the sampled positions.
    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        position = 0  # index of the next frame the capture will deliver
        for target in indices:
            if target < position:
                # Repeated index (video shorter than num_frames): the frame was
                # already consumed, so reuse it instead of waiting for a match
                # that can never come.
                frames.append(frames[-1])
                continue
            # Skip frames that were not sampled; grab() advances the stream.
            while position < target:
                if not video.grab():
                    raise ValueError("could not skip to frame %d of %s" % (target, video_path))
                position += 1
            ret, frame = video.read()
            position += 1
            if not ret or frame is None:
                raise ValueError("could not decode frame %d of %s" % (target, video_path))
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        video.release()
    return frames


def preprocess(example):
    """Turn one caption record into model inputs.

    Args:
        example: Mapping with the keys ``"videoID"`` (used to locate
            ``dataset/videos/<videoID>.mp4``, falling back to ``.webm``) and
            ``"enCap"`` (a sequence of caption strings).

    Returns:
        A dict with ``"pixel_values"`` (first element of the image processor's
        ``pixel_values`` for the sampled frames) and ``"labels"`` (the
        tokenizer's ``input_ids`` for the chosen caption).

    Raises:
        ValueError: If the example has no captions or the video yields no
            decodable frames.
    """
    video_id = example["videoID"]
    captions = example["enCap"]
    if not captions:
        raise ValueError("no captions for video %s" % video_id)

    videos_path = "dataset/videos"
    video_path = os.path.join(videos_path, "%s.mp4" % video_id)
    if not os.path.isfile(video_path):
        video_path = os.path.join(videos_path, "%s.webm" % video_id)

    frames = _sample_frames(video_path, FRAMES_PER_VIDEO)

    # longest caption by space-separated word count; max() keeps the first one
    # on ties (picking a random caption is a possible alternative)
    caption = max(captions, key=lambda cap: len(cap.split(" ")))

    pixel_values = image_processor(frames, return_tensors="pt").pixel_values
    # NOTE(review): padding="max_length" is used without an explicit max_length, so the
    # tokenizer's own default length applies -- confirm that this is the intended size.
    labels = tokenizer(caption, padding="max_length").input_ids
    return {"pixel_values": pixel_values[0], "labels": labels}

In [None]:
# JSON caption files for the train and validation splits.
data_files = {"train": "dataset/vatex_train_captions.json", "validation": "dataset/vatex_val_captions.json"}
dataset = load_dataset("json", data_files=data_files)

# Uncomment to restrict both splits to a few examples for a quick trial run.
#dataset["train"] = dataset["train"].select([x for x in range(10)])
#dataset["validation"] = dataset["validation"].select([x for x in range(1)])

# Apply preprocess() to every example and drop the original columns, so only the keys
# returned by preprocess() remain. The column list is taken from the train split
# (assumes the validation split has the same columns -- TODO confirm).
dataset = dataset.map(function=preprocess, remove_columns=dataset["train"].column_names)
# Return torch tensors when examples are accessed.
dataset.set_format(type="torch")
# Persist the preprocessed splits so later runs can load them from disk.
dataset.save_to_disk("dataset/preprocessed")
# Display the resulting dataset summary as the cell output.
dataset

Using custom data configuration default-8e787070c0da13e4
Found cached dataset json (/home/922201615/.cache/huggingface/datasets/json/default-8e787070c0da13e4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/22896 [00:00<?, ?ex/s]

  return torch.tensor(value)
[h264 @ 0x84e7700] mmco: unref short failure
[h264 @ 0x84e7700] mmco: unref short failure
[h264 @ 0x869df80] mmco: unref short failure
[h264 @ 0x869df80] mmco: unref short failure
[h264 @ 0x8deb500] mmco: unref short failure
[h264 @ 0x8deb500] mmco: unref short failure
[h264 @ 0x87222c0] mmco: unref short failure
[h264 @ 0x87222c0] mmco: unref short failure
[h264 @ 0x81dcd40] mmco: unref short failure
[h264 @ 0x81b5900] mmco: unref short failure
[h264 @ 0x81b5680] mmco: unref short failure
[h264 @ 0x81dd180] mmco: unref short failure
[h264 @ 0x1918fdc0] mmco: unref short failure
[h264 @ 0x81b5680] mmco: unref short failure
[h264 @ 0x1017c3c0] mmco: unref short failure
[h264 @ 0x1918fdc0] mmco: unref short failure
[h264 @ 0x92c9f540] mmco: unref short failure
[h264 @ 0x805da00] mmco: unref short failure
[h264 @ 0xc7ceda80] mmco: unref short failure
[h264 @ 0xc7ceda80] mmco: unref short failure
[h264 @ 0xa9a9740] mmco: unref short failure
[h264 @ 0xa9a9740] m