In [1]:
import argparse
from pathlib import Path

import cv2
import h5py
import numpy as np
from tqdm import tqdm
from transformers import ViTFeatureExtractor

In [2]:
def extract_video_features(extractor, video_file):
    """Sample about one frame per second from a video and preprocess the samples.

    Frames are decoded sequentially with OpenCV. A frame is kept whenever the
    whole-second part of the capture position (``CAP_PROP_POS_MSEC``) differs
    from that of the previously kept frame, so at most one frame is collected
    for each run of frames sharing the same second.

    Args:
        extractor: Callable image preprocessor (the notebook passes a
            ``ViTFeatureExtractor``). It is invoked as
            ``extractor(images=frames, return_tensors="pt")`` and must return a
            mapping containing the key ``"pixel_values"``.
        video_file: Location of the video; converted with ``str()`` before
            being handed to ``cv2.VideoCapture``.

    Returns:
        The ``"pixel_values"`` entry of the extractor output for the sampled
        frames.

    Raises:
        ValueError: If no frame could be decoded from ``video_file`` (for
            example when the file is missing or cannot be opened).
    """
    vc = cv2.VideoCapture(str(video_file))
    frames = []
    last_collected = -1
    try:
        while vc.isOpened():
            success, frame = vc.read()
            if not success:
                break

            timestamp = vc.get(cv2.CAP_PROP_POS_MSEC)
            second = timestamp // 1000
            if second != last_collected:
                last_collected = second
                # OpenCV decodes to BGR channel order; convert to RGB, which is
                # what image preprocessors for pretrained ViT models expect.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Always free the underlying decoder/file handle, even on errors.
        vc.release()

    if not frames:
        raise ValueError(f"no frames could be read from {video_file}")

    features = extractor(images=frames, return_tensors="pt")
    return features["pixel_values"]

In [3]:
# Notebook stand-in for command-line arguments: the directory that is searched
# for videos and the HDF5 file the extracted features are written to.
args = argparse.Namespace(
    **{
        "data_directory": "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01",
        "out": "/workspace/EmotionShortForm/data_AIHub/2.Validation/Video_data/VS_유튜브_01.h5",
    }
)


In [4]:
# Collect every .mp4 below the data directory (recursively). Sorting makes the
# processing order reproducible across runs and file systems.
video_files = sorted(Path(args.data_directory).glob("**/*.mp4"))
extractor = ViTFeatureExtractor.from_pretrained(
    "google/vit-base-patch16-224", size=224
)

# Videos that could not be processed; reported after the loop.
failed_videos = []

# Mode "w" creates the output file, truncating it if it already exists.
with h5py.File(args.out, "w") as wf:

    for video_file in tqdm(video_files):
        # Datasets are keyed by file name without extension. Two videos with
        # the same stem in different sub-directories collide; the second one
        # fails in create_dataset and is reported below.
        name = video_file.stem
        try:
            features = extract_video_features(
                extractor, video_file
            )
            wf.create_dataset(name, data=features)
        except Exception as e:
            # Best effort: keep going, but record which video failed and why.
            # tqdm.write prints without corrupting the progress bar.
            failed_videos.append(video_file)
            tqdm.write(f"[skipped] {video_file}: {type(e).__name__}: {e}")

if failed_videos:
    print(f"{len(failed_videos)} of {len(video_files)} videos could not be processed")

100%|██████████| 8/8 [04:09<00:00, 31.17s/it]
