In [1]:
# Load all the necessary modules

import os
import torch
import glob
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import cv2


In [2]:
# Collect the paths of every .mov video located directly inside the dataset
# directory. glob returns matches in arbitrary, filesystem-dependent order, so
# the list is sorted to make the processing order reproducible between runs.
path = "/path/to/your/dataset"
videofile_list = sorted(glob.glob(os.path.join(path, "*.mov")))

In [3]:
def _compute_motiongram_feature(video_file_path):
    """Build the averaged motiongram feature matrix for a single video.

    Consecutive grayscale frames are differenced; each difference image is
    collapsed into a per-column profile (sum over rows) and a per-row profile
    (sum over columns). The two profiles are zero-padded to a common length,
    averaged, and stacked over time. A final row holding the number of time
    columns is appended.

    Parameters
    ----------
    video_file_path : str
        Path of the video file to open with OpenCV.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (max(height, width) + 1, n_columns), or None when the
        first frame cannot be decoded.
    """
    cap = cv2.VideoCapture(video_file_path)
    try:
        # The container-reported count can be missing or inaccurate, so it is
        # only used as a lower bound on the number of time columns below.
        reported_frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        ret, prev_frame = cap.read()
        if not ret:
            return None

        prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        height, width = prev_frame_gray.shape

        horizontal_profiles = []  # one length-`width` vector per frame pair
        vertical_profiles = []    # one length-`height` vector per frame pair
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            diff = cv2.absdiff(prev_frame_gray, frame_gray)
            horizontal_profiles.append(np.sum(diff, axis=0))
            vertical_profiles.append(np.sum(diff, axis=1))
            prev_frame_gray = frame_gray
    finally:
        # Release the capture handle even if decoding raised part-way through.
        cap.release()

    # Both motiongrams share one row count so they can be averaged. Using the
    # larger frame dimension keeps the layout unchanged for landscape videos
    # (rows == width) and also accommodates portrait videos, whose row profile
    # is longer than the frame width.
    n_rows = max(height, width)

    # N decoded frames give N - 1 differences; the trailing column stays zero.
    # Never allocate fewer columns than were actually measured, even when the
    # reported frame count is too small, zero or negative.
    n_columns = max(reported_frame_count, len(horizontal_profiles) + 1)

    horizontal_motiongram = np.zeros((n_rows, n_columns), dtype=np.float32)
    vertical_motiongram = np.zeros((n_rows, n_columns), dtype=np.float32)
    for column, (horizontal_diff, vertical_diff) in enumerate(
        zip(horizontal_profiles, vertical_profiles)
    ):
        horizontal_motiongram[:width, column] = horizontal_diff
        vertical_motiongram[:height, column] = vertical_diff

    averaged_motiongram = (horizontal_motiongram + vertical_motiongram) / 2

    # Append one extra row that repeats the number of time columns.
    length_row = np.full((1, n_columns), n_columns)
    return np.concatenate((averaged_motiongram, length_row), axis=0)


# Make sure the output folder exists before spending time on decoding.
os.makedirs("/path/to/your/video", exist_ok=True)

for video_file_path in videofile_list:
    video_feature = _compute_motiongram_feature(video_file_path)
    if video_feature is None:
        print(f'Failed to read video {video_file_path}')
        continue

    # Save the feature under the video's own name, with a .npy extension.
    basename = os.path.splitext(os.path.basename(video_file_path))[0]
    np.save(f"/path/to/your/video/{basename}.npy", video_feature)

In [4]:
# Match only the saved feature arrays: other files living in the same folder
# (for example CSV files) must not end up in the metadata table.
db_gesture = "/path/to/your/video/*.npy"
# Sorted so that the row order of the table is the same on every run.
files = sorted(glob.glob(db_gesture))
filename = [os.path.basename(item) for item in files]

# The class name is the third-from-last "_"-separated field of the file name.
# It is parsed from the base name, not the full path, so that underscores in
# the directory path cannot leak into the label.
target = [item.split("_")[-3] for item in filename]

label_encoder = LabelEncoder()
target_idx = label_encoder.fit_transform(target)  # Change class names to numeric

# Built inline rather than through a variable named `dict`, which would shadow
# the builtin for the rest of the session.
dataset_all = pd.DataFrame(
    {'video_npy': filename, 'target': target, 'target_idx': target_idx}
)

dataset_all.to_csv('metadata.csv')


In [6]:
# Split the file names and their numeric labels into 75 % train / 25 % test.
# A fixed random_state makes the shuffled split identical on every run, so the
# exported train/test lists and any results derived from them are reproducible.
X = dataset_all['video_npy']
y = dataset_all['target_idx']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

In [7]:
# Pair each split's file names with its numeric labels, producing one table
# for the training split and one for the test split.
train_df, test_df = (
    pd.DataFrame({'filename': list(names), 'target': list(labels)})
    for names, labels in ((X_train, y_train), (X_test, y_test))
)

# Write both tables to CSV (the default integer index is written as well).
train_df.to_csv("/path/to/your/video/train.csv")
test_df.to_csv("/path/to/your/video/test.csv")