In [6]:
# Load all the necessary modules

import os
import torch
import glob
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import cv2


In [5]:
# Collect the raw gesture recordings: every ".mov" file found directly inside
# the gestures video directory (glob returns them in arbitrary order).
path = "/itf-fi-ml/shared/users/annammc/gestures/video"
video_pattern = os.path.join(path, "*.mov")
videofile_list = glob.glob(video_pattern)

# Listings for other file types are currently disabled:
# audiofile_list = glob.glob(os.path.join(path, "*.wav"))
# midifile_list = glob.glob(os.path.join(path, "*.mid"))
# mocapcsv_list = glob.glob(os.path.join(path, "*.csv"))

In [11]:
# Turn every video into a stacked pair of motiongrams and save it as
# "<video basename>.npy" inside `output_dir`.
output_dir = "/itf-fi-ml/shared/users/annammc/Anna/video"
# Create the target directory up front so np.save cannot fail with a missing
# directory only after a whole video has already been processed.
os.makedirs(output_dir, exist_ok=True)

for video_file_path in videofile_list:
    cap = cv2.VideoCapture(video_file_path)
    try:
        # The first frame only serves as the reference for the first difference.
        ret, prev_frame = cap.read()
        if not ret:
            print(f'Failed to read video {video_file_path}')
            continue

        # Convert to grayscale
        prev_frame_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

        # Both motiongrams start as all-zero buffers with the frame's shape.
        vertical_motiongram = np.zeros_like(prev_frame_gray, dtype=np.float32)
        horizontal_motiongram = np.zeros_like(prev_frame_gray, dtype=np.float32)

        # Process the remaining frames until the capture stops returning frames.
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # Absolute difference between consecutive grayscale frames.
            diff = cv2.absdiff(prev_frame_gray, frame_gray)

            # Rolling buffers: drop the oldest row/column and append the newest
            # profile, so both arrays keep the frame's shape however many
            # frames the video has.
            # NOTE(review): stacking the summed profile onto the float32 buffer
            # lets NumPy promote the result dtype, so the saved array may not
            # be float32 - confirm what downstream loaders expect.
            horizontal_motiongram = np.vstack((horizontal_motiongram[1:], np.sum(diff, axis=0)))
            vertical_motiongram = np.hstack((vertical_motiongram[:, 1:], np.sum(diff, axis=1).reshape(-1, 1)))
            prev_frame_gray = frame_gray
    finally:
        # Release the capture on every path: success, unreadable video, or error.
        cap.release()

    # Stack the motiongrams along a new last axis to create a "3D" array
    video_feature = np.stack((vertical_motiongram, horizontal_motiongram), axis=-1)

    # Name the output after the video file, without directory or extension.
    basename = os.path.splitext(os.path.basename(video_file_path))[0]

    # Save the stacked feature array into a .npy file
    np.save(os.path.join(output_dir, f"{basename}.npy"), video_feature)

In [12]:
# Build the metadata table: one row per saved feature file.
# Only "*.npy" files are listed. A bare "*" would also pick up the train.csv /
# test.csv files that are written into this same directory later on, and those
# names cannot be parsed for a label, which breaks a re-run of this cell.
db_gesture = "/itf-fi-ml/shared/users/annammc/Anna/video/*.npy"
# glob returns paths in arbitrary order; sort them so the table is reproducible.
files = sorted(glob.glob(db_gesture))
filename = [os.path.basename(item) for item in files]

# The class name is the third-from-last "_"-separated token. It is taken from
# the file name rather than the full path so directory names can never leak
# into the label.
target = [name.split("_")[-3] for name in filename]

label_encoder = LabelEncoder()
target_idx = label_encoder.fit_transform(target)  # Change class names to numeric

# Named `metadata_columns` rather than `dict` to avoid shadowing the builtin.
metadata_columns = {'video_npy': filename, 'target': target, 'target_idx': target_idx}
dataset_all = pd.DataFrame(metadata_columns)

# Written relative to the current working directory, with the default index column.
dataset_all.to_csv('metadata.csv')


In [14]:
# Split feature file names (X) and numeric class ids (y) into 75% train / 25% test.
X = dataset_all['video_npy']
y = dataset_all['target_idx']
# A fixed random_state makes the shuffle, and therefore the exported
# train/test partition, identical on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

In [None]:
# Pair each split's file names with its labels in a two-column table.
train_df = pd.DataFrame({'filename': list(X_train), 'target': list(y_train)})
test_df = pd.DataFrame({'filename': list(X_test), 'target': list(y_test)})

# Write each table to its CSV file (default to_csv settings, so the index
# column is included).
for split_frame, destination in (
    (train_df, "/itf-fi-ml/shared/users/annammc/Anna/video/train.csv"),
    (test_df, "/itf-fi-ml/shared/users/annammc/Anna/video/test.csv"),
):
    split_frame.to_csv(destination)