# Data Extraction Notebook
---


## Import Dependencies


In [1]:
import cv2
import mediapipe as mp

import json
import os
import re
import shutil
import pyarrow.parquet as pq
import pyarrow as pa
import tqdm
import sys

import numpy as np
import pandas as pd

## Helper Functions


In [2]:
def mp_detection(frame, mp_model):
    """Run a MediaPipe model on a single OpenCV frame.

    The frame is converted from BGR to RGB, the converted array is flagged
    as read-only, and the value returned by ``mp_model.process`` is handed
    back to the caller unchanged.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Mark the converted copy as non-writeable before handing it to the model.
    rgb_frame.flags.writeable = False
    return mp_model.process(rgb_frame)

In [3]:
def assign_landmark(landmark, type, num_frame, landmark_id, x, y, z, vis):
    """Append every point of ``landmark`` to the accumulator lists.

    Each point contributes an id of the form ``"<num_frame>-<type>-<index>"``
    plus its ``x``, ``y``, ``z`` and ``visibility`` attributes; a falsy
    visibility is stored as ``0``. The lists are extended in place and also
    returned as a tuple ``(landmark_id, x, y, z, vis)``.
    """
    columns = (landmark_id, x, y, z, vis)

    for index, point in enumerate(landmark):
        row = (
            f"{num_frame}-{type}-{index}",
            point.x,
            point.y,
            point.z,
            point.visibility or 0,
        )
        for column, value in zip(columns, row):
            column.append(value)

    return landmark_id, x, y, z, vis

In [4]:
def assign_empty_landmark(size, type, num_frame, landmark_id, x, y, z, vis):
    """Append ``size`` zero-valued placeholder points to the accumulator lists.

    Ids follow the ``"<num_frame>-<type>-<index>"`` pattern with indices
    ``0 .. size - 1``; the coordinate and visibility lists each receive
    ``size`` zeros. The lists are extended in place and also returned as a
    tuple ``(landmark_id, x, y, z, vis)``.
    """
    landmark_id.extend(f"{num_frame}-{type}-{index}" for index in range(size))

    for column in (x, y, z, vis):
        column.extend([0] * size)

    return landmark_id, x, y, z, vis

In [15]:
def extract_keypoints(landmarks, num_frame):
    """Flatten one frame's detection result into id/coordinate columns.

    The face, pose, right-hand and left-hand groups are processed in that
    order. A group present on ``landmarks`` is copied point by point via
    ``assign_landmark``; a missing group is replaced by zero-valued
    placeholders via ``assign_empty_landmark`` (468 face, 33 pose and 21
    points per hand).

    Returns:
        A list ``[landmark_id, x, y, z, vis]`` of parallel lists.
    """
    landmark_id = []
    x, y, z, vis = [], [], [], []

    # (attribute on the result, label used in ids, placeholder point count)
    groups = (
        ('face_landmarks', 'face', 468),
        ('pose_landmarks', 'pose', 33),
        ('right_hand_landmarks', 'right_hand', 21),
        ('left_hand_landmarks', 'left_hand', 21),
    )

    for attribute, label, placeholder_count in groups:
        detected = getattr(landmarks, attribute)
        if detected:
            landmark_id, x, y, z, vis = assign_landmark(
                detected.landmark, label, num_frame, landmark_id, x, y, z, vis)
        else:
            landmark_id, x, y, z, vis = assign_empty_landmark(
                placeholder_count, label, num_frame, landmark_id, x, y, z, vis)

    return [landmark_id, x, y, z, vis]

In [16]:
def extract_video(fname, fPATH, tricks, mp_model, save_PATH):
    """Extract holistic landmarks for every annotated clip of one video.

    For each entry of ``tricks`` the capture is seeked to the clip's
    ``start``, frames are read until the capture position passes the clip's
    ``end`` (both given in seconds and converted to milliseconds), and the
    landmarks of all frames are written to one parquet file with the columns
    ``landmark_id``, ``x``, ``y``, ``z`` and ``vis``.

    Args:
        fname: Video file name; the text before its first ``.`` is used as
            the stem of the generated parquet file names.
        fPATH: Path handed to ``cv2.VideoCapture``.
        tricks: Sequence of dicts providing ``'start'``, ``'end'`` and
            ``'labels'`` (only the first label is kept).
        mp_model: Object exposing a ``Holistic(...)`` context manager.
        save_PATH: Directory the parquet files are written into. An empty
            value (``''`` or ``None``) writes to the current directory.

    Returns:
        A tuple ``(list_save_PATH, signs, total_frame)`` of parallel lists:
        the parquet path, the first label and the number of processed frames
        for each clip.
    """
    list_save_PATH = []
    signs = []
    total_frame = []

    stem = fname.split('.')[0]
    # An empty/None target directory falls back to the working directory.
    out_dir = save_PATH or ''

    with mp_model.Holistic(min_detection_confidence=.5, min_tracking_confidence=.5) as holistic_model:

        cap = cv2.VideoCapture(fPATH)
        try:
            clips = tqdm.tqdm(enumerate(tricks), total=len(tricks),
                              desc=f"Extracting \"{fPATH}\"", ncols=150, leave=False)
            for clip_idx, trick in clips:
                landmark_id = []
                x, y, z, vis = [], [], [], []

                start_ms = int(trick['start'] * 1000)
                end_ms = int(trick['end'] * 1000)

                cap.set(cv2.CAP_PROP_POS_MSEC, start_ms)
                frame_count = 0
                while True:
                    ret, frame = cap.read()

                    if not ret or cap.get(cv2.CAP_PROP_POS_MSEC) > end_ms:
                        break

                    landmarks = mp_detection(frame, holistic_model)
                    # Frame numbers embedded in the landmark ids are 1-based.
                    results = extract_keypoints(landmarks, frame_count + 1)

                    landmark_id.extend(results[0])
                    x.extend(results[1])
                    y.extend(results[2])
                    z.extend(results[3])
                    vis.extend(results[4])

                    frame_count += 1

                # Write into the requested directory instead of ignoring it.
                final_save_PATH = os.path.join(out_dir, f"{stem}_{clip_idx}.parquet")
                clip_table = pa.Table.from_pandas(pd.DataFrame({
                    'landmark_id': np.array(landmark_id).astype(str),
                    'x': np.array(x).astype(float),
                    'y': np.array(y).astype(float),
                    'z': np.array(z).astype(float),
                    'vis': np.array(vis).astype(float),
                }))
                pq.write_table(clip_table, final_save_PATH)

                list_save_PATH.append(final_save_PATH)
                # Record the frames actually processed (not the next frame number).
                total_frame.append(frame_count)
                signs.append(trick['labels'][0])
        finally:
            # Release the capture even when detection or writing fails.
            cap.release()

    return list_save_PATH, signs, total_frame

## Extract Data


- Global Variables


In [7]:
# MediaPipe holistic solution module; passed to extract_video as mp_model.
mp_holistic = mp.solutions.holistic
# Root folder holding one sub-directory per participant.
DATA_PATH = 'data'
# Keep only directories (stray files such as .DS_Store have no
# annotation.json) and sort them so the processing order is deterministic.
participants = sorted(
    entry for entry in os.listdir(DATA_PATH)
    if os.path.isdir(os.path.join(DATA_PATH, entry))
)

- Extraction Process


In [25]:
# Accumulators for the final path -> sign mapping across all participants.
save_PATH_DATA = []
signs_DATA = []

for participant in participants:
    participant_dir = f"{DATA_PATH}/{participant}"

    # The context manager closes the file; no explicit close() is needed.
    with open(f"{participant_dir}/annotation.json", 'r', encoding='utf-8') as json_file:
        annotations = json.load(json_file)

    save_PATH = f"{participant_dir}/raw_landmarks"
    # Tolerate an existing directory but surface real failures (permissions,
    # a regular file in the way) instead of silently ignoring every OSError.
    os.makedirs(save_PATH, exist_ok=True)

    for json_list in annotations:
        # The video file name is the last '/'- or '-'-separated piece of the
        # URL, with underscores replaced by spaces.
        fname = re.split(r'[/-]', json_list['video_url'])[-1].replace('_', ' ')
        fPATH = f"{participant_dir}/{fname}"

        # extract_video returns three lists; the per-clip frame counts are
        # not needed for the mapping built here.
        list_sPATH, signs, total_frame = extract_video(
            fname, fPATH, json_list['tricks'], mp_holistic, save_PATH)

        save_PATH_DATA.extend(list_sPATH)
        signs_DATA.extend(signs)

                                                                                                                                                      

In [11]:
# Write the collected parquet paths and their signs to data_map.csv,
# one row per extracted clip and without the DataFrame index column.
pd.DataFrame(
    dict(
        path=np.array(save_PATH_DATA).astype(str),
        sign=np.array(signs_DATA).astype(str),
    )
).to_csv('data_map.csv', index=False)