# Final Project: ASL Recognition
### Professor: Weizhe Li
### Student: Levan Sulimanov

# Data Collection Script:

# 0. Install and Import Dependencies:

In [5]:
# conda install -c conda-forge ffmpeg-python

In [1]:
!pip install opencv-python
!pip install mediapipe
!pip install pandas openpyxl
!pip install Pillow
!pip install imutils
!pip install keyboard

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [1]:
import os
import cv2
import mediapipe as mp
import pandas as pd
import urllib.request  # for downloading videos
import traceback
import numpy as np
from PIL import Image, ImageOps
import imutils
import random
import keyboard  # using module keyboard
import time
import csv

# 1. Make Detections from Feed:
1. Detect Hand Poses
2. Detect Body Poses
3. Collect only:
    - Wrists, fingers
    - Shoulders
    - Elbows

In [6]:
# test keypoints detection accuracy using Webcam and check if it works:
def web_demo():
    """Preview holistic keypoint detection on the default webcam.

    Opens camera 0, runs MediaPipe Holistic on every frame and draws the
    detected hand landmarks plus both elbows and both shoulders as small dots
    in a window titled 'Raw Webcam Feed'.  Press 'q' in that window to stop.

    The capture device and the OpenCV window are always released on exit, also
    when a frame cannot be read or an error is raised while processing.

    Raises:
        IOError: if the webcam cannot be opened.
    """
    # mp.solutions.drawing_utils is intentionally not used: the keypoints are
    # annotated with OpenCV's own drawing functions below.
    mp_holistic = mp.solutions.holistic

    print(f"mp.solutions: {mp.solutions}")

    # setup webcam:
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        raise IOError("Cannot open webcam")

    try:
        # read one frame up front only to learn the frame size
        ret, frame = cap.read()
        if not ret or frame is None:
            print("Failed to read first frame")
            return
        # WARNING: if the frame is ever resized (for speed), update these too
        height, width = frame.shape[0], frame.shape[1]

        # pose landmarks that are kept, in drawing order
        upper_body_joints = (
            mp_holistic.PoseLandmark.RIGHT_ELBOW,
            mp_holistic.PoseLandmark.LEFT_ELBOW,
            mp_holistic.PoseLandmark.RIGHT_SHOULDER,
            mp_holistic.PoseLandmark.LEFT_SHOULDER,
        )

        with mp_holistic.Holistic(min_detection_confidence=0.5,
                                  min_tracking_confidence=0.5,
                                  static_image_mode=False) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                # a failed read yields frame=None, which would crash cvtColor
                if not ret or frame is None:
                    print("Failed to read frame from webcam. Stopping preview.")
                    break

                # OpenCV delivers BGR, the holistic model is fed RGB
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = holistic.process(image)

                # normalized (x, y) pairs to draw for this frame
                main_coordinates = []

                # right hand first, then left hand
                for hand in (results.right_hand_landmarks, results.left_hand_landmarks):
                    if hand:
                        for point in hand.landmark:
                            main_coordinates.append((point.x, point.y))
                    else:
                        # out-of-bounds placeholder; it lands outside the image
                        main_coordinates.append((2.0, 2.0))

                # elbows and shoulders from the detected pose
                if results.pose_landmarks:
                    pose_points = results.pose_landmarks.landmark
                    for joint in upper_body_joints:
                        main_coordinates.append((pose_points[joint].x, pose_points[joint].y))
                else:
                    main_coordinates.extend([(2.0, 2.0)] * len(upper_body_joints))

                # scale the normalized coordinates to pixels and draw them
                for x_norm, y_norm in main_coordinates:
                    cx, cy = int(x_norm * width), int(y_norm * height)
                    cv2.circle(image, (cx, cy), 3, (0, 0, 255), cv2.FILLED)

                # back to BGR for display
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                cv2.imshow('Raw Webcam Feed', image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [7]:
# Run the live webcam preview (press 'q' in the preview window to stop it).
web_demo()

mp.solutions: <module 'mediapipe.python.solutions' from 'C:\\Users\\lrspr\\AppData\\Roaming\\Python\\Python310\\site-packages\\mediapipe\\python\\solutions\\__init__.py'>


# 2. Data Collection:

In [2]:
import os
import cv2
import mediapipe as mp
import pandas as pd
import urllib.request  # for downloading videos
import traceback
import numpy as np
from PIL import Image, ImageOps
import imutils
import random
import ffmpeg
import keyboard  # using module keyboard
import time
import csv


# Target frame size in pixels: get_keypoints_from_frame resizes every frame to
# MAIN_WIDTH and pads it to (MAIN_WIDTH, MAIN_HEIGHT) before detection.
MAIN_WIDTH = 640  # 256, iphone(480), 
MAIN_HEIGHT = 480  # 192, iphone(320), 
# Number of (x, y) keypoints stored per frame:
# 21 right-hand + 21 left-hand + 2 elbows + 2 shoulders = 46.
MAX_NUM_OF_XY_KEYPOINTS_LIST = 46

# utility function to quickly create a directory if it does not exist yet:
def mkdir_if_none(directory):
    """Create `directory` (and any missing parents) unless the path already exists."""
    if os.path.exists(directory):
        return
    os.makedirs(directory)

    
# Fallback row returned by get_keypoints_from_frame when extraction fails:
# every x and y is the out-of-range placeholder 2.0
# (2 * MAX_NUM_OF_XY_KEYPOINTS_LIST values in total).
BACKUP_ARRAY = np.array([2.0, 2.0] * MAX_NUM_OF_XY_KEYPOINTS_LIST)


# pad the frame (evenly on both sides) up to the specified size:
def resize_with_padding(img, expected_size):
    """Pad `img` with a border so that it reaches `expected_size`.

    Args:
        img: image object exposing `.size` as (width, height); it is handed to
            `ImageOps.expand` unchanged.
        expected_size: target (width, height).

    Returns:
        The result of `ImageOps.expand(img, (left, top, right, bottom))`.  The
        missing pixels are split evenly between the two sides; an odd
        remainder goes to the right / bottom border.
    """
    extra_w = expected_size[0] - img.size[0]
    extra_h = expected_size[1] - img.size[1]

    # floor division on purpose: left/top get the smaller half
    left, top = extra_w // 2, extra_h // 2
    right, bottom = extra_w - left, extra_h - top

    return ImageOps.expand(img, (left, top, right, bottom))



# just get keypoints from given frame:
def get_keypoints_from_frame(frame, holistic, mp_holistic, hand_keypoint_default=21, verbose=True):
    """Extract hand, elbow and shoulder keypoints from one frame.

    The frame is resized to MAIN_WIDTH, padded to (MAIN_WIDTH, MAIN_HEIGHT),
    its first and last channels are swapped (cv2.COLOR_RGB2BGR) and the result
    is passed to `holistic.process`.  The returned frame therefore has the
    opposite channel order to the input frame.
    NOTE(review): MediaPipe's Python solutions expect RGB input, so the caller
    should hand in a BGR frame as delivered by OpenCV - confirm that callers
    do not swap the channels themselves beforehand.

    Args:
        frame: image array as read by OpenCV.
        holistic: object whose `process(image)` returns results exposing
            `right_hand_landmarks`, `left_hand_landmarks` and `pose_landmarks`.
        mp_holistic: module providing `PoseLandmark` (only used when a pose
            was detected).
        hand_keypoint_default: number of placeholder keypoints written for a
            hand that was not detected (21 keeps every row at 92 values).
        verbose: when True, draw every keypoint on the returned frame.

    Returns:
        (frame_coordinates, frame): a flat list [x0, y0, x1, y1, ...] ordered
        right hand, left hand, right elbow, left elbow, right shoulder, left
        shoulder, together with the processed frame.  Anything that was not
        detected is filled with the out-of-bounds value 2.0.  If an error
        occurs, (BACKUP_ARRAY, frame) is returned instead, where `frame` is in
        whatever state it had reached.
    """
    try:
        # bring the frame to the fixed working size and swap the channel order
        frame = imutils.resize(frame, width=MAIN_WIDTH)
        pil = Image.fromarray(frame)
        frame = cv2.cvtColor(np.array(resize_with_padding(pil, (MAIN_WIDTH, MAIN_HEIGHT))), cv2.COLOR_RGB2BGR)

        # make detections:
        results = holistic.process(frame)

        # flat x, y list for this single frame
        frame_coordinates = []

        # hands: right first, then left.  A missing hand is filled with
        # out-of-bounds coordinates so every row keeps the same length and the
        # model sees a consistent "not detected" value.
        for hand_landmarks in (results.right_hand_landmarks, results.left_hand_landmarks):
            if hand_landmarks:
                for point in hand_landmarks.landmark:
                    frame_coordinates.append(point.x)
                    frame_coordinates.append(point.y)
            else:
                frame_coordinates.extend([2.0, 2.0] * hand_keypoint_default)

        # elbows and shoulders:
        if results.pose_landmarks:
            pose_points = results.pose_landmarks.landmark
            upper_body_joints = (
                mp_holistic.PoseLandmark.RIGHT_ELBOW,
                mp_holistic.PoseLandmark.LEFT_ELBOW,
                mp_holistic.PoseLandmark.RIGHT_SHOULDER,
                mp_holistic.PoseLandmark.LEFT_SHOULDER,
            )
            for joint in upper_body_joints:
                frame_coordinates.append(pose_points[joint].x)
                frame_coordinates.append(pose_points[joint].y)
        else:
            # placeholder x, y for r_elbow, l_elbow, r_shoulder, l_shoulder
            frame_coordinates.extend([2.0, 2.0] * 4)

        if verbose:
            # coordinates are normalized, scale them to pixels before drawing
            for coord_idx in range(0, len(frame_coordinates), 2):
                cx = int(frame_coordinates[coord_idx] * frame.shape[1])
                cy = int(frame_coordinates[coord_idx + 1] * frame.shape[0])
                cv2.circle(frame, (cx, cy), 2, (255, 0, 0), cv2.FILLED)

        return frame_coordinates, frame
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt, i.e. the
        # notebook's Stop button, is not swallowed here.
        print("<<<ERROR IN GETTING KEYPOINTS>>>")
        print(traceback.format_exc())
        print("---")
        return BACKUP_ARRAY, frame
    


# run video and process videos through media pipe:
def _record_sequence(cap, holistic, mp_holistic, action, sequence, num_of_sequences_per_class,
                     sequence_length, debug_frames_dir):
    """Read up to `sequence_length` frames from `cap` and collect their keypoints.

    Returns:
        (frame_coordinates_stack, status) where status is
        "ok"     - the frame loop ended normally (the stack may still be short
                   when frames were skipped or key polling failed),
        "quit"   - 'q' was pressed,
        "failed" - an unexpected error occurred while reading or processing.
    """
    frame_coordinates_stack = []
    overlay_text = f"Collecting frames for {action} Video Number {sequence}/{num_of_sequences_per_class}"

    for frame_num in range(sequence_length):  # total range per video (30FPS per second is default)
        try:
            ret, frame = cap.read()
            # check the read result BEFORE touching the frame: on failure the
            # frame is None and any OpenCV call on it would raise
            if ret and frame is not None:
                # The frame is passed exactly as OpenCV delivers it (BGR):
                # get_keypoints_from_frame swaps the channel order itself, so
                # the detector receives RGB.  Swapping here as well would feed
                # it BGR again.
                retrieved_frame_keypoints, frame = get_keypoints_from_frame(frame, holistic, mp_holistic)
                frame_coordinates_stack.append(retrieved_frame_keypoints)

                # the returned frame has the swapped channel order; go back to
                # BGR for OpenCV's imwrite / imshow
                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

                # debug copy of the annotated frame (saved without text overlay)
                cv2.imwrite(os.path.join(debug_frames_dir, f"{action}_{frame_num}.jpg"), frame)

                if frame_num == 0:
                    cv2.putText(frame, f"STARTING COLLECTION for {action}", (120,200),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4, cv2.LINE_AA)
                    cv2.putText(frame, overlay_text, (15,22),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1, cv2.LINE_AA)
                    cv2.imshow('Raw Video Feed', frame)
                    # give the person two seconds to get ready
                    cv2.waitKey(2000)
                else:
                    cv2.putText(frame, overlay_text, (15,12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

                cv2.imshow('Raw Video Feed', frame)
            else:
                print(f"<<<WARNING: Frame was skipped...>>>")

            try:
                # poll the keyboard ONCE per frame; a second waitKey call would
                # consume a different event and the key press would be lost
                pressedKey = cv2.waitKey(1) & 0xFF
                if pressedKey == ord('w'):
                    print("Requested to pause. Waiting until 'Esc' is pressed")
                    try:
                        keyboard.wait('esc')
                        print("'Esc' was pressed. Continuing...")
                        time.sleep(1)
                    except Exception:
                        # best effort: if the keyboard hook is unavailable just carry on
                        pass
                elif pressedKey == ord('q'):
                    print("'q' was pressed. Ending connection with current sequence.")
                    return frame_coordinates_stack, "quit"
            except Exception:
                # key polling failed: give up on this sequence only
                break
        except Exception:
            print("\nFailed in cap.read(). Read description:")
            print(traceback.format_exc())
            print("Returning empty stack")
            print("---")
            return frame_coordinates_stack, "failed"

    return frame_coordinates_stack, "ok"


# run video and process videos through media pipe:
def process_video(holistic, mp_holistic, keypoints_data_dir, action_w_source_lst, num_of_sequences_per_class, sequence_length, fps=30, pause_between_actions=False):
    """Record keypoint sequences for every action and save each one as a CSV file.

    For each (action, video_source) pair the source is opened with OpenCV and
    up to `num_of_sequences_per_class` sequences of `sequence_length` frames
    are recorded.  Each complete sequence is written to
    `<keypoints_data_dir>/<action>/<n>.csv` (one row per frame); files that
    already exist are skipped, so an interrupted collection can be resumed.
    The annotated frames of the current sequence are also written to
    `<cwd>/tmp/<action>_<frame_num>.jpg`.

    Keys (while the preview window has focus):
        'w' - pause until 'Esc' is pressed,
        'q' - stop the current action and move on to the next one.
    Interrupting the kernel (Stop button) or any error while opening a source
    ends the whole collection.

    Args:
        holistic: detector forwarded to get_keypoints_from_frame.
        mp_holistic: landmark module forwarded to get_keypoints_from_frame.
        keypoints_data_dir: root directory for the CSV files.
        action_w_source_lst: iterable of [action_name, video_source] pairs,
            where video_source is anything cv2.VideoCapture accepts.
        num_of_sequences_per_class: number of recording attempts per action.
        sequence_length: number of frames (CSV rows) per sequence.
        fps: frame rate requested from the capture device.
        pause_between_actions: when True, wait for 'Enter' before starting the
            next action.  Defaults to False (go straight to the next action).

    Returns:
        None.
    """
    # mkdir for main keypoints directory:
    mkdir_if_none(keypoints_data_dir)

    # cv2.imwrite fails silently when the target folder is missing, so make
    # sure the debug folder exists
    debug_frames_dir = os.path.join(os.getcwd(), "tmp")
    mkdir_if_none(debug_frames_dir)

    print(f"Action order: {[i[0] for i in action_w_source_lst]}")

    # go over each action (class):
    for action, video_source in action_w_source_lst:
        cap = None
        stop_requested = False
        try:
            # initiate class, if no such exists:
            class_dir_path = os.path.join(keypoints_data_dir, action)
            mkdir_if_none(class_dir_path)

            cap = cv2.VideoCapture(video_source)
            cap.set(cv2.CAP_PROP_FPS, fps)

            # Check if the video is opened correctly
            if not cap.isOpened():
                raise IOError(f"Cannot open video {video_source}")

            print(f"Processing video: {video_source} with {fps}FPS")
            time.sleep(1)

            # index of the next file to write; only advanced when a file
            # exists or was saved, so a failed attempt retries the same name
            sequence_counter = 0
            for sequence in range(num_of_sequences_per_class):
                save_file_as = os.path.join(class_dir_path, f"{sequence_counter}.csv")

                if os.path.exists(save_file_as):
                    print(f"Such file ({save_file_as}) already exist. Skipping to next sample...")
                    sequence_counter+=1
                    continue

                frame_coordinates_stack, status = _record_sequence(
                    cap, holistic, mp_holistic, action, sequence,
                    num_of_sequences_per_class, sequence_length, debug_frames_dir)

                # 'q' pressed or the source broke: abandon this action instead
                # of retrying on a capture that can no longer deliver frames
                if status != "ok":
                    break

                if len(frame_coordinates_stack) >= sequence_length:
                    with open(save_file_as, "w", newline="\n") as f:
                        writer = csv.writer(f)
                        writer.writerows(frame_coordinates_stack[:sequence_length])
                    sequence_counter+=1
                else:
                    print(f"<<<WARNING: Gathered less coordinates than expected: {len(frame_coordinates_stack)}/{sequence_length}>>>")
        except (Exception, KeyboardInterrupt):
            # KeyboardInterrupt is caught on purpose: it is how the notebook's
            # Stop button arrives here
            print("======================")
            print("Stop triggered:")
            print(traceback.format_exc())
            print("======================")
            stop_requested = True
        finally:
            # always free the device and the window, also on the stop path
            if cap is not None:
                cap.release()
            try:
                cv2.destroyAllWindows()
            except Exception:
                pass

        if stop_requested:
            break

        if pause_between_actions:
            print("Hit Enter to Continue to next Action")
            try:
                keyboard.wait('enter')
                print("'Enter' was pressed. Starting with next action in 3 seconds...")
                time.sleep(3)
            except (Exception, KeyboardInterrupt):
                print("Pressed on Stop button. Exiting completely.")
                return
        
        
def collect_keypoints_data():
    """Collect keypoint sequences from the webcam for every configured action.

    Creates `<cwd>/data` (not used further in this function) and records
    300 sequences of 30 frames per action from video source 0 into
    `<cwd>/keypoints_data/<action>/` via process_video.  The MediaPipe
    Holistic model is opened in a `with` block so that it is closed again when
    the collection ends or fails.
    """
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Initialization:
    # setup keypoint dataset directory to save:
    dataset_dir = os.path.join(os.getcwd(), "data")
    mkdir_if_none(dataset_dir)

    # setup MediaPipe:
    mp_holistic = mp.solutions.holistic
    print(f"mp.solutions: {mp.solutions}")
    # End of Initialization
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    keypoints_data_dir = os.path.join(os.getcwd(), "keypoints_data")

    # [action name, video source] - source 0 is the default webcam
    action_w_source_lst = [["hello", 0], ["world", 0], ["my", 0], ["me", 0],
                           ["every", 0], ["moment", 0], ["is", 0], ["new", 0],
                           ["beginning", 0]]

    num_of_sequences_per_class = 300  # for train: 60*5  # 60seconds * 5 => 5 minutes => 300 sequences
    sequence_length = 30

    # the context manager releases the model's resources on exit
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5,
                              static_image_mode=False) as holistic_model:
        process_video(holistic_model, mp_holistic, keypoints_data_dir, action_w_source_lst,
                      num_of_sequences_per_class, sequence_length, fps=30)

In [3]:
# Start the collection; sample files that already exist are skipped.
collect_keypoints_data()

mp.solutions: <module 'mediapipe.python.solutions' from 'C:\\Users\\lrspr\\AppData\\Roaming\\Python\\Python310\\site-packages\\mediapipe\\python\\solutions\\__init__.py'>
Action order: ['hello', 'world', 'my', 'me', 'every', 'moment', 'is', 'new', 'beginning']
Processing video: 0 with 30FPS
Such file (C:\Users\lrspr\Desktop\Masters_Program\690_Deep_Learning\Projects\Project_3_due_12_05_22\keypoints_data\hello\0.csv) already exist. Skipping to next sample...
Such file (C:\Users\lrspr\Desktop\Masters_Program\690_Deep_Learning\Projects\Project_3_due_12_05_22\keypoints_data\hello\1.csv) already exist. Skipping to next sample...
Such file (C:\Users\lrspr\Desktop\Masters_Program\690_Deep_Learning\Projects\Project_3_due_12_05_22\keypoints_data\hello\2.csv) already exist. Skipping to next sample...
Such file (C:\Users\lrspr\Desktop\Masters_Program\690_Deep_Learning\Projects\Project_3_due_12_05_22\keypoints_data\hello\3.csv) already exist. Skipping to next sample...
Such file (C:\Users\lrspr\De

In [13]:
from numpy import genfromtxt

# shape validation of the collected data: every sample file must hold 30 rows (frames) of 92 values (x, y for each of the 46 keypoints):
def check_sizes(mode, expected_shape=(30, 92)):
    """Report every sample file of a dataset split whose shape is unexpected.

    Walks `./keypoints_data/<mode>/<class>/<sample>` (relative to the current
    working directory), loads each sample with `genfromtxt` using ',' as the
    delimiter and prints the path and actual shape of every file whose shape
    differs from `expected_shape`.

    Args:
        mode: name of the split sub-directory, e.g. "train" or "val".
        expected_shape: (rows, columns) every sample must have; the default
            (30, 92) is 30 frames with x, y for each of the 46 keypoints.

    Returns:
        None; mismatches are printed.
    """
    expected_shape = tuple(expected_shape)
    data_dir = f"./keypoints_data/{mode}"
    for class_name in os.listdir(data_dir):
        class_dir = os.path.join(data_dir, class_name)
        # skip stray files next to the class folders
        if not os.path.isdir(class_dir):
            continue
        for sample_name in os.listdir(class_dir):
            file_name = os.path.join(class_dir, sample_name)
            sample = genfromtxt(file_name, delimiter=',')
            if sample.shape != expected_shape:
                print(f"{file_name} has different shape: {sample.shape}")
            
# Validate both dataset splits; nothing is printed when every file has the expected shape.
check_sizes(mode="train")
check_sizes(mode="val")