In [1]:
import cv2
import mediapipe as mp
import tensorflow as tf
import time
import numpy as np
import copy
import itertools
import torch
import math


%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


## Models

In [2]:
# MediaPipe hand solution (provides the Hands detector and HAND_CONNECTIONS)
mp_hands = mp.solutions.hands
# MediaPipe drawing helpers and their default landmark/connection styles
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils

In [3]:
# Path to the TFLite hand-keypoint classifier model file
# (relative path, so it is resolved against the notebook's working directory)
model_path = "models/keypoint_classifier.tflite"
class KeyPointClassifier(object):
    """
    Classify hand key points into 8 gestures using a TensorFlow Lite model.

    Instances are callable: ``classifier(frame, hand_landmarks)`` returns the
    index of the highest-scoring gesture class.

    Note: the classification model and class has been taken and refactored from https://github.com/kinivi/tello-gesture-control
    """
    def __init__(
        self,
        model_path="models/keypoint_classifier.tflite",
    ):
        """
        Load the TFLite model and cache its input/output tensor descriptors.

        Args:
            model_path: path to the ``.tflite`` classifier file.
        """
        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

    def __call__(
        self,
        frame,
        hand_landmarks,
    ):
        """
        Classify the gesture of a single detected hand.

        Args:
            frame: image array; only ``frame.shape[0]`` (height) and
                ``frame.shape[1]`` (width) are read.
            hand_landmarks: object exposing a ``landmark`` iterable whose items
                have ``x`` and ``y`` attributes (presumably normalized to the
                image size, as produced by MediaPipe Hands -- confirm).

        Returns:
            Index of the class with the highest score in the model output.
        """
        # Landmark calculation (normalized -> pixel coordinates)
        landmark_list = self._calc_landmark_list(frame, hand_landmarks)

        # Conversion to relative coordinates / normalized coordinates
        pre_processed_landmark_list = self._pre_process_landmark(landmark_list)

        # Feed a batch of one flattened landmark vector to the interpreter
        input_details_tensor_index = self.input_details[0]['index']
        self.interpreter.set_tensor(
            input_details_tensor_index,
            np.array([pre_processed_landmark_list], dtype=np.float32))
        self.interpreter.invoke()

        output_details_tensor_index = self.output_details[0]['index']
        result = self.interpreter.get_tensor(output_details_tensor_index)

        # Drop the batch dimension and pick the best-scoring class
        return np.argmax(np.squeeze(result))

    def _pre_process_landmark(self, landmark_list):
        """
        Turn pixel landmarks into a flat, scale-normalized feature vector.

        Every point is made relative to the first point in the list, the
        ``[x, y]`` pairs are flattened into one list, and all values are divided
        by the largest absolute value so they fall in ``[-1, 1]``.

        Args:
            landmark_list: list of ``[x, y]`` pairs; it is not modified.

        Returns:
            Flat list ``[x0, y0, x1, y1, ...]`` of floats. If all points
            coincide (or the list is empty) there is nothing to scale by, and a
            list of zeros of the matching length is returned instead of
            dividing by zero.
        """
        # Work on a copy so the caller's landmark list stays untouched
        temp_landmark_list = copy.deepcopy(landmark_list)

        # Convert to relative coordinates (origin = first landmark)
        base_x, base_y = 0, 0
        for index, landmark_point in enumerate(temp_landmark_list):
            if index == 0:
                base_x, base_y = landmark_point[0], landmark_point[1]

            landmark_point[0] = landmark_point[0] - base_x
            landmark_point[1] = landmark_point[1] - base_y

        # Convert to a one-dimensional list
        flat_landmarks = list(itertools.chain.from_iterable(temp_landmark_list))

        # Normalization by the largest absolute coordinate
        max_value = max(map(abs, flat_landmarks), default=0)
        if max_value == 0:
            # Degenerate hand (all points identical): avoid ZeroDivisionError
            return [0.0 for _ in flat_landmarks]

        return [value / max_value for value in flat_landmarks]

    def _calc_landmark_list(self, image, landmarks):
        """
        Convert landmark coordinates to integer pixel coordinates.

        Each landmark's ``x``/``y`` is multiplied by the image width/height,
        truncated to ``int`` and capped at ``width - 1`` / ``height - 1``.
        Only the upper bound is clamped; negative values pass through.

        Args:
            image: array whose ``shape[1]`` is the width and ``shape[0]`` the
                height.
            landmarks: object exposing a ``landmark`` iterable of items with
                ``x`` and ``y`` attributes.

        Returns:
            List of ``[x, y]`` integer pairs, one per landmark.
        """
        image_width, image_height = image.shape[1], image.shape[0]

        landmark_point = []

        # Keypoint (the z coordinate is intentionally not used)
        for landmark in landmarks.landmark:
            landmark_x = min(int(landmark.x * image_width), image_width - 1)
            landmark_y = min(int(landmark.y * image_height), image_height - 1)

            landmark_point.append([landmark_x, landmark_y])

        return landmark_point


# initialize hand keypoint classifier
# (constructing it loads the TFLite model at model_path and allocates its tensors)
key_point_classifier = KeyPointClassifier(model_path)

INFO: Initialized TensorFlow Lite runtime.


In [4]:
# load yolo v5 nano model (for person detection) via torch.hub
# NOTE: the first call fetches the repo/weights; later calls reuse the local torch hub cache
# Alternative kept for reference: custom Apple-optimized weights
#yolo = torch.hub.load('ultralytics/yolov5', 'custom', 'yolov5n.mlmodel') # apple optimized
yolo = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# limit yolo model to only detect persons
# (class index 0; presumably "person" in the label set of the pretrained weights -- confirm)
yolo.classes = [0]

Using cache found in /Users/hugo/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-4-23 torch 1.11.0 CPU

Fusing layers... 
[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients
Adding AutoShape... 


## Helper functions

In [5]:
def calc_bounding_rect(image, landmarks):
    """
    Compute the pixel bounding box that encloses all hand landmarks.

    Each landmark's x/y is scaled by the image width/height, truncated to int
    and capped at width - 1 / height - 1 before the box is computed with
    cv2.boundingRect.

    Returns:
        [x_min, y_min, x_max, y_max] as produced from (x, y, w, h).
    """
    height, width = image.shape[0], image.shape[1]

    # Collect every landmark as an integer (x, y) pixel position
    pixel_points = [
        (min(int(lm.x * width), width - 1), min(int(lm.y * height), height - 1))
        for lm in landmarks.landmark
    ]

    # Keep an (N, 2) integer array even when there are no landmarks
    landmark_array = np.array(pixel_points, dtype=int).reshape(-1, 2)

    left, top, box_w, box_h = cv2.boundingRect(landmark_array)

    return [left, top, left + box_w, top + box_h]

def calc_center(brect):
    """
    Return the (x, y) center of a bounding box given as
    [x_min, y_min, x_max, y_max].
    """
    center_x = (brect[0] + brect[2]) / 2
    center_y = (brect[1] + brect[3]) / 2
    return center_x, center_y

def draw_info(image, brect, hand_sign_text = ""):
    """
    Draw a hand bounding box with a filled label bar above it.

    The image is modified in place: a black outline around brect
    ([x_min, y_min, x_max, y_max]), a 22 px high black bar on top of the box,
    and hand_sign_text in white inside that bar.
    """
    black = (0, 0, 0)
    white = (255, 255, 255)
    left, top = brect[0], brect[1]
    right, bottom = brect[2], brect[3]

    # Outline of the hand bounding box
    cv2.rectangle(image, (left, top), (right, bottom), black, 3)

    # Filled label background directly above the box, then the label itself
    cv2.rectangle(image, (left, top), (right, top - 22), black, -1)
    cv2.putText(image, hand_sign_text, (left + 5, top - 4),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, white, 1, cv2.LINE_AA)

def draw_person(image, p):
    """
    Outline a detected person on the image (modified in place).

    p must expose xmin, ymin, xmax and ymax; the values are truncated to int
    and the box is drawn with color (0, 0, 255) and thickness 2.
    """
    top_left = (int(p.xmin), int(p.ymin))
    bottom_right = (int(p.xmax), int(p.ymax))
    cv2.rectangle(image, top_left, bottom_right, (0, 0, 255), 2)

def in_bounding_box(p, bbox):
    """
    Tell whether point p = (x, y) lies inside bbox = [x_min, y_min, x_max, y_max].

    The box edges count as inside.
    """
    return bbox[0] <= p[0] <= bbox[2] and bbox[1] <= p[1] <= bbox[3]

def euclidian_distance(p1, p2):
    """
    Return the straight-line distance between two 2D points,
    using only the first two components of each.
    """
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

## Result: Milestone 1

In [6]:
# Live demo: read webcam frames, detect hands and classify their gesture, and
# highlight a person when at least two distinct "sign" hands lie inside that
# person's bounding box. Press ESC in the preview window to quit.

# Class index returned by the keypoint classifier for the sign of interest ("up")
SIGN_GESTURE_IDX = 2
# Hand centers closer than this many pixels count as the same hand (duplicate detection)
MIN_HAND_DISTANCE_PX = 50
# Key code reported by cv2.waitKey for the ESC key
ESC_KEY = 27

# init values for FPS calculation
prev_frame_time = 0
new_frame_time = 0

# start video capture
cap = cv2.VideoCapture(0)

try:
    with mp_hands.Hands(
            model_complexity=0,
            min_detection_confidence=0.6,
            min_tracking_confidence=0.5,
            max_num_hands=4) as hands:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue

            # flip image for selfie mode
            image = cv2.flip(image, 1)

            # Process the image in a non-writable way (faster) + convert to RGB to detect hands
            img_copy = image.copy()
            img_copy.flags.writeable = False
            img_copy = cv2.cvtColor(img_copy, cv2.COLOR_BGR2RGB)
            results = hands.process(img_copy)

            # if hands detected
            if results.multi_hand_landmarks:
                hand_centers = []
                brects = []
                # for each hand
                for hand_landmarks in results.multi_hand_landmarks:
                    # draw hand keypoints
                    mp_drawing.draw_landmarks(
                        image,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        mp_drawing_styles.get_default_hand_landmarks_style(),
                        mp_drawing_styles.get_default_hand_connections_style())

                    # classify keypoints (the image is only used for its size)
                    gesture_idx = key_point_classifier(image, hand_landmarks)
                    # if sign detected
                    if gesture_idx == SIGN_GESTURE_IDX:
                        # calculate bounding box of hand and remember its center
                        brect = calc_bounding_rect(image, hand_landmarks)
                        hand_centers.append(calc_center(brect))
                        brects.append(brect)

                # if the sign is detected at least once
                if len(hand_centers) > 0:
                    # Detect persons only now that the result is actually needed.
                    # The RGB copy is used because YOLOv5 expects RGB for numpy
                    # input; it is also free of the landmark overlay drawn above.
                    yolo_pred = yolo(img_copy)
                    persons = yolo_pred.pandas().xyxy[0]

                    for _, p in persons.iterrows():
                        person_box = [p.xmin, p.ymin, p.xmax, p.ymax]
                        rel_hand_centers = []
                        # check if at least two hands are in the person bounding box
                        # (and discard too close hand detections)
                        # zip keeps each center paired with its own box, even if
                        # two hands happen to share the same center
                        for hand_center, brect in zip(hand_centers, brects):
                            dup = any(
                                euclidian_distance(hand_center, rhc) < MIN_HAND_DISTANCE_PX
                                for rhc in rel_hand_centers)
                            if in_bounding_box(hand_center, person_box) and not dup:
                                rel_hand_centers.append(hand_center)
                                draw_info(image, brect, "Sign detected")
                        if len(rel_hand_centers) > 1:
                            # only the first matching person is highlighted
                            draw_person(image, p)
                            break

            # Calculate FPS + show on frame
            new_frame_time = time.time()
            elapsed = new_frame_time - prev_frame_time
            # guard against a zero interval on coarse clocks
            fps = 1 / elapsed if elapsed > 0 else 0
            prev_frame_time = new_frame_time
            fps = str(int(fps))
            cv2.putText(image, fps, (7, 70), cv2.FONT_HERSHEY_SIMPLEX, 3, (100, 255, 0), 3, cv2.LINE_AA)

            cv2.imshow('MediaPipe Hands', image)
            if cv2.waitKey(5) & 0xFF == ESC_KEY:
                break
finally:
    # Always free the camera and close the preview window, including when the
    # cell is interrupted from the notebook (KeyboardInterrupt) or an error occurs
    cap.release()
    cv2.destroyAllWindows()

INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


KeyboardInterrupt: 