In [18]:
# Clone the CPU fork of NanoDet into the current working directory.
!git clone https://github.com/HugoCasa/nanodet-cpu.git
# Change the notebook's working directory to the clone; the relative paths used
# in later cells ("config/...", "../models/...") are resolved from here.
%cd nanodet-cpu
# Install the NanoDet requirements (only needed once, hence commented out):
#!pip install -r requirements.txt # only need once
# Install the nanodet package in development mode so it can be imported below.
!python setup.py develop

Cloning into 'nanodet-cpu'...
remote: Enumerating objects: 2517, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 2517 (delta 0), reused 0 (delta 0), pack-reused 2515[K
Receiving objects: 100% (2517/2517), 5.23 MiB | 14.80 MiB/s, done.
Resolving deltas: 100% (1472/1472), done.
/Users/hugo/projects/dlav/nanodet-cpu
running develop
running egg_info
creating nanodet.egg-info
writing nanodet.egg-info/PKG-INFO
writing dependency_links to nanodet.egg-info/dependency_links.txt
writing top-level names to nanodet.egg-info/top_level.txt
writing manifest file 'nanodet.egg-info/SOURCES.txt'
reading manifest file 'nanodet.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'nanodet.egg-info/SOURCES.txt'
running build_ext
Creating /Users/hugo/opt/anaconda3/envs/yolox/lib/python3.7/site-packages/nanodet.egg-link (link to .)
Adding nanodet 1.0.0a0 to easy-install.pth file

Installed /Users/hugo/projects/dlav/

In [19]:
import cv2
import mediapipe as mp
import tensorflow as tf
import time
import torch
import itertools
import numpy as np
import copy

In [20]:
from nanodet.util import Logger, cfg, load_config
from demo.demo import Predictor

In [21]:
# Load the NanoDet-m 0.5x (legacy v0.x) configuration file into the shared `cfg` object.
load_config(cfg, 'config/legacy_v0.x_configs/nanodet-m-0.5x.yml')
# NOTE(review): this logger is created but not handed to the Predictor below
# (None is passed in the logger position) -- confirm whether that is intended.
logger = Logger(0, use_tensorboard=False)
# Build the person detector from the checkpoint stored next to the clone, running on CPU.
predictor = Predictor(cfg, '../models/nanodet_m_0.5x.ckpt', None, device="cpu")

model size is  0.5x
init weights...
=> loading pretrained model https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth
Finish initialize NanoDet Head.


In [22]:
# MediaPipe drawing helpers and default styles, used in the capture loop to
# render the detected hand landmarks and their connections.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
# MediaPipe hands solution module; the detector itself is instantiated later
# through `mp_hands.Hands(...)`.
mp_hands = mp.solutions.hands

In [23]:
# Path of the TFLite hand keypoint classifier model. It is relative to the
# current working directory, which an earlier cell changed to the nanodet-cpu
# clone (hence the leading "../").
model_path = "../models/keypoint_classifier.tflite"
class KeyPointClassifier(object):
    """
    Classify hand key points into 8 gestures.

    Wraps a TensorFlow Lite interpreter: hand landmarks are converted to pixel
    coordinates, made relative to the first landmark, scaled by the largest
    absolute coordinate and fed to the model.

    Note: the classification model and class has been taken and refactored from https://github.com/kinivi/tello-gesture-control
    """

    def __init__(self, model_path="models/keypoint_classifier.tflite"):
        """Load the TFLite model at `model_path` and allocate its tensors."""
        self.interpreter = tf.lite.Interpreter(model_path=model_path)
        self.interpreter.allocate_tensors()
        self.input_details = self.interpreter.get_input_details()
        self.output_details = self.interpreter.get_output_details()

    def __call__(self, frame, hand_landmarks):
        """
        Classify one hand.

        Args:
            frame: image array; only `frame.shape[0]` (height) and
                `frame.shape[1]` (width) are read.
            hand_landmarks: object exposing a `.landmark` iterable whose items
                have `.x` and `.y` attributes (multiplied by the frame size, so
                presumably normalized coordinates).

        Returns:
            Index of the highest-scoring entry of the model's first output tensor.
        """
        # Landmarks -> pixel coordinates
        landmark_list = self._calc_landmark_list(frame, hand_landmarks)

        # Pixel coordinates -> relative, normalized, flattened feature vector
        features = self._pre_process_landmark(landmark_list)

        # The model is fed a batch containing this single feature vector
        self.interpreter.set_tensor(
            self.input_details[0]['index'],
            np.array([features], dtype=np.float32))
        self.interpreter.invoke()

        result = self.interpreter.get_tensor(self.output_details[0]['index'])
        return np.argmax(np.squeeze(result))

    def _pre_process_landmark(self, landmark_list):
        """
        Turn a list of [x, y] points into a flat, normalized feature list.

        Each point is expressed relative to the first point, the result is
        flattened to [x0, y0, x1, y1, ...] and divided by the largest absolute
        value. The input list is not modified.

        Degenerate inputs are handled explicitly: an empty list yields `[]`,
        and a list whose points all coincide yields all zeros instead of
        raising ZeroDivisionError.
        """
        if not landmark_list:
            return []

        base_x, base_y = landmark_list[0][0], landmark_list[0][1]

        # Convert to relative coordinates and flatten in a single pass
        flattened = []
        for point in landmark_list:
            flattened.append(point[0] - base_x)
            flattened.append(point[1] - base_y)
            # keep any extra components untouched
            flattened.extend(point[2:])

        # Normalization
        max_value = max(map(abs, flattened))
        if max_value == 0:
            # every landmark sits on the base point: nothing to scale
            return [0.0] * len(flattened)

        return [value / max_value for value in flattened]

    def _calc_landmark_list(self, image, landmarks):
        """
        Convert landmarks to integer pixel coordinates.

        Each coordinate is scaled by the image width/height, truncated to int
        and capped at the last valid pixel index. Returns a list of [x, y] lists.
        """
        image_width, image_height = image.shape[1], image.shape[0]

        return [
            [
                min(int(landmark.x * image_width), image_width - 1),
                min(int(landmark.y * image_height), image_height - 1),
            ]
            for landmark in landmarks.landmark
        ]


# Build the hand keypoint classifier once from the model path defined above;
# the capture loop calls it for every detected hand.
key_point_classifier = KeyPointClassifier(model_path)

INFO: Initialized TensorFlow Lite runtime.


In [24]:
def calc_bounding_rect(image, landmarks):
    """
    Compute the bounding box of a hand from its landmarks.

    Landmark coordinates are scaled by the image size, truncated to int and
    capped at the last valid pixel index before the box is computed.

    Returns:
        [x_min, y_min, x_max, y_max] where the max values are
        `x + w` / `y + h` of the rectangle reported by cv2.boundingRect.
    """
    height, width = image.shape[0], image.shape[1]

    # One (x, y) pixel pair per landmark
    pixel_points = [
        (min(int(lm.x * width), width - 1), min(int(lm.y * height), height - 1))
        for lm in landmarks.landmark
    ]

    # reshape keeps the (N, 2) layout even when there are no landmarks
    point_array = np.array(pixel_points, dtype=int).reshape(-1, 2)

    left, top, box_w, box_h = cv2.boundingRect(point_array)

    return [left, top, left + box_w, top + box_h]

def calc_center(brect):
    """Return the (x, y) center of a box given as [x_min, y_min, x_max, y_max]."""
    center_x = (brect[0] + brect[2]) / 2
    center_y = (brect[1] + brect[3]) / 2
    return center_x, center_y

def draw_info(image, brect, hand_sign_text=""):
    """
    Draw a hand bounding box with a caption onto `image` (in place).

    Args:
        image: image passed straight to the cv2 drawing calls.
        brect: box as [x_min, y_min, x_max, y_max].
        hand_sign_text: caption written in the filled strip above the box.
    """
    left, top, right, bottom = brect[0], brect[1], brect[2], brect[3]
    black = (0, 0, 0)
    white = (255, 255, 255)

    # Outline of the hand box
    cv2.rectangle(image, (left, top), (right, bottom), black, 3)

    # Filled caption strip, 22 px high, sitting on top of the box
    cv2.rectangle(image, (left, top), (right, top - 22), black, -1)

    # Caption text inside the strip
    cv2.putText(image, hand_sign_text, (left + 5, top - 4),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, white, 1, cv2.LINE_AA)

def draw_person(image, box):
    """
    Draw a red (BGR (0, 0, 255)) rectangle around a person onto `image` (in place).

    Args:
        image: image passed straight to cv2.rectangle.
        box: sequence whose first four entries are x0, y0, x1, y1; each is
            truncated to int before drawing.
    """
    x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3])

    color = (0, 0, 255)
    cv2.rectangle(image, (x0, y0), (x1, y1), color, 2)

def in_bounding_box(p, bbox):
    """
    Tell whether point `p` lies inside `bbox` (edges included).

    `p` is indexed as (x, y) and `bbox` as (x_min, y_min, x_max, y_max).
    """
    inside_horizontally = bbox[0] <= p[0] <= bbox[2]
    return inside_horizontally and bbox[1] <= p[1] <= bbox[3]

In [26]:
# Webcam demo: detect hands, classify their gesture and, when at least two
# hands show the sign inside the same person box, highlight that person.
# Press Esc in the preview window to stop.

# Gesture index returned by the keypoint classifier for the "up" sign
SIGN_GESTURE_IDX = 2
# Detector label whose boxes are treated as persons
PERSON_LABEL = 0
# Minimum detector score for a person box to be considered
PERSON_SCORE_THRESHOLD = 0.35

# init values for FPS calculation
prev_frame_time = 0
new_frame_time = 0

# start video capture
cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(
        model_complexity=0,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
        max_num_hands=4,
    ) as hands:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue

            # flip image for selfie mode
            image = cv2.flip(image, 1)

            # MediaPipe expects RGB; cvtColor already returns a new array, so no
            # extra copy is needed. Marking it read-only lets it be passed by
            # reference, as the MediaPipe examples suggest.
            rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            rgb_frame.flags.writeable = False
            results = hands.process(rgb_frame)

            # centers of all hands currently showing the sign
            hand_centers = []
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # draw hand keypoints
                    mp_drawing.draw_landmarks(
                        image,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        mp_drawing_styles.get_default_hand_landmarks_style(),
                        mp_drawing_styles.get_default_hand_connections_style())

                    # classify keypoints (only the frame size is used by the classifier)
                    gesture_idx = key_point_classifier(image, hand_landmarks)
                    if gesture_idx == SIGN_GESTURE_IDX:
                        # draw bounding box of hand and remember its center
                        brect = calc_bounding_rect(image, hand_landmarks)
                        draw_info(image, brect, "Sign detected")
                        hand_centers.append(calc_center(brect))

            # Person detection only runs when at least two hands show the sign
            if len(hand_centers) > 1:
                # NOTE(review): inference runs on the frame that already carries the
                # landmark/box overlays drawn above -- consider feeding a clean copy.
                meta, res = predictor.inference(image)
                dets = res[0]
                for label in dets:
                    if label != PERSON_LABEL:
                        continue
                    for bbox in dets[label]:
                        # last entry of a detection is its score, first four its box
                        if bbox[-1] <= PERSON_SCORE_THRESHOLD:
                            continue
                        hands_in_box = sum(
                            1 for hand_center in hand_centers
                            if in_bounding_box(hand_center, bbox[:4])
                        )
                        # highlight the person only if two or more signing hands are inside
                        if hands_in_box > 1:
                            draw_person(image, bbox[:4])

            # Calculate FPS + show on frame
            new_frame_time = time.time()
            elapsed = new_frame_time - prev_frame_time
            # guard against a zero interval on coarse system clocks
            fps = str(int(1 / elapsed)) if elapsed > 0 else "0"
            prev_frame_time = new_frame_time
            cv2.putText(image, fps, (7, 70), cv2.FONT_HERSHEY_SIMPLEX, 3, (100, 255, 0), 3, cv2.LINE_AA)

            cv2.imshow('MediaPipe Hands', image)
            # Esc quits
            if cv2.waitKey(5) & 0xFF == 27:
                break
finally:
    # Always free the camera and close the preview window, including when the
    # cell is interrupted from the notebook (KeyboardInterrupt).
    cap.release()
    cv2.destroyAllWindows()

forward time: 0.018s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.018s | decode time: 0.005s | forward time: 0.018s | decode time: 0.004s | forward time: 0.018s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.018s | decode time: 0.005s | forward time: 0.018s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.004s | forward time: 0.017s | decode time: 0.005s | forward time: 0.017s | decode time: 0.005s | forward time: 0.018s | decode time: 0.004s | forward time: 0.017s | decode time: 0.005s | 

KeyboardInterrupt: 