# Gesture Recognition using MediaPipe
## Step 1: Setting up video capture
We’ll import OpenCV to set up our video capture, where we’ll see our live feed in an application window.

**Note:**
Video capture frames are in BGR, not RGB, so we’ll have to convert back and forth before/after processing.


In [4]:
import cv2

# Open capture device 0 and show its frames in a window until 'q' is pressed.
vid_cap = cv2.VideoCapture(0)
if not vid_cap.isOpened():
	print("Error in opening video capture")
	# Raise SystemExit directly instead of calling the interactive `exit`
	# helper, so the abort does not depend on that helper being installed.
	raise SystemExit(1)

try:
	while True:
		success, image = vid_cap.read()
		if not success:
			print("Error in reading in frames")
			break

		cv2.imshow("Return feed", image)
		# Mask to the low byte before comparing the key code with 'q'.
		if cv2.waitKey(1) & 0xFF == ord('q'):
			break
finally:
	# Release the camera and close the window even if the loop raised
	# (e.g. KeyboardInterrupt), so the device is not left locked.
	vid_cap.release()
	cv2.destroyAllWindows()

## Step 2: Adding MediaPipe
### Options
- Parameter objects fed to MediaPipe Tasks/Solutions
- You can see we have `BaseOptions` and `GestureRecognizerOptions`, meant for generic MediaPipe model settings and model-specific tuning parameters, respectively
- Converting BGR to RGB and back for display



In [5]:
import time
from mediapipe.tasks.python.vision import GestureRecognizer
from mediapipe.tasks.python.vision.core.vision_task_running_mode import VisionTaskRunningMode
from mediapipe.tasks.python import BaseOptions
from mediapipe.tasks.python.vision.gesture_recognizer import GestureRecognizerOptions
import mediapipe as mp

def print_result(result, image, timestamp):
	"""Placeholder result callback: accept the three arguments and do nothing.

	It is handed to the recognizer options as ``result_callback``; none of
	the arguments are inspected here.
	"""
	return None


# Recognizer configuration: live-stream mode delivers results through
# `result_callback` instead of returning them from the recognize call.
options = GestureRecognizerOptions(
	base_options=BaseOptions(model_asset_path='assets/gesture_recognizer.task'),
	running_mode=VisionTaskRunningMode.LIVE_STREAM,
	num_hands=2,
	min_tracking_confidence=0.4,
	min_hand_detection_confidence=0.7,
	min_hand_presence_confidence=0.6,
	result_callback=print_result)

vid_cap = cv2.VideoCapture(0)
if not vid_cap.isOpened():
	print("Error in opening video capture")
	# Raise SystemExit directly instead of calling the interactive `exit`
	# helper, so the abort does not depend on that helper being installed.
	raise SystemExit(1)

# Timestamp of the previously submitted frame. Each frame is forced to be at
# least 1 ms later so the values handed to recognize_async always increase,
# even if the wall clock repeats a millisecond or is adjusted backwards.
last_timestamp_ms = -1

try:
	with GestureRecognizer.create_from_options(options) as recognizer:
		while True:
			success, image = vid_cap.read()
			if not success:
				print("Error in reading in live frames")
				# Leave the loop (rather than exiting the process) so the
				# cleanup below still runs, matching the other capture loops.
				break

			# OpenCV delivers BGR; the mp.Image below is declared as SRGB.
			image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
			# Mark the frame read-only while it is handed to the recognizer.
			image.flags.writeable = False

			timestamp_ms = max(int(time.time() * 1000), last_timestamp_ms + 1)
			last_timestamp_ms = timestamp_ms

			img_arr = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)
			recognizer.recognize_async(img_arr, timestamp_ms)

			# Back to a writable BGR frame for display.
			image.flags.writeable = True
			image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

			cv2.imshow('Return Feed', image)
			if cv2.waitKey(1) & 0xFF == ord('q'):
				break
finally:
	# Release the camera and close the window even if the loop raised.
	vid_cap.release()
	cv2.destroyAllWindows()

## Step 3: Drawing on landmarks
We've been able to do the recognition, but there's no visual feedback to verify, analyze, or implement this. Let's change that with landmark drawing

In [6]:
import numpy as np
from mediapipe.python.solutions.drawing_utils import DrawingSpec
from mediapipe.python import solutions
from mediapipe.framework.formats import landmark_pb2


def draw_image_landmarks(og_img, result):
	"""Return a copy of ``og_img`` annotated with the hands found in ``result``.

	For every detected hand the landmark skeleton is drawn and a
	``"<handedness>:<gesture>"`` label is placed just above the hand's
	bounding box. The label position is clamped so it stays inside the frame
	when the hand touches the top or left edge.

	Args:
		og_img: Frame to annotate, as a NumPy image array whose first two
			dimensions are height and width. It is never modified.
		result: Recognition result exposing ``hand_landmarks``,
			``handedness`` and ``gestures``; the three lists are read in
			lockstep, one entry per detected hand.

	Returns:
		The annotated copy of ``og_img`` (an unmodified copy when ``result``
		contains no hands).
	"""
	# Annotate a copy so the caller's frame is left untouched.
	annotated_image = np.copy(og_img)
	height, width = annotated_image.shape[:2]

	landmark_color = (0, 255, 240)
	label_color = (0, 0, 0)
	label_scale = 1
	label_thickness = 3

	# Each list holds one entry per detected hand (up to the configured num_hands).
	for hand_landmarks, handedness, hand_gestures in zip(
			result.hand_landmarks, result.handedness, result.gestures):
		# draw_landmarks is given a NormalizedLandmarkList proto, so copy the
		# landmark coordinates into one.
		hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
		hand_landmarks_proto.landmark.extend([
			landmark_pb2.NormalizedLandmark(x=point.x, y=point.y, z=point.z)
			for point in hand_landmarks
		])

		# Use the drawing solution to draw and connect the landmarks.
		solutions.drawing_utils.draw_landmarks(
			annotated_image,
			hand_landmarks_proto,
			solutions.hands.HAND_CONNECTIONS,
			DrawingSpec(color=landmark_color),
			DrawingSpec(color=landmark_color))

		# Label text: handedness (left/right) plus the top gesture, if any.
		hand_name = handedness[0].category_name
		gesture_name = hand_gestures[0].category_name if hand_gestures else ''
		label = f"{hand_name}:{gesture_name}"

		# Anchor the label 10 px above the top-left corner of the hand's
		# bounding box. putText's origin is the text's bottom-left corner, so
		# keep y at least one text height from the top and x non-negative;
		# otherwise the label is drawn outside the frame.
		font = cv2.FONT_HERSHEY_DUPLEX
		(_, label_height), _ = cv2.getTextSize(label, font, label_scale, label_thickness)
		left = min(point.x for point in hand_landmarks)
		top = min(point.y for point in hand_landmarks)
		text_x = max(int(left * width), 0)
		text_y = max(int(top * height) - 10, label_height + label_thickness)

		cv2.putText(annotated_image, label,
					(text_x, text_y), font,
					label_scale, label_color, label_thickness, cv2.LINE_AA)

	return annotated_image

Let's also make changes to our existing code to align with this approach

In [9]:
# Most recent recognition result handed to the callback below; stays None
# until the callback has been invoked at least once.
RESULT = None


def print_result(result, image, timestamp):
	"""Store ``result`` in the module-level ``RESULT`` for later use.

	``image`` and ``timestamp`` are accepted but not used. Nothing is printed
	and nothing is returned.
	"""
	global RESULT
	RESULT = result


# Recognizer configuration: live-stream mode delivers results through
# `result_callback` instead of returning them from the recognize call.
options = GestureRecognizerOptions(
	base_options=BaseOptions(model_asset_path='assets/gesture_recognizer.task'),
	running_mode=VisionTaskRunningMode.LIVE_STREAM,
	num_hands=2,
	min_tracking_confidence=0.4,
	min_hand_detection_confidence=0.7,
	min_hand_presence_confidence=0.6,
	result_callback=print_result)

vid_cap = cv2.VideoCapture(0)
if not vid_cap.isOpened():
	print("Error in opening video capture")
	# Raise SystemExit directly instead of calling the interactive `exit`
	# helper, so the abort does not depend on that helper being installed.
	raise SystemExit(1)

# Timestamp of the previously submitted frame. Each frame is forced to be at
# least 1 ms later so the values handed to recognize_async always increase,
# even if the wall clock repeats a millisecond or is adjusted backwards.
last_timestamp_ms = -1

try:
	with GestureRecognizer.create_from_options(options) as recognizer:
		while True:
			success, image = vid_cap.read()
			if not success:
				print("Error in reading in live frames")
				break

			# OpenCV delivers BGR; the mp.Image below is declared as SRGB.
			image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
			# Mark the frame read-only while it is handed to the recognizer.
			image.flags.writeable = False

			timestamp = max(int(round(time.time() * 1000)), last_timestamp_ms + 1)
			last_timestamp_ms = timestamp
			mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

			recognizer.recognize_async(mp_image, timestamp)

			# Back to a writable BGR frame for drawing and display.
			image.flags.writeable = True
			image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

			# Read the global once: the callback may rebind it at any moment,
			# so test and draw from the same snapshot. Until the first
			# callback has stored a result, the raw frame is shown.
			latest_result = RESULT
			if latest_result is not None:
				image = draw_image_landmarks(image, latest_result)

			new_win = cv2.resize(image, (700, 500))
			cv2.imshow('Return Feed', new_win)
			if cv2.waitKey(1) & 0xFF == ord('q'):
				break
finally:
	# Release the camera and close the window even if the loop raised.
	vid_cap.release()
	cv2.destroyAllWindows()

# ALL DONE