### **Imports**

In [1]:
# OS imports
from os.path import join, realpath, dirname, exists, abspath, isfile, isdir
from os import mkdir, name as os_name, getcwd, environ, pathsep, rename, listdir

# Mediapipe imports
from mediapipe.python.solutions import drawing_utils as du 
from mediapipe.python.solutions import hands
from google.protobuf.json_format import MessageToDict

# Tensorflow & Keras imports
import tensorflow as tf
from tensorflow.python.keras.utils.all_utils import to_categorical
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten
from tensorflow.python.keras.callbacks import TensorBoard
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras import optimizers

# Data handling/visualization imports
import cv2
from cv2 import imread, imshow, imwrite, flip, cvtColor, COLOR_BGR2RGB
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
from tabulate import tabulate

# General data handling imports
from numpy import array, zeros, concatenate, save, load, argmax, expand_dims, append
import numpy as np
from uuid import uuid1

### **Definitions**

In [2]:
# Options & Settings 💾
# Identifiers of the model being built: version, architecture and data set.
MODEL_VERSION = 0.1
MODEL_ARCH_ID = 'conv1'
MODEL_DATA_SET = 'local1'

# Single name assembled from the three identifiers above.
MODEL_NAME = f'v{MODEL_VERSION}_arch-{MODEL_ARCH_ID}_data-{MODEL_DATA_SET}'

# Values passed to mediapipe's `hands.Hands(...)`.
MP_MODEL_COMPLEXITY = 0
MP_DETECTION_CONFIDENCE = 0.75
MP_TRACKING_CONFIDENCE = 0.75
MP_NUM_HANDS = 1

# One sign per lowercase letter, in alphabetical order.
SIGNS = list('abcdefghijklmnopqrstuvwxyz')

# Every class label: 'none' at index 0, followed by the signs.
ALL_SIGNS = ['none', *SIGNS]
CLASS_COUNT = len(ALL_SIGNS)

# Constants 🚧
HAND_LANDMARK_COUNT = 21 # https://mediapipe.dev/images/mobile/hand_landmarks.png
HAND_POINT_COUNT = 3 # (x, y, z)
HAND_LANDMARK_POINTS = HAND_LANDMARK_COUNT * HAND_POINT_COUNT # coordinate values per hand

# Paths 📁
# Everything is resolved relative to the directory the notebook is run from.
ROOT_DIR = getcwd()

# Model artifacts: models/<MODEL_NAME>/{logs, signs.h5}
MODELS_DIR = join(ROOT_DIR, 'models')
MODEL_DIR = join(MODELS_DIR, MODEL_NAME)
LOG_DIR = join(MODEL_DIR, 'logs')
SAVED_MODEL_PATH = join(MODEL_DIR, 'signs.h5')
EXPORTED_MODEL_DIR = join(MODEL_DIR)

# Data and image folders
DATA_DIR = join(ROOT_DIR, 'data')
IMAGES_DIR = join(ROOT_DIR, 'images')
(
  TEST_IMAGES_DIR,
  COLLECTED_IMAGES_DIR,
  PREPROCESSED_IMAGES_DIR,
  PROCCESSED_IMAGES_DIR,
  REJECTED_IMAGES_DIR,
) = (
  join(IMAGES_DIR, sub_dir)
  for sub_dir in ('test', 'collected', 'preprocessed', 'processed', 'rejected')
)

# Global Utils 📐
def create_dir(dir_path: str, notify: bool = True):
  """
  Creates the directory `dir_path`, together with any missing parent directories

  @param `dir_path` path of the directory to create
  @param `notify` when True, prints a message if `dir_path` already exists
  """
  from os import makedirs  # local import: the file-level imports only bring in `mkdir`

  if exists(dir_path):
    if notify: print(f'{dir_path} already exists!')
    return

  # exist_ok covers the path being created by someone else between the check above and this call
  makedirs(dir_path, exist_ok=True)

def dir_exists(dir_path: str) -> bool:
  """
  Tells whether `dir_path` exists and is a directory

  @param `dir_path` path to check
  @returns True only when the path exists and is a directory
  """
  if not exists(dir_path):
    return False
  return isdir(dir_path)

# All-zero keypoints: one (x, y, z) row per hand landmark
EMPTY_KEYPOINTS = zeros(shape=(HAND_LANDMARK_COUNT, HAND_POINT_COUNT))

### **Mediapipe Utilities**

In [None]:
from typing import NamedTuple, Tuple
from numpy import ndarray

def draw_landmarks(image, results):
  """
  Draws every hand found in `results` (landmarks and their connections) onto `image`

  @param `image` frame to draw on
  @param `results` mediapipe's hand solution output
  @returns the `image` that was passed in, whether or not a hand was drawn
  """
  if not results.multi_hand_landmarks: return image

  # landmarks and connections are drawn with the same look
  landmark_spec = du.DrawingSpec(color=(119, 252, 3), thickness=1, circle_radius=2)
  connection_spec = du.DrawingSpec(color=(119, 252, 3), thickness=1, circle_radius=2)

  for hand in results.multi_hand_landmarks:
    du.draw_landmarks(image, hand, hands.HAND_CONNECTIONS, landmark_spec, connection_spec)

  # previously only the no-hand path returned the image; return it on every path
  return image
    
def draw_img_landmarks(image, hand_landmarks):
  """
  Draws each hand in `hand_landmarks` onto `image` using thick strokes, with one color for the points and another for the edges

  @param `image` image to draw on
  @param `hand_landmarks` iterable of per-hand landmark lists
  """
  for hand in hand_landmarks:
    point_style = du.DrawingSpec(color=(224,0,0), thickness=32, circle_radius=5)
    edge_style = du.DrawingSpec(color=(0,0,224), thickness=32, circle_radius=5)
    du.draw_landmarks(image, hand, hands.HAND_CONNECTIONS, point_style, edge_style)

def mediapipe_detection(image, hands: hands.Hands):
  """
  Runs mediapipe hand detection on a BGR frame

  @param `image` frame in BGR channel order
  @param `hands` mediapipe `Hands` instance used to process the frame
  @returns tuple `(image, results)`: the frame converted back to BGR, and whatever `hands.process` returned
  """
  rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  # the frame is marked read-only only for the duration of the prediction
  rgb_frame.flags.writeable = False
  results = hands.process(rgb_frame)
  rgb_frame.flags.writeable = True
  bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
  return bgr_frame, results

def get_handedness(results: NamedTuple) -> str:
  """
  Reads the handedness label of the first hand in `results`

  @param `results` mediapipe's hand solution output (must contain at least one hand)
  @returns the `label` of the first classification of `results.multi_handedness[0]`
  """
  first_hand = MessageToDict(results.multi_handedness[0])
  first_classification = first_hand['classification'][0]
  return first_classification['label']

def is_right_hand(results: NamedTuple) -> bool:
  """
  Tells whether the first hand in `results` is labelled as a right hand

  @param `results` mediapipe's hand solution output
  @returns True when `get_handedness(results)` is 'Right'
  """
  handedness = get_handedness(results)
  return handedness == 'Right'

def process_image(img_path: str):
  """
  Loads an image from disk, mirrors it horizontally and runs mediapipe hand detection on it

  @param `img_path` path of the image file to process
  @returns tuple `(image, results)` as produced by `mediapipe_detection`
  @raises Exception when `img_path` does not exist or cannot be read as an image
  """
  if not exists(img_path): raise Exception('invalid img_path')

  raw_image = imread(img_path)
  # imread reports an unreadable file by returning None rather than raising
  if raw_image is None: raise Exception('invalid img_path')

  with hands.Hands(
    model_complexity=MP_MODEL_COMPLEXITY,
    min_detection_confidence=MP_DETECTION_CONFIDENCE,
    min_tracking_confidence=MP_TRACKING_CONFIDENCE,
    max_num_hands=MP_NUM_HANDS
  ) as mp_hands:
    image = flip(raw_image, 1)
    # the detection output used to be computed and then dropped; hand it back to the caller
    return mediapipe_detection(image, mp_hands)

def extract_absolute_keypoints(results: NamedTuple) -> ndarray:
  """
  Receives the raw mediapipe results tuple and produces an NDArray with the position of each landmark of the first detected hand, read from `multi_hand_world_landmarks`

  @param `results` mediapipe's hand solution output
  @returns ndarray with shape of (21, 3) = (amount of landmarks in a single hand, 3 dimensional position (x, y, z)); all zeros when no hand was detected
  """
  # also guard on the attribute that is indexed below, not only on `multi_hand_landmarks`
  if not results.multi_hand_landmarks or not results.multi_hand_world_landmarks:
    # return a copy: the caller may modify the result in place and must not corrupt the shared constant
    return EMPTY_KEYPOINTS.copy()

  landmarks = MessageToDict(results.multi_hand_world_landmarks[0])['landmark']
  return array([
    [landmark['x'], landmark['y'], landmark['z']]
    for landmark in landmarks
  ])

# TODO!
def normalize_landmarks(arr: ndarray):
  """
  Rescales each coordinate column (x, y, z) independently to the range [0, 1] (min-max normalization)

  The input array is left untouched and a new float array is returned.
  A column whose values are all equal is mapped to 0 instead of dividing by zero.

  @param `arr` an array of landmark positions of shape (21, 3)
  @returns transformed NDArray of floats with the same shape as `arr`
  """
  values = np.asarray(arr, dtype=float)

  # per-column minimum and value range
  mins = values.min(axis=0)
  spans = values.max(axis=0) - mins

  # a zero span means a constant column; dividing by 1 keeps it at 0 rather than producing NaN
  safe_spans = np.where(spans == 0, 1.0, spans)

  return (values - mins) / safe_spans

def print_landmark_values(data: ndarray):
  """
  Prints landmark rows as a table with `x`, `y` and `z` columns, floats shown with two decimals

  @param `data` rows of landmark values to print
  """
  table = tabulate(data, headers=["x", "y", "z"], floatfmt=".2f")
  print(table)

### **Live Image Capture w/ Mediapipe**

In [None]:
cap = cv2.VideoCapture(0)

def _count_samples(sign: str) -> int:
  """Number of entries stored for `sign`; 0 when its data folder has not been created yet"""
  sign_dir = join(DATA_DIR, sign)
  return len(listdir(sign_dir)) if isdir(sign_dir) else 0

# Currently selected sign and how many samples are already stored for it
selected_sign_index = 0
selected_sign = SIGNS[selected_sign_index]
selected_sign_amount = _count_samples(selected_sign)

def _select_sign(index: int):
  """Makes `SIGNS[index]` the selected sign; an index outside of `SIGNS` is ignored"""
  global selected_sign, selected_sign_index, selected_sign_amount
  if not 0 <= index < len(SIGNS): return
  selected_sign_index = index
  selected_sign = SIGNS[index]
  selected_sign_amount = _count_samples(selected_sign)

def select_next_sign():
  """Selects the sign after the current one; does nothing on the last sign"""
  _select_sign(selected_sign_index + 1)

def select_prev_sign():
  """Selects the sign before the current one; does nothing on the first sign"""
  _select_sign(selected_sign_index - 1)

def capture(results):
  """
  Saves the keypoints of the detected right hand as a new sample of the selected sign

  Prints a message and saves nothing when no hand is available or the hand is not a right hand.

  @param `results` mediapipe's hand solution output for the current frame
  """
  from os import makedirs  # local import: not part of the file-level imports

  global selected_sign, selected_sign_amount
  if not results.multi_hand_landmarks:
    print('no hand available')
    return
  if not is_right_hand(results):
    print('must use right hand')
    return

  keypoints = extract_absolute_keypoints(results)

  # the per-sign folder may not exist yet (it is generated by a later cell)
  sign_dir = join(DATA_DIR, selected_sign)
  makedirs(sign_dir, exist_ok=True)

  # spell out the `.npy` suffix that `save` would otherwise append, so the printed path is the real one
  data_path = join(sign_dir, f'{selected_sign}_{selected_sign_amount}.{uuid1()}.npy')
  save(data_path, keypoints)
  print(f'saved at {data_path}')
  selected_sign_amount = _count_samples(selected_sign)

# Key codes compared against the value returned by cv2.waitKeyEx.
# NOTE(review): the arrow-key codes look platform specific (they match what waitKeyEx reports on Windows) — confirm on other systems.
KEY_ESC = 27
KEY_SPACE = 32
KEY_LEFT = 2424832
KEY_RIGHT = 2555904

try:
  with hands.Hands(
    model_complexity=MP_MODEL_COMPLEXITY,
    min_detection_confidence=MP_DETECTION_CONFIDENCE,
    min_tracking_confidence=MP_TRACKING_CONFIDENCE,
    max_num_hands=MP_NUM_HANDS
  ) as mp_hands:
    while cap.isOpened():

      success, image = cap.read()

      # check the read before touching the frame: on failure there is no valid image to flip
      if not success:
        print("Ignoring empty camera frame.")
        continue

      # Flip the image horizontally for a selfie-view display.
      image = flip(image, 1)

      image, results = mediapipe_detection(image, mp_hands)
      draw_landmarks(image, results)

      # overlay: selected sign and amount of samples already stored for it
      image = cv2.putText(
        image, f'{selected_sign} | {selected_sign_amount}',
        (32, 32), cv2.FONT_HERSHEY_SIMPLEX, 1, (119, 252, 3), 2, cv2.LINE_AA
      )

      imshow('signs', image)

      key = cv2.waitKeyEx(1)

      if key == KEY_SPACE:    # save a sample of the selected sign
        capture(results)
      elif key == KEY_RIGHT:  # next sign
        select_next_sign()
      elif key == KEY_LEFT:   # previous sign
        select_prev_sign()
      elif key == KEY_ESC:    # quit
        break
finally:
  # always free the camera and close the preview window, even when the loop raised
  cap.release()
  cv2.destroyAllWindows()

*Generate the per-sign data folders (`data/<sign>`) that the capture and training cells read from.*

In [6]:
# The per-sign folders live inside DATA_DIR, so make sure DATA_DIR itself exists first
create_dir(DATA_DIR, False)

# One folder per sign to hold its captured samples
for s in SIGNS:
  create_dir(join(DATA_DIR, s), False)

### **Models**

In [None]:
def conv1():
  """
  Builds the `conv1` architecture: one 1D convolution over inputs of shape (21, 3), max pooling, flatten, a 128-unit dense layer and a softmax output with one unit per class

  @returns the assembled (not compiled) `Sequential` model
  """
  return Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(21, 3)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(CLASS_COUNT, activation='softmax'),
  ])

### **Image Processing**

### **Training**

##### *Loading & Partitioning data*

In [None]:
def get_collection_count(data_dir: str = None):
  """
  Finds the size of the smallest per-sign collection, i.e. how many samples every sign can contribute

  Only sub-directories are counted; loose files (such as `.gitkeep`) are skipped.

  @param `data_dir` directory holding one sub-directory per sign, defaults to `DATA_DIR`
  @returns entry count of the smallest sub-directory, or 0 when there are no sub-directories
  """
  if data_dir is None: data_dir = DATA_DIR

  data_amounts = [
    len(listdir(join(data_dir, entry)))
    for entry in listdir(data_dir)
    if isdir(join(data_dir, entry))  # calling listdir on a plain file would raise
  ]

  return min(data_amounts, default=0)

COLLECTION_COUNT = get_collection_count()
print(f'currently using {COLLECTION_COUNT} data points')

# Load Training Data
label_map = { label: num for num, label in enumerate(ALL_SIGNS) }

# The 'none' class has no recorded files: it is represented by COLLECTION_COUNT all-zero samples
sequences = [ zeros((21, 3)) for _ in range(COLLECTION_COUNT) ]
labels = [ label_map['none'] for _ in range(COLLECTION_COUNT) ]

for sign in SIGNS:
  sign_data_dir = join(DATA_DIR, sign)
  # listdir order is arbitrary; sorting makes every run pick the same COLLECTION_COUNT files
  for data_file_name in sorted(listdir(sign_data_dir))[:COLLECTION_COUNT]:
    data_path = join(sign_data_dir, data_file_name)
    res = load(data_path)
    # collect into the list that becomes `x`; the old `sequence = append(...)` discarded every sample
    sequences.append(res)
    labels.append(label_map[sign])

x = array(sequences)
# pin the number of classes so the one-hot width does not depend on which labels happen to be present
y = to_categorical(labels, num_classes=CLASS_COUNT).astype(int)

# Testing!
# Every class contributes COLLECTION_COUNT samples, so both arrays must have
# CLASS_COUNT * COLLECTION_COUNT rows.
s_expected, s_result = (CLASS_COUNT * COLLECTION_COUNT, HAND_LANDMARK_COUNT, HAND_POINT_COUNT), x.shape
l_expected, l_result = (CLASS_COUNT * COLLECTION_COUNT, CLASS_COUNT), y.shape

if not s_result == s_expected:
  raise Exception(f'WARNING: expected sequence shape `{s_expected}` != from gotten `{s_result}`')

if not l_result == l_expected:
  raise Exception(f'WARNING: expected labels shape `{l_expected}` != from gotten `{l_result}`')

# partitioning train, test, validation data (80% / 10% / 10% of all samples)
SPLIT_SEED = 42  # fixed seed so the partition is identical on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=SPLIT_SEED)                   # 10% of all data goes to test
x_train, x_val,  y_train, y_val  = train_test_split(x_train, y_train, test_size=0.1/0.9, random_state=SPLIT_SEED)  # 1/9 of the remaining 90% = 10% of all data goes to validation

### **Testing**

In [None]:
# Path to the image `1.png` inside TEST_IMAGES_DIR
test_image_path = join(TEST_IMAGES_DIR, '1.png')

