# README

Minimal notebook for running the demo locally. It automatically downloads the pretrained SVM and neural network (NN) from Google Drive, so you do not need to train them yourself.

Note: net_9d_scaling seems to be more resistant to hands that are far from the camera.

Note: the demo currently runs the neural network. To switch it to the SVM, change the lines marked `TODO` in `get_label`.

Note: Only run this notebook on your local computer; the demo opens your webcam and an OpenCV display window.

In our paper, this notebook is referred to as the game-playing program.

# Imports

In [None]:
%pip install gdown -q
# %pip install datasets
%pip install mediapipe -q

In [None]:
# from datasets import load_dataset
#mediapipe dependencies
import mediapipe as mp

#general dependencies
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import cv2
from PIL import Image
import shutil
import os
import pickle

# Utility Functions

In [None]:
# Translate a numeric class label into its gesture name
def decode_label(label):
    """Return the gesture name for a numeric class label.

    1 -> "rock", 0 -> "paper", 2 -> "scissors", 3 -> "none".
    Any other label yields None.
    """
    # Compared with == in a fixed order, so numpy integer labels work as well
    # as plain Python ints.
    known_labels = ((1, "rock"), (0, "paper"), (2, "scissors"), (3, "none"))
    for code, name in known_labels:
        if label == code:
            return name
    return None

# grey scale images (channel 1)
# def grey_scale():
#   transform = transforms.Compose([
#       transforms.Resize(256),
#       transforms.Grayscale(num_output_channels=1),
#   ])
#   return transform

# Resize an image to 300x300 and make sure it is in RGB mode
def rgb_image_transform(image): # input type PIL.PngImagePlugin.PngImageFile
  """Return a 300x300 copy of `image`, converted to RGB when it is not already.

  The input must provide `resize`, `mode` and `convert` (as PIL images do).
  """
  scaled = image.resize((300, 300))
  # Only convert when needed; an image already in RGB is returned as resized.
  return scaled if scaled.mode == 'RGB' else scaled.convert('RGB')

# Convert an image (or any array-like) into a numpy array
def image_numpy(image):
  """Return `np.array(image)`, i.e. a new numpy array built from `image`."""
  return np.array(image)

#Utility from Henry
from scipy.spatial.distance import euclidean
import numpy as np
import re
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def parse_landmarks(landmarks_str):
    """Extract (x, y, z) float triples from the text form of a landmark list.

    Every `landmark { x: ... y: ... z: ... }` group found in `landmarks_str`
    becomes one tuple of three floats, in order of appearance. An empty list
    is returned when nothing matches.
    """
    pattern = r'landmark \{\s*x: ([e\d.-]+)\s*y: ([e\d.-]+)\s*z: ([e\d.-]+)\s*\}'
    parsed = []
    for match in re.finditer(pattern, landmarks_str):
        # groups() holds the x, y and z captures as strings
        parsed.append(tuple(float(value) for value in match.groups()))
    return parsed

def calculate_distances(landmarks):
    """Return the Euclidean distances from landmark 0 to landmarks 4, 8, 12 and 16.

    `landmarks` is a sequence of coordinate tuples; landmark 0 is treated as
    the wrist. The indices are presumably MediaPipe fingertip ids (verify
    against the landmark model); index 20 is not included here.
    """
    origin = landmarks[0]
    result = []
    for tip_index in (4, 8, 12, 16):
        result.append(euclidean(origin, landmarks[tip_index]))
    return result

def calculate_9d(landmarks):
    """Return nine Euclidean distances measured from landmark 0 (the wrist).

    The distances go, in this order, to landmarks 4, 8, 12, 16, 20, 5, 9, 13
    and 17 of `landmarks`, a sequence of coordinate tuples.
    """
    origin = landmarks[0]
    result = []
    for target_index in (4, 8, 12, 16, 20, 5, 9, 13, 17):
        result.append(euclidean(origin, landmarks[target_index]))
    return result

# Start Live Demo using a joint-landmark model (SVM or NN) as the classifier (only in Jupyter notebook)

### load the models

In [None]:
# Short aliases for the two MediaPipe solution modules used by the demo.
mp_drawing = mp.solutions.drawing_utils  # used to draw landmarks (draw_landmarks, DrawingSpec)
mp_hands = mp.solutions.hands  # used to get landmarks from a photo (Hands, HAND_CONNECTIONS, HandLandmark)

In [None]:
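# Download the pretrained SVM pickle (originally svm_9d.pkl). NOTE: the next cell opens "svm.pkl" - confirm the downloaded filename matches.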
# !gdown --fuzzy https://drive.google.com/file/d/1-8hm-quWlYfQeXkMeR_lhkMxqV-d-Nkq/view?usp=drive_link
!gdown 1AD0EYalbRzWK86uJkHv1QQCuCHHWxuu7

In [None]:
from sklearn.svm import SVC
# Load the pretrained SVM from "svm.pkl" in the current working directory.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load this file if it came from a source you trust.
with open("svm.pkl", 'rb') as f:
    svm_model = pickle.load(f)

Note: the following cells define the neural network (NN) and load its pretrained weights.

In [None]:
import torch
import torch.nn as nn

class Net(nn.Module):
    """Small fully connected classifier: input -> hidden -> hidden -> class scores.

    Args:
        input_features: number of values in each input vector.
        hidden_layers: width of each of the two hidden layers.
        num_classes: number of output scores. Defaults to 3, the value that
            was previously hard-coded, so existing checkpoints still load.
    """

    def __init__(self, input_features=5, hidden_layers=64, num_classes=3):
        super().__init__()
        self.fc1 = nn.Linear(input_features, hidden_layers)  # Input layer
        # The dropout layers are created but deliberately not applied in
        # forward(); they hold no parameters, so state_dict keys are unaffected.
        self.drop1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(hidden_layers, hidden_layers)  # Hidden layer
        self.drop2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(hidden_layers, num_classes)  # Output layer

    def forward(self, x):
        """Return the raw (un-normalised) class scores for `x`."""
        x = torch.relu(self.fc1(x))  # ReLU after the input layer
        x = torch.relu(self.fc2(x))  # ReLU after the hidden layer
        # No activation on the output layer: callers take the argmax of the scores.
        return self.fc3(x)

In [None]:
!gdown 16IJoblVT0gGrbZLU6stAyIvhG5EDR9Zm

In [None]:
# Build the 9-feature network and load its pretrained weights from
# "net_9d_scaling.pt" (expected in the current working directory).
net = Net(input_features=9, hidden_layers=64)
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine.
state_dict = torch.load("net_9d_scaling.pt", map_location="cpu")
# The network is only used for inference in this notebook.
net.eval()
net.load_state_dict(state_dict)

### start live feed

In [None]:
import cv2
import numpy as np

In [None]:
# run this to use nn

def predict_model_nn(net, input):
    """Classify one feature vector with the neural network.

    Args:
        net: callable model that maps a float tensor to class scores.
        input: sequence of numeric features for a single hand.

    Returns:
        The gesture name produced by decode_label for the highest-scoring class.
    """
    features = torch.tensor(input).float()
    # Inference only: skip autograd bookkeeping, which would otherwise be
    # rebuilt for every video frame.
    with torch.no_grad():
        scores = net(features)
    return decode_label(np.argmax(scores.detach().numpy()))


In [None]:
# run this to use svm

def predict_model(model, input):
    """Classify one feature vector with a model exposing `predict`.

    `input` is wrapped into a one-row array, passed to `model.predict`, and
    the first prediction is translated to a gesture name by decode_label.
    """
    single_row_batch = np.array([input])
    predictions = model.predict(single_row_batch)
    return decode_label(predictions[0])

In [None]:


def get_label(index, hand, results, frame_size=(640, 480)):
    """Build the on-screen label and gesture prediction for one detected hand.

    Args:
        index: position of `hand` within results.multi_hand_landmarks.
        hand: the landmark list of that hand.
        results: the object returned by Hands.process for the current frame.
        frame_size: (width, height) in pixels used to scale the normalised
            wrist position; defaults to the previously hard-coded 640x480.

    Returns:
        (text, coords, gesture, sum_distance), where text is the handedness
        label with its rounded score, coords is the integer pixel position of
        the wrist, gesture is the predicted gesture name, and sum_distance is
        the sum of the distance features. Returns None when no handedness
        entry exists for `index`.
    """
    # Per the MediaPipe Hands docs, multi_handedness is ordered like
    # multi_hand_landmarks, so it must be indexed by position. The previous
    # code compared the handedness class id (Left/Right) with the position,
    # which dropped or mislabelled hands whenever the two did not coincide.
    handedness = results.multi_handedness
    if not handedness or index >= len(handedness):
        return None

    classification = handedness[index].classification[0]
    text = '{} {}'.format(classification.label, round(classification.score, 2))

    landmarks = parse_landmarks(str(hand))

    # TODO CHANGE THIS to switch between the 4-distance and 9-distance features
    # distance = calculate_distances(landmarks)
    distance = calculate_9d(landmarks)
    sum_distance = np.sum(distance) if isinstance(distance, list) else 0

    # TODO CHANGE THIS to use svm
    # gesture = predict_model(svm_model, distance)
    gesture = predict_model_nn(net, distance)

    # Wrist position is normalised; scale it to pixel coordinates.
    wrist = hand.landmark[mp_hands.HandLandmark.WRIST]
    coords = tuple(np.multiply(np.array((wrist.x, wrist.y)), frame_size).astype(int))

    return text, coords, gesture, sum_distance

In [None]:
# Live rock-paper-scissors demo: read webcam frames, detect hands with
# MediaPipe, classify each hand's gesture and show the winner. Press 'q' in
# the video window to stop.

# Gesture -> the gesture it defeats. The names must match the strings
# returned by decode_label (note "scissors", not "scissor").
_BEATS = {"rock": "scissors", "scissors": "paper", "paper": "rock"}


def _decide_winner(first_hand, second_hand):
    """Return the text to show for a round, or '' unless both hands show a playable gesture."""
    if first_hand not in _BEATS or second_hand not in _BEATS:
        return ''
    if first_hand == second_hand:
        return "It's a tie!"
    return first_hand if _BEATS[first_hand] == second_hand else second_hand


cap = cv2.VideoCapture(0)

try:
    if not cap.isOpened():
        # Report and skip the loop instead of calling exit(), which in a
        # notebook affects the whole kernel rather than just this cell.
        print("Error: Unable to open webcam")
    else:
        with mp_hands.Hands(min_detection_confidence=0.8, min_tracking_confidence=0.5) as hands:
            while cap.isOpened():
                ret, frame = cap.read()  # ret: success flag, frame: the image from the webcam
                if not ret:
                    # A failed read yields no frame; stop rather than crash in cvtColor.
                    print("Error: Unable to read frame from webcam")
                    break

                # BGR to RGB, then mirror horizontally
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image = cv2.flip(image, 1)

                # Mark the frame read-only while MediaPipe processes it
                image.flags.writeable = False
                results = hands.process(image)
                image.flags.writeable = True

                # Back to BGR for OpenCV drawing and display
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

                if results.multi_hand_landmarks:
                    first_hand = ''
                    second_hand = ''
                    for num, hand in enumerate(results.multi_hand_landmarks):
                        # Choose a different color for each hand
                        if num == 0:
                            color = (255, 0, 0)
                        elif num == 1:
                            color = (0, 0, 255)
                        else:
                            color = (0, 255, 0)
                        mp_drawing.draw_landmarks(image, hand, mp_hands.HAND_CONNECTIONS,
                                               mp_drawing.DrawingSpec(color=color, thickness=2, circle_radius=4),
                                               mp_drawing.DrawingSpec(color=color, thickness=2, circle_radius=2))

                        # Call get_label once per hand; each call runs the classifier.
                        label_info = get_label(num, hand, results)
                        if label_info:
                            text, coord, gesture = label_info[:3]
                            text_with_type = f'{text} - {gesture}'
                            cv2.putText(image, text_with_type, coord, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
                            if num == 0:  # first hand
                                first_hand = gesture
                            else:
                                second_hand = gesture

                    # Decide the round once, after every hand has been classified.
                    winner = _decide_winner(first_hand, second_hand)
                    if winner:
                        image_height, image_width = image.shape[:2]
                        cv2.putText(image, winner, (int(image_width / 2) - 20, image_height - 100), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)

                cv2.imshow('Joint detection', image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
finally:
    # Always free the camera and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()




In [None]:
# Run this cell if any windows linger
# Releases the webcam capture and closes every OpenCV window; requires `cap`
# to have been created by the live-feed cell.
cap.release()
cv2.destroyAllWindows()