In [67]:
import pynput
import numpy as np
import json
import os

import cv2

from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, AveragePooling2D, RandomFlip, RandomContrast, RandomRotation, RandomBrightness
from keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.metrics import categorical_crossentropy

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split


In [80]:
# Gesture classes the classifier should tell apart.
GESTURES = [
    'rock',
    'ok',
    'like',
    'dislike',
    'peace',
]

# Additional hand poses that are loaded as well; their labels are collapsed
# into a single "no_gesture" class when the crops are collected.
OUTLIERS = [
    'two_up',
    'fist',
    'stop',
    'one',
    'three',
]

# Target size handed to cv2.resize for every hand crop.
SIZE = [64, 64]

# Channels per image: 1 converts crops to grayscale, anything else keeps them as loaded.
CHANNELS = 3

# Dataset root: one image folder per condition plus an "_annotations" folder.
PATH = "gesture_dataset_sample"


In [69]:
# COPIED FROM EXERCISE
# Load the annotation JSON of every condition (gestures and outliers) into a
# dict keyed by the condition name.
annotations = {}

for condition in GESTURES + OUTLIERS:
    annotation_file = f'{PATH}/_annotations/{condition}.json'
    with open(annotation_file) as handle:
        annotations[condition] = json.load(handle)

In [70]:
# COPIED FROM EXERCISE
def preprocess_image(img):
    """Bring one image crop into the format the network is trained on.

    When CHANNELS is 1 the crop is converted from BGR to grayscale first;
    in every case it is then resized to SIZE with cv2.resize.

    Args:
        img: image array as produced by cv2.imread (or a crop of it).

    Returns:
        The resized (and possibly grayscale) image.
    """
    source = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if CHANNELS == 1 else img
    return cv2.resize(source, SIZE)

In [71]:
# COPIED FROM EXERCISE (hardened against unreadable files and degenerate boxes)

images = []       # preprocessed hand crops
labels = []       # integer class index of each crop (what the network needs)
label_names = []  # index -> label name, in order of first appearance

# For every condition directory:
#   - look up the annotation of each file (files without one are skipped)
#   - read the image (unreadable files are skipped)
#   - crop every annotated hand, preprocess it and store crop + label index
for condition in GESTURES + OUTLIERS:
    for filename in tqdm(os.listdir(f'{PATH}/{condition}')):
        # extract unique ID from file name
        UID = filename.split('.')[0]

        # get annotation from the dict we loaded earlier; only a missing key
        # is an expected condition here, anything else should surface
        try:
            annotation = annotations[condition][UID]
        except KeyError as e:
            print(f'no annotation for {condition}/{filename}: {e}')
            continue

        # cv2.imread returns None instead of raising when a file cannot be decoded
        img = cv2.imread(f'{PATH}/{condition}/{filename}')
        if img is None:
            print(f'could not read image {condition}/{filename}')
            continue
        img_height, img_width = img.shape[:2]

        # iterate over all hands annotated in the image
        for i, bbox in enumerate(annotation['bboxes']):
            # annotated bounding boxes are in the range from 0 to 1
            # therefore we have to scale them to the image size
            x1 = int(bbox[0] * img_width)
            y1 = int(bbox[1] * img_height)
            x2 = x1 + int(bbox[2] * img_width)
            y2 = y1 + int(bbox[3] * img_height)

            # clamp to the image: a negative start index would otherwise wrap
            # around in the slice and produce a wrong or empty crop
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, img_width), min(y2, img_height)

            # crop image to the bounding box; cv2.resize fails on empty input
            crop = img[y1:y2, x1:x2]
            if crop.size == 0:
                print(f'empty bounding box {i} in {condition}/{filename}')
                continue
            preprocessed = preprocess_image(crop)

            # get the annotated hand's label; every outlier pose is merged
            # into the single "no_gesture" class
            label = annotation['labels'][i]
            if label in OUTLIERS:
                label = "no_gesture"

            # if we have not seen this label yet, add it to the list of labels
            if label not in label_names:
                label_names.append(label)

            label_index = label_names.index(label)

            images.append(preprocessed)
            labels.append(label_index)

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

In [81]:
# Optional sanity checks on the class balance (uncomment to inspect):
#labels.count(label_names.index("no_gesture"))
#len(labels)
# Display the label order; each name's position is the class index stored in `labels`.
label_names


['rock', 'no_gesture', 'ok', 'like', 'dislike', 'peace']

In [82]:
# COPIED FROM EXERCISE
# Hold out 20% of the crops. The split is stratified so that every class keeps
# the same share in both parts; the merged "no_gesture" class is much larger
# than the individual gesture classes.
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=6,
    stratify=labels,
)

In [83]:
# Check which class indices are present in the held-out split.
np.unique(y_test)

array([0, 1, 2, 3, 4, 5])

In [84]:
# COPIED FROM EXERCISE
# Convert the image lists to float32 arrays scaled to [0, 1].
# NOTE: this cell reassigns X_train / X_test; re-running it without re-running
# the split cell first would divide by 255 a second time.
X_train = np.array(X_train).astype('float32')
X_train = X_train / 255.

X_test = np.array(X_test).astype('float32')
X_test = X_test / 255.

# One-hot encode the integer labels; the class count follows the labels that
# were actually collected instead of a hard-coded number.
y_train_one_hot = to_categorical(y_train, num_classes=len(label_names))
y_test_one_hot = to_categorical(y_test, num_classes=len(label_names))

train_label = y_train_one_hot
test_label = y_test_one_hot

# Make the channel axis explicit (needed for grayscale input).
# cv2.resize(img, SIZE) takes (width, height), so the arrays are
# (samples, SIZE[1], SIZE[0], CHANNELS).
X_train = X_train.reshape(-1, SIZE[1], SIZE[0], CHANNELS)
X_test = X_test.reshape(-1, SIZE[1], SIZE[0], CHANNELS)

print(X_train.shape, X_test.shape, train_label.shape, test_label.shape)

(2586, 64, 64, 3) (647, 64, 64, 3) (2586, 6) (647, 6)


In [85]:
# Number of training samples.
len(X_train)

2586

## Hyperparameters

In [86]:
# --- training loop ---
epochs = 50       # upper bound on the number of epochs passed to model.fit
batch_size = 12   # samples per gradient step

# --- network ---
num_classes = len(label_names)   # one output unit per collected label
activation_conv = "leaky_relu"   # activation of the convolution layers
activation = "relu"              # activation of the hidden dense layers

num_neurons = 576   # units in each hidden dense layer
layer_count = 3     # number of hidden dense layers

In [87]:


# First experiment: one convolution block followed by a small dense head.
# NOTE: the model cells further down assign `model` again, so on a top-to-bottom
# run this network is replaced before training starts.
model = Sequential()

# Declare the input explicitly. An `input_shape` argument is only honoured on
# the first layer of a Sequential model, and the first layer here is an
# augmentation layer. cv2.resize(img, SIZE) yields (SIZE[1], SIZE[0]) arrays.
model.add(Input(shape=(SIZE[1], SIZE[0], CHANNELS)))

# data augmentation
model.add(RandomFlip('horizontal'))
# The images are scaled to [0, 1]; RandomBrightness assumes [0, 255] unless
# told otherwise, which would make the brightness shift swamp the image.
model.add(RandomBrightness(0.1, value_range=(0.0, 1.0)))
model.add(RandomContrast(0.1))

model.add(Conv2D(64, kernel_size=(9,9), activation=activation_conv, padding="same"))
model.add(MaxPooling2D(pool_size=(3,3), padding="same"))

#model.add(Conv2D(32, kernel_size=(5,5), activation=activation_conv, padding="same"))
#model.add(AveragePooling2D(pool_size=(3,3), padding="same"))

#model.add(Conv2D(32, kernel_size=(4,4), activation=activation_conv, padding="same"))
#model.add(AveragePooling2D(pool_size=(2,2), padding="same"))

#model.add(Conv2D(16, kernel_size=(4,4), activation=activation_conv, padding="same"))
#model.add(MaxPooling2D(pool_size=(2,2), padding="same"))

# randomly zero part of the activations during training to reduce overfitting
model.add(Dropout(0.23))

# flatten the feature maps so they can be fed into fully connected layers
model.add(Flatten())

model.add(Dense(64, activation=activation))
#model.add(Dense(32, activation=activation))
#model.add(Dense(16, activation=activation))

#model.add(Dropout(0.2))

# softmax output: one probability per class
model.add(Dense(num_classes, activation="softmax"))
model.compile(loss=categorical_crossentropy, optimizer="adam", metrics=["accuracy"])#, "precision", "recall"])

In [88]:

# Second experiment: three convolution/pooling stages and `layer_count` dense layers.
# NOTE: the following model cell assigns `model` again, so on a top-to-bottom run
# this network is replaced before training starts.
# with keras, we can use a model's add() function to add layers to the network one by one
model = Sequential()

# Declare the input explicitly. An `input_shape` argument is only honoured on
# the first layer of a Sequential model, and the first layer here is an
# augmentation layer. cv2.resize(img, SIZE) yields (SIZE[1], SIZE[0]) arrays.
model.add(Input(shape=(SIZE[1], SIZE[0], CHANNELS)))

# data augmentation (this can also be done beforehand - but don't augment the test dataset!)
model.add(RandomFlip('horizontal'))
model.add(RandomContrast(0.1))
#model.add(RandomBrightness(0.1))
#model.add(RandomRotation(0.2))

# first, we add some convolution layers followed by max pooling
model.add(Conv2D(64, kernel_size=(9, 9), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(4, 4), padding='same'))

model.add(Conv2D(64, (5, 5), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(3, 3), padding='same'))

model.add(Conv2D(32, (3, 3), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))

# after the convolution layers, we have to flatten the data so it can be fed into fully connected layers
model.add(Flatten())

# dropout randomly zeroes part of the activations during training - this reduces overfitting
model.add(Dropout(0.35))

# add `layer_count` fully connected layers ("Dense")
for _layer in range(layer_count):
    model.add(Dense(num_neurons, activation=activation))

# for classification, the last layer has to use the softmax activation function, which gives us probabilities for each category
# (no input_shape here: the layer infers its input size from the previous layer)
model.add(Dense(num_classes, activation='softmax'))

# specify loss function, optimizer and evaluation metrics
# for classification, categorical crossentropy is used as a loss function
# use the adam optimizer unless you have a good reason not to
model.compile(loss=categorical_crossentropy, optimizer="adam", metrics=['accuracy'])


In [89]:

# Final model: three convolution/pooling stages and `layer_count` dense layers.
# with keras, we can use a model's add() function to add layers to the network one by one
model = Sequential()

# Declare the input explicitly. An `input_shape` argument is only honoured on
# the first layer of a Sequential model, and the first layer here is an
# augmentation layer. cv2.resize(img, SIZE) yields (SIZE[1], SIZE[0]) arrays.
model.add(Input(shape=(SIZE[1], SIZE[0], CHANNELS)))

# data augmentation (this can also be done beforehand - but don't augment the test dataset!)
model.add(RandomFlip('horizontal'))
model.add(RandomContrast(0.1))
#model.add(RandomBrightness(0.1))
#model.add(RandomRotation(0.2))

# first, we add some convolution layers followed by max pooling
model.add(Conv2D(64, kernel_size=(9, 9), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(4, 4), padding='same'))

model.add(Conv2D(32, (5, 5), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(3, 3), padding='same'))

model.add(Conv2D(32, (3, 3), activation=activation_conv, padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2), padding='same'))

# dropout randomly zeroes part of the activations during training - this reduces overfitting
model.add(Dropout(0.2))

# after the convolution layers, we have to flatten the data so it can be fed into fully connected layers
model.add(Flatten())

# add `layer_count` fully connected layers ("Dense")
for _layer in range(layer_count):
    model.add(Dense(num_neurons, activation=activation))

# for classification, the last layer has to use the softmax activation function, which gives us probabilities for each category
model.add(Dense(num_classes, activation='softmax'))

# specify loss function, optimizer and evaluation metrics
# for classification, categorical crossentropy is used as a loss function
# use the adam optimizer unless you have a good reason not to
model.compile(loss=categorical_crossentropy, optimizer="adam", metrics=['accuracy'])

In [90]:
# Cut the learning rate to 20% after 5 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# Early stopping must be more patient than the learning-rate schedule: with the
# same patience both callbacks fire on the same epoch and the reduced learning
# rate never gets used. restore_best_weights rolls the model back to the epoch
# with the lowest val_loss, so the model saved afterwards is the best one seen.
stop_early = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)

# NOTE(review): the held-out test split doubles as validation data here, so it
# also steers early stopping and the learning-rate schedule. The reported
# val_accuracy is therefore not an unbiased test score - consider a separate
# validation split.
history = model.fit(
    X_train,
    train_label,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, test_label),
    callbacks=[reduce_lr, stop_early]
)

Epoch 1/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.6115 - loss: 1.4584 - val_accuracy: 0.6352 - val_loss: 1.2423 - learning_rate: 0.0010
Epoch 2/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.6062 - loss: 1.3136 - val_accuracy: 0.6352 - val_loss: 1.2209 - learning_rate: 0.0010
Epoch 3/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.6223 - loss: 1.2450 - val_accuracy: 0.6352 - val_loss: 1.2519 - learning_rate: 0.0010
Epoch 4/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.6175 - loss: 1.2044 - val_accuracy: 0.6878 - val_loss: 0.9440 - learning_rate: 0.0010
Epoch 5/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - accuracy: 0.6597 - loss: 0.9708 - val_accuracy: 0.6924 - val_loss: 0.8641 - learning_rate: 0.0010
Epoch 6/50
[1m216/216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [91]:
# Print the layer-by-layer overview of the current `model`.
model.summary()

In [92]:
# Write the current `model` to a .keras file.
# NOTE(review): the label order in `label_names` is not written alongside the
# model - presumably whoever loads this file needs the same index order; confirm.
model.save('gesture_recognition_for_media_control_1.keras')