In [1]:
import os, time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.utils import to_categorical

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder, StandardScaler

from scipy.signal import resample

from recognizer import Recogniser, Point

In [2]:
SAMPLE_POINTS = 50
INPUT_PARAMETERS = 2

SCRIPT_DIR = os.path.abspath('') #os.path.dirname(__file__); workaround for jupyter notebook

TRAIN_PATH = os.path.join(SCRIPT_DIR, "dataset/train")
TEST_PATH = os.path.join(SCRIPT_DIR, "dataset/test")

In [3]:
train_data: list[tuple[str, list[tuple]]] = []
test_data: list[tuple[str, list[tuple]]] = []

In [4]:
def get_data(path: str, data_list: list) -> None:
  for root, _, files in os.walk(path):
    if len(files) == 0:
      continue

    for file_name in files:
      if not file_name.endswith(".csv"):
        continue

      class_name = os.path.basename(root)
      file_path = os.path.join(path, class_name, file_name)

      data = pd.read_csv(filepath_or_buffer=file_path, delimiter=",")

      points = data[["x", "y"]]
      points = np.array(points, dtype=float)

      scaler = StandardScaler()
      points = scaler.fit_transform(points)

      points_resampled = resample(points, SAMPLE_POINTS)

      data_list.append((class_name, points_resampled))

In [5]:
get_data(TRAIN_PATH, train_data)
get_data(TEST_PATH, test_data)

In [None]:
print(test_data[0])

In [7]:
def split_data(data: list[tuple[str, list[tuple]]]) -> tuple[np.ndarray, np.ndarray, list[str]]:
  labels = [sample[0] for sample in data]
  print(set(labels))

  encoder = LabelEncoder()
  labels_encoded = encoder.fit_transform(labels)

  print(set(labels_encoded))

  y = to_categorical(labels_encoded)

  print(len(y[0]))

  sequences = [sample[1] for sample in data]
  X = np.array(sequences)

  return X, y, labels

In [8]:
X_train, y_train, labels_train = split_data(train_data)
X_test, y_test, labels_test = split_data(test_data)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

{'question_mark', 'arrow', 'left_curly_brace', 'x', 'rectangle', 'caret', 'check', 'star', 'delete_mark', 'right_sq_bracket', 'triangle', 'left_sq_bracket', 'pigtail', 'right_curly_brace', 'circle', 'v'}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
16
{'question_mark', 'arrow', 'left_curly_brace', 'x', 'rectangle', 'caret', 'check', 'star', 'delete_mark', 'right_sq_bracket', 'triangle', 'left_sq_bracket', 'pigtail', 'right_curly_brace', 'circle', 'v'}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
16
(5120, 50, 2) (160, 50, 2) (5120, 16) (160, 16)


# 1) LSTM Approach

In [None]:
class NN:   

  def init(self, options: dict) -> None:
    self.model = Sequential()

    self.model.add(LSTM(options["lstm_neurons"], input_shape=(SAMPLE_POINTS, INPUT_PARAMETERS)))
    self.model.add(Dense(options["fully_connected_neurons"], activation=options["fully_connected_activation_function"]))
    if options["add_dropout"]:
      self.model.add(Dropout(options["dropout_rate"]))
    self.model.add(Dense(len(set(labels_train)), activation='softmax'))

    self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    self.reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=options["learn_rate"])
    self.stop_early = EarlyStopping(monitor='val_loss', patience=3)

  def train(self, X_train: np.ndarray, y_train: np.ndarray, X_test: np.ndarray, y_test: np.ndarray, options: dict) -> None:
    # Train the model
    self.history = self.model.fit(
      X_train,
      y_train,
      epochs=options["epochs"],
      batch_size=options["batch_size"],
      validation_data=(X_test, y_test),
      verbose=1,
      callbacks=[self.reduce_lr, self.stop_early]
    )

  def summarise(self) -> None:
    self.model.summary()

  def plot(self) -> None:
    loss = self.history.history['loss']
    val_loss = self.history.history['val_loss']
    accuracy = self.history.history['accuracy']
    val_accuracy = self.history.history['val_accuracy']

    fig = plt.figure(figsize=(15, 7))
    ax = plt.gca()

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy (Line), Loss (Dashes)')

    ax.axhline(1, color='gray')

    plt.plot(accuracy, color='blue')
    plt.plot(val_accuracy, color='orange')
    plt.plot(loss, '--', color='blue', alpha=0.5)
    plt.plot(val_loss, '--', color='orange', alpha=0.5)

  def predict(self, X_test: np.ndarray, y_test: np.ndarray, labels_train: list[str]) -> int:
    # let the model make predictions for our training data
    t1 = time.time()
    y_predictions = self.model.predict(X_test)
    t2 = time.time()
    

    # to build a confusion matrix, we have to convert it to classifications
    # this can be done by using the argmax() function to set the probability to 1 and the rest to 0
    y_predictions = np.argmax(y_predictions, axis=1)

    y_test_labels = np.argmax(y_test, axis=1)

    # create and plot confusion matrix
    conf_matrix = confusion_matrix(y_test_labels, y_predictions)

    fig = plt.figure(figsize=(10, 10))

    ConfusionMatrixDisplay(conf_matrix, display_labels=set(labels_train)).plot(ax=plt.gca())

    plt.xticks(rotation=90, ha='center')
    
    return t2-t1

which hyperparameters make sense to be reduced:
- number of lstm units (default: 64)
- dropout rate (default: not included) -> add to reduce learning rate
- fully connected neurons (default: 32)
- sequence length (default: 100%)
- batch size (default: 32)
- epochs (default: 10)

## iteration 1

In [None]:
options = {
  "lstm_neurons": 64,
  "fully_connected_neurons": 32,
  "fully_connected_activation_function": "relu",
  "epochs": 10,
  "batch_size": 32,
  "learn_rate": 0.0001,
  "add_dropout": False,
  "dropout_rate": 0.2
}

nn1 = NN()
nn1.init(options)
nn1.train(X_train, y_train, X_test, y_test, options)
nn1.summarise()
nn1.plot()
iteration_1_time = nn1.predict(X_test, y_test, labels_train)

In [None]:
nn1.model.save("trained_model")

In [None]:
with open("labels_model.txt", "w") as f:
  f.write(str(list(set(labels_train))))
  f.close()

## iteration 2

first, lets reduce the number of neurons from lstm and fully conncected. this should greatly reduce the number of parameters.

In [None]:
options = {
  "lstm_neurons": 32,
  "fully_connected_neurons": 16,
  "fully_connected_activation_function": "relu",
  "epochs": 10,
  "batch_size": 32,
  "learn_rate": 0.0001,
  "add_dropout": False,
  "dropout_rate": 0.2
}

nn2 = NN()
nn2.init(options)
nn2.train(X_train, y_train, X_test, y_test, options)
nn2.summarise()
nn2.plot()
iteration_2_time = nn2.predict(X_test, y_test, labels_train)

## iteration 3

trying to reduce the number of neurons again

In [None]:
options = {
  "lstm_neurons": 16,
  "fully_connected_neurons": 8,
  "fully_connected_activation_function": "relu",
  "epochs": 10,
  "batch_size": 32,
  "learn_rate": 0.0001,
  "add_dropout": False,
  "dropout_rate": 0.2
}

nn3 = NN()
nn3.init(options)
nn3.train(X_train, y_train, X_test, y_test, options)
nn3.summarise()
nn3.plot()
iteration_3_time = nn3.predict(X_test, y_test, labels_train)

## iteration 4

reducing the lstm units to 16 was a bit too much. the accary droped quite some bit.
we again pick the number of neurons from iteration 3 and now change the epoch size/batch size.

In [None]:
options = {
  "lstm_neurons": 32,
  "fully_connected_neurons": 16,
  "fully_connected_activation_function": "relu",
  "epochs": 5,
  "batch_size": 16,
  "learn_rate": 0.0001,
  "add_dropout": False,
  "dropout_rate": 0.2
}

nn4 = NN()
nn4.init(options)
nn4.train(X_train, y_train, X_test, y_test, options)
nn4.summarise()
nn4.plot()
iteration_4_time = nn4.predict(X_test, y_test, labels_train)

## iteration 5

picking the parameters from iteration 4 and reducing learn rate along with adding a dropout layer.

In [None]:
options = {
  "lstm_neurons": 32,
  "fully_connected_neurons": 16,
  "fully_connected_activation_function": "relu",
  "epochs": 5,
  "batch_size": 16,
  "learn_rate": 0.0025,
  "add_dropout": True,
  "dropout_rate": 0.5
}

nn5 = NN()
nn5.init(options)
nn5.train(X_train, y_train, X_test, y_test, options)
nn5.summarise()
nn5.plot()
iteration_5_time = nn5.predict(X_test, y_test, labels_train)

In [None]:
print(iteration_1_time, iteration_2_time, iteration_3_time, iteration_4_time, iteration_5_time)

# 2) 1$ Recogniser Approach

we add the first of each class from the train set and add it as a template to the 1$ recogniser.

In [11]:
rec1 = Recogniser(use_predefined_templates=False)
templates_added = []

for data_point in train_data:
  if not data_point[0] in templates_added:
    points = []
    for p in data_point[1]:
      points.append(Point(p[0], p[1]))

    if not rec1.add_template(data_point[0], points):
      continue
    
    templates_added.append(data_point[0])

In [12]:
print(len(rec1.templates))

32


In [13]:
def prepare_points(data):
  data_points = []
  
  for p in data:
    class_name = p[0]
    class_points = []

    for i in p[1]:
      class_points.append(Point(i[0], i[1]))

    data_points.append((class_name, class_points))
  
  return data_points


In [14]:
points = prepare_points(test_data)
results = []

for p in points:
  t1 = time.time()
  res = rec1.recognise(p[1])
  t2 = time.time()

  results.append({
    "predicted_class": res[0].name,
    "actual_class": p[0],
    "inference_time": t2-t1,
    "accuracy": res[1],
  })

In [15]:
dollar_accuracy = 0
dollar_inference_time = 0

for r in results:
  if r["predicted_class"] == r["actual_class"]:
    dollar_accuracy += 1
  
  dollar_inference_time += r["inference_time"]

dollar_accuracy = dollar_accuracy / len(results)
dollar_inference_time = dollar_inference_time

In [16]:
print(dollar_accuracy, dollar_inference_time)

0.5875 6.5093584060668945


# Report

| Approach | Inference Time (s) | Accuracy (%) | LSTM Neurons | FC Neurons | Epochs | Batch Size | Learn Rate | Dropout |
|--------------|-----------|------------|--------------|-----------|------------|--------------|-----------|------------|
| Iteration 1 (LSTM-NN) | 0.53 | 0.989 | 64 | 32 | 10 | 32 | 0.0001 | False |
| Iteration 2 (LSTM-NN) | 0.43 | 0.994 | 32 | 16 | 10 | 32 | 0.0001 | False |
| Iteration 3 (LSTM-NN) | 0.44 | 0.953 | 16 | 8 | 10 | 32 | 0.0001 | False |
| Iteration 4 (LSTM-NN) | 0.48 | 0.981 | 32 | 16 | 5 | 16 | 0.0001 | False |
| Iteration 5 (LSTM-NN) | 0.58 | 0.730 | 32 | 16 | 5 | 16 | 0.0025 | 0.5 |
| $1 Recogniser  (1 template of each class) | 4.17 | 0.775 | -- | -- | -- | -- | -- | -- |
| $1 Recogniser  (1 template of each class[mirrored]) | 6.51 | 0.588 | -- | -- | -- | -- | -- | -- |


### findings:

- adding more templates to 1$ recogniser might increase accuracy although it would also further increase inference time which is already pretty high.
- 1$ templates have a big limitation e.g. recognition of a circle works good if it was drawn clockwise, but not at all if it was drawn counter-clockwise. this indicates that at least 2 templates for each class are required!
- inference time for lstm-nn is roughly the same for all iterations and we could greatly reduce the number of parameters from 19,760 to 5,280.
- adding a dropout layer and changing min learn rate let accuracy drop to a unusable level (0.73) (requires more iterations and fine-tuning).
- neural network requires more computing power (ram, gpu/cpu) for training and keeping the model in memory whereas $1 recogniser almost requires no memory.
- adding mirrored templates to 1$ recogniser lets the accuracy drop frin 77% to 58%. this could be due to too many templates that need to be checked.


### conclusion:
it shows that one template per class for the 1$ recogniser is insufficient and results in a bad accuracy. adding mirrored templates (so 32 templates alltogether) results in bad accuracy. it also requires a longer time to predict a gesture compared with multiple iterations of lstm-nn. considering that todays computers have dedicated hardware for neural networks (smartphones and macs have dedicated cpus), i see no point why anyone would not choose a lstm-nn - they predict faster, can be optimised reducing hardware requirements, models can be saved and loaded, and accuracy is almost 100\%. the only advantage of 1$ recogniser is that it is light because it only requires the math package from the standard library whereas lstm-nn requires a few dozen packages.
Alltogether, I would choose Iteration 2 because it has almost 100% accuracy and requires the least amount of inference time. For smaller applications where accuracy is not critical, $1 recogniser is good enough; if you train with vertically mirrored templates and few templates. It also removes a lot of startup time, because there is only math package required.