In [None]:
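# Minimum viable model so far.
# TODO: add ReLU activations to it.
# TODO: add a softmax activation to the final layer (for inference only:
#       nn.CrossEntropyLoss expects raw logits and applies log-softmax itself).
# Lesson learned: the earlier mistake was not converting the inputs to float32.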
import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    """Three chained single-layer LSTMs followed by a three-layer linear head.

    Args:
        input_dim: Feature size (last dimension) of the input to the first LSTM.
        hidden_dim: Hidden size shared by all three LSTMs.
        output_dim: Size of the last dimension produced by the final linear layer.

    No non-linearity is applied anywhere: ``identity`` is a pass-through
    placeholder between the recurrent layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Attribute names and creation order are kept stable: they determine the
        # state_dict keys and the order in which random initial weights are drawn.
        self.lstm1 = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.lstm3 = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.identity = nn.Identity()
        self.fc1 = nn.Linear(in_features=hidden_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=output_dim)

    def forward(self, x):
        """Run ``x`` through the LSTM stack and then the linear head.

        The full LSTM output sequence is passed on (no time step is selected),
        so the result keeps every leading dimension of ``x`` and only the last
        dimension becomes ``output_dim``. The final hidden/cell states of each
        LSTM are discarded.
        """
        for recurrent in (self.lstm1, self.lstm2, self.lstm3):
            x, _state = recurrent(x)
            x = self.identity(x)

        for dense in (self.fc1, self.fc2, self.fc3):
            x = dense(x)

        return x

# Seed the RNG so weight initialisation -- and therefore the whole run -- is repeatable.
SEED = 42
torch.manual_seed(SEED)

NUM_EPOCHS = 2000  # number of full-batch optimisation steps
LOG_EVERY = 100    # print metrics every LOG_EVERY epochs instead of on every epoch

# One output per entry in `actions`; input_dim=258 is the feature size fed to the first LSTM.
model = LSTMModel(input_dim=258, hidden_dim=64, output_dim=actions.shape[0])
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# X is built from the keypoints.
# y is built from the labels (perform one-hot encoding).
# Split the dataset with scikit-learn before this cell (categorical split).
# The model needs float32 inputs; the targets are cast to integer (long).
X_train = X_train.to(torch.float32)
y_train = y_train.to(torch.long)

# NOTE(review): the model returns one output per time step and
# nn.CrossEntropyLoss / .max(dim=1) both treat dim 1 as the class dimension.
# If X_train is (batch, seq, 258), dim 1 of y_pred is the sequence axis, not
# the class axis -- TODO confirm this is intended.

model.train()
for epoch in range(NUM_EPOCHS):
    # Forward pass
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report periodically (and always on the last epoch). The metrics describe
    # the forward pass above, i.e. the weights before this epoch's update.
    if epoch % LOG_EVERY == 0 or epoch == NUM_EPOCHS - 1:
        with torch.no_grad():
            _, prediction = y_pred.max(dim=1)
            accuracy = (prediction == y_train).float().mean()
        print('Epoch: ', epoch, 'Loss: ', loss.item(), 'Accuracy: ', accuracy.item()*100, '%')

In [None]:
# Persist the complete model object (the pickled module, not just its state_dict).
torch.save(obj=model, f='model.pth')

In [None]:
# Restore the complete model object written by torch.save(model, 'model.pth').
# PyTorch >= 2.6 defaults to weights_only=True, which refuses to unpickle a
# full nn.Module, so the flag is passed explicitly.
# SECURITY: this unpickles arbitrary Python objects -- only load checkpoint
# files you created yourself.
# The LSTMModel class must already be defined in the kernel for this to work.
try:
    model = torch.load('model.pth', weights_only=False)
except TypeError:
    # Older PyTorch releases do not accept the weights_only argument.
    model = torch.load('model.pth')

In [None]:
# Evaluate the model on the held-out test split.

# Same dtype conversion as for the training data: float32 inputs, long targets.
X_test = X_test.to(dtype=torch.float32)
y_test = y_test.to(dtype=torch.long)

# Gradients are not needed for evaluation.
with torch.no_grad():
    y_pred = model(X_test)
    loss = loss_fn(y_pred, y_test)
    # Index of the maximum along dim 1, compared element-wise with the targets.
    prediction = y_pred.max(dim=1).indices
    accuracy = prediction.eq(y_test).float().mean()
    print('Test loss: ', loss.item(), 'Test accuracy: ', accuracy.item()*100, '%')

# NOTE
# Evaluating right after training gives decent results, but evaluating a model
# that was loaded from disk gives low accuracy.
# TODO: find the cause, or fall back to retraining every session (not optimal).
# Possible cause: saving/loading the whole model object instead of its state dict.
# NOTE(review): also worth checking that the train/test split and the label
# order are identical between the training session and the loading session
# -- TODO confirm.

In [None]:
# Confusion matrix (primitive)

from sklearn.metrics import confusion_matrix

# Index of the maximum along dim 1 of the raw model output.
# NOTE(review): the argmax(axis=1) further down needs this to be 2-D, which
# implies y_pred is 3-D (presumably batch x seq x classes) -- TODO confirm.
_, predictions_np = y_pred.max(dim=1)

# Convert the predictions to a NumPy array (moved to CPU first so this also
# works if the tensors live on a GPU).
predictions_np = predictions_np.cpu().numpy()

# Take the labels from the same y_test tensor the predictions were scored
# against, instead of relying on a y_test_np array left over from another cell.
y_test_np = y_test.cpu().numpy()

# Convert the test set labels to integer class indices.
# Assumes y_test is one-hot encoded with shape (samples, classes) -- TODO confirm.
y_test_int = y_test_np.argmax(axis=1)

# Reduce the predictions to one integer class index per sample.
predictions_int = predictions_np.argmax(axis=1)

# Pin the label set so the matrix is always classes x classes and row/column i
# means class i, even when a class is missing from this test split.
num_classes = y_test_np.shape[1]
cm = confusion_matrix(y_test_int, predictions_int, labels=list(range(num_classes)))

print(cm)

In [None]:
# Once confusion matrix is finalized, create heatmap with seaborn

import seaborn as sns

# fmt='d' prints the integer counts as-is; the default annotation format
# ('.2g') switches to scientific notation for counts of 100 or more.
ax = sns.heatmap(cm, annot=True, fmt='d')
# scikit-learn's confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix');

In [None]:
# Testing the model in real time with OpenCV and Mediapipe
# Doesn't work yet 

import cv2
import numpy as np
import torch
import mediapipe.python.solutions.holistic as mp_holistic


# Map the indices of the predictions to the corresponding sign names.
# `signs` is the array/list of classes (words).
sign_names = signs.tolist()


# Initialize the camera capture
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and the window are released even if
# preprocessing or inference raises inside the loop.
try:
    while True:
        # Capture frame from the camera
        ret, frame = cap.read()

        # cap.read() reports ret=False (and no usable frame) when nothing could
        # be grabbed, e.g. the camera is missing or the stream ended.
        if not ret:
            break

        # Preprocess the frame to extract the features
        features = preprocess_frame(frame)

        # Convert the features to a tensor and pass them through the model.
        # Inference only, so autograd tracking is switched off.
        features = torch.tensor(features, dtype=torch.float32)
        with torch.no_grad():
            prediction = model(features)

        # Get the index of the most likely action.
        # NOTE(review): .item() only works if `index` holds exactly one element,
        # so `features` must have the shape the model was trained on and the
        # decoding must match the evaluation cells -- TODO confirm; this is a
        # likely reason the cell "doesn't work yet".
        _, index = prediction.max(dim=1)
        index = index.item()

        # Map the index to the corresponding action name
        action_name = sign_names[index]

        # Display the action name on the frame
        cv2.putText(frame, action_name, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Show the frame
        cv2.imshow('Frame', frame)

        # Break out of the loop if the user presses 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the window
    cap.release()
    cv2.destroyAllWindows()