In [1]:
from scipy import stats
from matplotlib import pyplot as plt 
import numpy as np
import cv2
import mediapipe as mp
import tensorflow as tf
import torch
import torch.nn as nn
import os

In [None]:
%%script false
# Check if webcam functionality is working
import cv2

cam = cv2.VideoCapture(1)

while True:
    check, frame = cam.read()

    cv2.imshow('video', frame)

    if cv2.waitKey(1) == ord('q'):
        break

cam.release()
cv2.destroyAllWindows()

In [2]:
# Check if GPU is available
import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))
import torch; print(torch.cuda.is_available())

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
True


##### Mediapipie Initialization

In [5]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

def mediapipe_detection(image,model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Color conversion from BGR to RGB
    image.flags.writeable = False                   # Image is no longer writeable
    results = model.process(image)                  # Make prediction
    image.flags.writeable = True                    # Image is no longer writeable
    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR)   # Color conversion RGB to BGR
    return image, results

def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)  # Draw pose connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)    # Draw right connections

def draw_styled_landmarks(image,results):
    # Draw pose connection
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(0,0,255), thickness=5,circle_radius=5),
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=5,circle_radius=5)
                              )
    # Draw left hand connection
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(255, 255, 0), thickness=5,circle_radius=5),
                              mp_drawing.DrawingSpec(color=(255, 255, 0), thickness=5,circle_radius=5)
                              )
    # Draw right hand connection
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(255, 255, 0), thickness=5,circle_radius=5),
                              mp_drawing.DrawingSpec(color=(255, 255, 0), thickness=5,circle_radius=5)
                              )
    
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose,lh,rh])

In [6]:
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
        
    return output_frame

##### Model Initialization

In [8]:
video_directory = 'TRAIN_5'

sum = 0

gesture_folder = np.array(os.listdir(video_directory))
for gestures in gesture_folder:
    gesture = []

    for fname in os.listdir(os.path.join(video_directory, gestures)):
        path = os.path.join(video_directory, gestures, fname)
        if os.path.isdir(path):
            gesture.append(fname)

    sum += len(gesture) 
    # print(gestures, end =" : ")        
    # print(len(gesture))

print("Total gestures: ", len(gesture_folder), "; Total videos: ", sum)

Total gestures:  107 ; Total videos:  7098


In [9]:
gestures = np.array(gesture_folder)
gestures

array(['abang', 'ada', 'adik_lelaki', 'adik_perempuan', 'air', 'ambil',
       'anak', 'apa', 'arah', 'awak', 'ayah', 'baca', 'bagaimana', 'baik',
       'baik2', 'bas', 'bawa', 'belajar', 'beli', 'beli2', 'berapa',
       'berjalan', 'berlari', 'bila', 'bola', 'boleh', 'bomba', 'buang',
       'buat', 'cuaca', 'curi', 'dapat', 'dari', 'datuk', 'duit', 'esok',
       'gambar', 'hari', 'hilang', 'hospital', 'hujan', 'ibu', 'jahat',
       'jalan', 'jam', 'jangan', 'jumpa', 'kacau', 'kafeteria', 'kakak',
       'kedai', 'keluarga', 'kereta', 'kereta_api', 'khabar_baik',
       'lelaki', 'lupa', 'main', 'makan', 'mana', 'marah', 'marah2',
       'mari', 'mari2', 'masa', 'masalah', 'menyakitkan', 'minum',
       'mohon', 'mohon2', 'nama', 'nasi', 'nasi_lemak', 'nenek', 'panas',
       'panas2', 'pandai', 'pandai2', 'payung', 'pen', 'pensil',
       'perempuan', 'pergi', 'pergi2', 'perlahan', 'perlahan2', 'polis',
       'pukul', 'ribut', 'sampai', 'saudara', 'saya', 'sejuk', 'sekolah',
   

In [10]:
label_map = {label: num for num, label in enumerate(gesture_folder)}
len(label_map)

107

In [14]:
# %%script false
# Define your custom LSTM model
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(CustomLSTM, self).__init__()
        self.lstm1 = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.lstm3 = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, 32)
        self.output_layer = nn.Linear(32, num_classes)
        
    def forward(self, x):
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x, _ = self.lstm3(x)
        x = torch.relu(self.fc1(x[:, -1, :]))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        x = self.output_layer(x)
        return x
    
# Instantiate the model
input_size = 258
hidden_size = 64
num_classes = len(label_map)
model = CustomLSTM(input_size, hidden_size, num_classes)

# Load the saved model state dictionary
model_filename = 'models/lstm_model_train_5_0.88.pth'
loaded_model_state_dict = torch.load(model_filename)

# Load the state dictionary into the model
model.load_state_dict(loaded_model_state_dict)
model.eval()  # Set the model to evaluation mode

  loaded_model_state_dict = torch.load(model_filename)


CustomLSTM(
  (lstm1): LSTM(258, 64, batch_first=True)
  (lstm2): LSTM(64, 64, batch_first=True)
  (lstm3): LSTM(64, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=32, bias=True)
  (fc5): Linear(in_features=32, out_features=32, bias=True)
  (output_layer): Linear(in_features=32, out_features=107, bias=True)
)

In [11]:
%%script false
# load transformer model
# Define Transformer model for classification
class CustomTransformer(nn.Module):
    def __init__(self, input_size, num_classes, d_model=64, nhead=8, num_encoder_layers=3, dim_feedforward=128, dropout=0.1):
        super(CustomTransformer, self).__init__()
        self.input_projection = nn.Linear(input_size, d_model)  # Project input to model dimension
        self.positional_encoding = nn.Parameter(torch.zeros(1, 5000, d_model))  # Positional Encoding
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,  # Ensure batch is first dim
            ),
            num_layers=num_encoder_layers,
        )
        self.fc = nn.Linear(d_model, num_classes)  # Final classification layer

    def forward(self, x):
        # Project input to d_model dimension
        x = self.input_projection(x)
        
        # Add positional encoding
        seq_len = x.size(1)
        x = x + self.positional_encoding[:, :seq_len, :]
        
        # Pass through Transformer Encoder
        x = self.encoder(x)
        
        # Take the last token's representation for classification
        x = x[:, -1, :]
        x = self.fc(x)
        return x
    
# Model, loss, and optimizer
input_size = 258
num_classes = len(label_map)
model = CustomTransformer(input_size=input_size, num_classes=num_classes)

# Load the saved model state dictionary
model_filename = 'models/transformer_model_train_2_0.96.pth'
loaded_model_state_dict = torch.load(model_filename)

# Load the state dictionary into the model
model.load_state_dict(loaded_model_state_dict)
model.eval()  # Set the model to evaluation mode

Couldn't find program: 'false'


In [23]:
%%script false
# 1. New detection variables
sequence = []
sentence = []
predictions = []
threshold = 0.3
frame_count = 0

cap = cv2.VideoCapture(0)
# Set mediapipe model 
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        frame_count += 1

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        
        # Draw landmarks
        draw_styled_landmarks(image, results)
        
        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model(torch.tensor(np.expand_dims(sequence, axis=0),dtype=torch.float32))
            print(gestures[res.argmax(dim=1)])
            predictions.append(res.argmax(dim=1))

            # prediction logic
            if np.unique(predictions[-10:])[0]==res.argmax(dim=1): 
                if res.argmax(dim=1) > threshold: 
                    
                    if len(sentence) > 0: 
                        if gestures[res.argmax(dim=1)] != sentence[-1]:
                            sentence.append(gestures[res.argmax(dim=1)])
                    else:
                        sentence.append(gestures[res.argmax(dim=1)])

            if len(sentence) > 5: 
                sentence = sentence[-5:]

        if frame_count >= 30:
            frame_count = 0

        # Display frame count
        cv2.putText(image, f'Frame: {frame_count}', (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA)
            
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (1,30), 
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()

Couldn't find program: 'false'


In [25]:
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

# Visualization function
def plot_sequence(sequence):
    seq_array = np.array(sequence)
    fig, ax = plt.subplots(figsize=(4, 2))
    ax.imshow(seq_array.T, aspect='auto', interpolation='nearest', cmap='viridis')
    ax.set_title("Sequence Heatmap")
    ax.set_xlabel("Frame")
    ax.set_ylabel("Keypoints")
    plt.tight_layout()
    
    canvas = FigureCanvas(fig)
    canvas.draw()
    img = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8)
    img = img.reshape(canvas.get_width_height()[::-1] + (3,))
    plt.close(fig)
    return img

In [26]:
# Initialize variables
sequence = []
sentence = []
predictions = []
threshold = 0.3
frame_count = 0

mp_holistic = mp.solutions.holistic
cap = cv2.VideoCapture(0)

# Mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()

        # Make detections
        image, results = mediapipe_detection(frame, holistic)

        # Check if hand landmarks are visible
        if results.left_hand_landmarks or results.right_hand_landmarks:
            frame_count += 1
            
            # Draw landmarks
            draw_styled_landmarks(image, results)
            
            # Prediction logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-30:]
            
            if len(sequence) == 30:
                res = model(torch.tensor(np.expand_dims(sequence, axis=0), dtype=torch.float32))
                print(gestures[res.argmax(dim=1)])
                predictions.append(res.argmax(dim=1))

                # Prediction stability logic
                if np.unique(predictions[-10:])[0] == res.argmax(dim=1): 
                    if res.max() > threshold: 
                        
                        if len(sentence) > 0: 
                            if gestures[res.argmax(dim=1)] != sentence[-1]:
                                sentence.append(gestures[res.argmax(dim=1)])
                        else:
                            sentence.append(gestures[res.argmax(dim=1)])

                if len(sentence) > 5: 
                    sentence = sentence[-5:]

            if frame_count >= 30:
                frame_count = 0

            # Display frame count
            cv2.putText(image, f'Frame: {frame_count}', (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA)
            
            # Heatmap visualization
            if len(sequence) > 0:
                heatmap_image = plot_sequence(sequence[-30:])  # Get the last 30 frames
                heatmap_image = cv2.cvtColor(heatmap_image, cv2.COLOR_RGB2BGR)  # Convert to BGR
                h, w, _ = heatmap_image.shape
                # Overlay the heatmap in the top-left corner
                image[0:h, 0:w, :] = heatmap_image
                
            # Display predictions and sequence
            cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
            cv2.putText(image, ' '.join(sentence), (10, 30), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
        
        else:
            # Display message if no hands are detected
            cv2.putText(image, "No hand detected", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

        # Show to screen
        cv2.imshow('OpenCV Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()


  img = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8)


nasi_lemak
nasi_lemak


  ar = np.asanyarray(ar)
  ar = np.asanyarray(ar)


nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
anak
anak
anak
anak
anak
anak
anak
anak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
nasi_lemak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
anak
curi
anak
anak
curi
anak
curi
curi
curi
curi
curi
anak
nasi_lemak
nasi_lema