In [1]:
import os
import mediapipe as mp
import numpy as np
import tensorflow as tf
from mediapipe.framework.formats import landmark_pb2
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

#### Setting up for Mediapipe

In [2]:
drawer = mp.solutions.drawing_utils
VisionRunningMode = mp.tasks.vision.RunningMode

In [3]:
# base options for hand and pose detection models
hand_base_options = python.BaseOptions(model_asset_path="../tasks/hand_landmarker.task")
pose_base_options = python.BaseOptions(model_asset_path="../tasks/pose_landmarker.task")

In [4]:
# options for hand detection
hand_options = vision.HandLandmarkerOptions(
    base_options=hand_base_options,
    num_hands=2,
    min_hand_detection_confidence=0.6,
    min_hand_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# options for pose detection
pose_options = vision.PoseLandmarkerOptions(
    base_options=pose_base_options,
    output_segmentation_masks=True,
    min_pose_detection_confidence=0.6,
    min_pose_presence_confidence=0.6,
    min_tracking_confidence=0.1,
    running_mode=VisionRunningMode.IMAGE,
)

# create detectors
hand_detector = vision.HandLandmarker.create_from_options(hand_options)
pose_detector = vision.PoseLandmarker.create_from_options(pose_options)

#### Extracting Mediapipe Landmark

In [5]:
empty_hand_landmark = np.zeros((2, 21, 3))  # right hand and left hand
empty_pose_landmark = np.zeros(33 * 3)

def to_landmark_data(
    hand_results: vision.HandLandmarkerResult, pose_results: vision.PoseLandmarkerResult
):
    """
    Extract keypoints from pose and hand results for dataset creation.
    """
    pose_landmark = empty_pose_landmark
    hand_landmark = empty_hand_landmark

    if pose_results.pose_world_landmarks:
        pose_landmark = np.array(
            [[lm.x, lm.y, lm.z] for lm in pose_results.pose_world_landmarks[0]]
        ).flatten()

    # if no hand results are available, return the empty hand keypoints
    # and concatenate it with face and pose keypoints
    if not hand_results:
        return np.concatenate([pose_landmark, hand_landmark.flatten()])

    # iterate over the detected hand landmarks
    for index, hlm in enumerate(hand_results.hand_world_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmark[handedness] = np.array([[lm.x, lm.y, lm.z] for lm in hlm])

    return np.concatenate([pose_landmark, hand_landmark.flatten()])

LandmarkList = landmark_pb2.NormalizedLandmarkList  # aliases for landmark types
NormalizedLandmark = landmark_pb2.NormalizedLandmark  # aliases for landmark types


def to_landmark_list(landmarks):
    """
    Create a LandmarkList from a list of landmarks or fill with empty values if no landmarks are provided.
    """
    return LandmarkList(
        landmark=([NormalizedLandmark(x=lm.x, y=lm.y, z=lm.z) for lm in landmarks])
    )


empty_pose_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(33 * 4)]
)

empty_hand_landmarks = to_landmark_list(
    [NormalizedLandmark(x=0.0, y=0.0, z=0.0) for _ in range(21 * 3)]
)


def to_drawing_landmark(hand_results, pose_results):
    """
    Convert pose and hand landmarks to LandmarkList for drawing.
    """
    pose_landmarks = (
        to_landmark_list(pose_results.pose_landmarks[0])
        if pose_results.pose_landmarks
        else empty_pose_landmarks
    )

    hand_landmarks = [empty_hand_landmarks, empty_hand_landmarks]

    if not hand_results:
        return pose_landmarks, None

    # iterate over the detected hand landmarks
    for index, hand_landmark in enumerate(hand_results.hand_landmarks):
        # determine the hand index (0 for right hand, 1 for left hand) using handedness information
        handedness = hand_results.handedness[index][0].index

        # extract the keypoints for the current hand and assign them to the appropriate index
        hand_landmarks[handedness] = to_landmark_list(hand_landmark)

    return pose_landmarks, hand_landmarks


def draw_landmark(image, hand_landmarks, pose_landmarks):
    """
    Draw detected landmarks on the image.
    """
    drawer.draw_landmarks(
        image,
        pose_landmarks,
        mp.solutions.pose.POSE_CONNECTIONS,
        drawer.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=3),
        drawer.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2),
    )

    if not hand_landmarks:
        return

    for hand_landmarks in hand_landmarks:
        drawer.draw_landmarks(
            image,
            hand_landmarks,
            mp.solutions.hands.HAND_CONNECTIONS,
            drawer.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=2),
            drawer.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2),
        )

#### Load Model

In [25]:
# action lables
ACTIONS = np.array([
    "apa", "aku", "kamu"
])

val_path = "../../storage/datasets/raw/"

In [26]:
def load_model(model_version=None):
    model_dir = "../../storage/models/keras"
    prefix = "signdeafspeech_sds_v_"

    if model_version:
        version = f"{prefix}{model_version}.keras"
        ks_file = os.path.join(model_dir, version)

        model = tf.keras.models.load_model(ks_file)

        return version, model

    model_files = os.listdir(model_dir)

    # filter model files by filename prefix
    versions = [file for file in model_files if file.startswith(prefix)]

    # extract version numbers from filenames
    versions = [file.split("_")[-1] for file in versions]

    # convert version numbers to tuples of integers for comparison
    versions_int = [tuple(map(int, v.split(".")[0])) for v in versions]

    # find the index of the latest version
    latest_index = versions_int.index(max(versions_int))

    # load the latest model
    latest_model_path = model_files[latest_index]

    model = tf.keras.models.load_model(os.path.join(model_dir, latest_model_path))

    return latest_model_path, model


v, model = load_model()

In [27]:
f"using model {v}"

'using model signdeafspeech_sds_v_006.keras'

In [28]:
model.summary()

#### Model Prediction

In [29]:
import numpy as np
import time
import concurrent.futures
from moviepy.editor import VideoFileClip

In [30]:
def process_frame(frame, image):
    start_time = time.time()

    # convert into mediapipe numpy type support uint8, uint16, or float32
    image = np.fliplr(image)
    image = image.astype(np.uint8)

    # convert cv image to mediapipe image format before being passed to detectors
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=image)

    hand_results = hand_detector.detect(image=mp_image)
    pose_results = pose_detector.detect(image=mp_image)

    landmarks = to_landmark_data(hand_results, pose_results)

    return frame, landmarks, time.time() - start_time

In [31]:
def predict_from_video(vid):
    clip = VideoFileClip(vid)

    avg_exec_time = []

    predictions = []
    sequences = []

    sentence = []
    threshold = 0.99
    skip_word = "_"

    results = []
    batch_size = 60

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_frame = {
            executor.submit(
                process_frame,
                frame,
                image,
            ): frame
            for frame, image in enumerate(clip.iter_frames(fps=clip.fps))
        }

        for future in concurrent.futures.as_completed(future_to_frame):
            frame, landmarks, exec_time = future.result()
            avg_exec_time.append(exec_time)

            if landmarks is not None:
                results.append((frame, landmarks))

    clip.close()

    start_time = time.time()

    # sort the results by frame number to ensure the order is correct
    results.sort(key=lambda x: x[0])

    for _, landmarks in results:
        sequences.append(landmarks)

        if len(sequences) < batch_size:
            continue

        # ensure correct input shape by adding an extra dimension for batch size
        batch_motion = np.expand_dims(np.stack(sequences[-batch_size:]), axis=0)

        # predict the motion
        result = model.predict(batch_motion, verbose=0)[0]

        # get the predicted class and its confidence
        predicted = np.argmax(result)
        confidence = result[predicted]

        # append to the predictions and accuracies list
        predictions.append(predicted)

        # only keep the last 20 predictions and their accuracies
        # predictions = predictions[-20:]

        predicted_sentence = ACTIONS[predicted]

        # determine most frequent prediction
        most_frequent_prediction = np.bincount(predictions[-10:]).argmax()
        print(most_frequent_prediction, "\t", result[predicted], "\t", predicted_sentence)

        if most_frequent_prediction != predicted:
            continue

        elif confidence < threshold:
            continue

        elif predicted_sentence == skip_word:
            continue

        elif not sentence or predicted_sentence != sentence[-1]:
            # print(confidence, "\t", predicted_sentence, "\t\t", result)
            sentence.append(predicted_sentence)

    end_time = time.time() - start_time

    return (
        sentence,
        len(results),
        {
            "pred_exec_time": end_time,
            "avg_exec_time": avg_exec_time,
            "total_exec_time": sum(avg_exec_time),
        },
    )

#### Video 1

In [44]:
sentence, frame, exec_time = predict_from_video("./demo/kamu.mp4")
true_sentence_1 = ["aku", "apa", "kamu", "bagaimana"]

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

2 	 0.9986619 	 kamu
2 	 0.99899167 	 kamu
2 	 0.99892116 	 kamu
2 	 0.9987853 	 kamu
2 	 0.9985593 	 kamu
2 	 0.9986652 	 kamu
2 	 0.99866736 	 kamu
2 	 0.9985654 	 kamu
2 	 0.998635 	 kamu
2 	 0.99901485 	 kamu
2 	 0.9991365 	 kamu
2 	 0.9992329 	 kamu
2 	 0.9993679 	 kamu
2 	 0.99951303 	 kamu
2 	 0.99961346 	 kamu
2 	 0.99965787 	 kamu
2 	 0.9997441 	 kamu
2 	 0.9998048 	 kamu
2 	 0.9998565 	 kamu
2 	 0.9998597 	 kamu
2 	 0.9998977 	 kamu
2 	 0.99990714 	 kamu
2 	 0.99993265 	 kamu
2 	 0.9999304 	 kamu
2 	 0.9999503 	 kamu
2 	 0.99995184 	 kamu
2 	 0.99996483 	 kamu
2 	 0.9999578 	 kamu
2 	 0.99996936 	 kamu
2 	 0.99996555 	 kamu
2 	 0.9999747 	 kamu
2 	 0.9999691 	 kamu
2 	 0.9999777 	 kamu
2 	 0.9999746 	 kamu
2 	 0.9999826 	 kamu
2 	 0.99997675 	 kamu
2 	 0.999984 	 kamu
2 	 0.9999809 	 kamu
2 	 0.9999875 	 kamu
2 	 0.9999845 	 kamu
2 	 0.9999895 	 kamu
2 	 0.9999882 	 kamu
2 	 0.999992 	 kamu
2 	 0.9999901 	 kamu
2 	 0.99999297 	 kamu
2 	 0.9999924 	 kamu
2 	 0.9999944 	 kamu
2

In [45]:
true_sentence_1 == sentence

False

Video 2

In [40]:
sentence, frame, exec_time = predict_from_video("./demo/aku.mp4")
true_sentence_2 = ["halo", "selamat pagi", "kamu", "bagaimana"]

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

2 	 0.99545836 	 kamu
2 	 0.98623276 	 kamu
2 	 0.9855251 	 kamu
2 	 0.9300357 	 kamu
2 	 0.88162637 	 kamu
2 	 0.6730666 	 kamu
2 	 0.6350211 	 kamu
2 	 0.67226475 	 aku
2 	 0.78413755 	 aku
2 	 0.9173877 	 aku
2 	 0.921875 	 aku
1 	 0.97224975 	 aku
1 	 0.9818364 	 aku
1 	 0.99299 	 aku
1 	 0.993859 	 aku
1 	 0.99736327 	 aku
1 	 0.9981616 	 aku
1 	 0.99913245 	 aku
1 	 0.9991786 	 aku
1 	 0.9996525 	 aku
1 	 0.9997453 	 aku
1 	 0.99986887 	 aku
1 	 0.999884 	 aku
1 	 0.99994016 	 aku
1 	 0.9999553 	 aku
1 	 0.9999757 	 aku
1 	 0.99997723 	 aku
1 	 0.9999883 	 aku
1 	 0.99999106 	 aku
1 	 0.9999945 	 aku
1 	 0.9999951 	 aku
1 	 0.99999726 	 aku
1 	 0.999998 	 aku
1 	 0.9999987 	 aku
1 	 0.9999988 	 aku
1 	 0.99999917 	 aku
1 	 0.9999994 	 aku
1 	 0.9999995 	 aku
1 	 0.9999995 	 aku
1 	 0.99999964 	 aku
1 	 0.99999964 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.99999976 	 aku
1 	 0.9999999 	 

In [41]:
true_sentence_2 == sentence

False

Video 3

In [38]:
sentence, frame, exec_time = predict_from_video("./demo/kamu.mp4")
true_sentence_3 = ["apa", "aku", "kamu"]

print("=" * 50)
print("Total frame calculated:", frame)
print("Total prediction execution:", np.mean(exec_time["pred_exec_time"]))
print("Average execution time per frame:", np.mean(exec_time["avg_exec_time"]))
print("Predicted sentence:", sentence)

2 	 0.9986619 	 kamu
2 	 0.99899167 	 kamu
2 	 0.99892116 	 kamu
2 	 0.9987853 	 kamu
2 	 0.9985593 	 kamu
2 	 0.9986652 	 kamu
2 	 0.99866736 	 kamu
2 	 0.9985654 	 kamu
2 	 0.998635 	 kamu
2 	 0.99901485 	 kamu
2 	 0.9991365 	 kamu
2 	 0.9992329 	 kamu
2 	 0.9993679 	 kamu
2 	 0.99951303 	 kamu
2 	 0.99961346 	 kamu
2 	 0.99965787 	 kamu
2 	 0.9997441 	 kamu
2 	 0.9998048 	 kamu
2 	 0.9998565 	 kamu
2 	 0.9998597 	 kamu
2 	 0.9998977 	 kamu
2 	 0.99990714 	 kamu
2 	 0.99993265 	 kamu
2 	 0.9999304 	 kamu
2 	 0.9999503 	 kamu
2 	 0.99995184 	 kamu
2 	 0.99996483 	 kamu
2 	 0.9999578 	 kamu
2 	 0.99996936 	 kamu
2 	 0.99996555 	 kamu
2 	 0.9999747 	 kamu
2 	 0.9999691 	 kamu
2 	 0.9999777 	 kamu
2 	 0.9999746 	 kamu
2 	 0.9999826 	 kamu
2 	 0.99997675 	 kamu
2 	 0.999984 	 kamu
2 	 0.9999809 	 kamu
2 	 0.9999875 	 kamu
2 	 0.9999845 	 kamu
2 	 0.9999895 	 kamu
2 	 0.9999882 	 kamu
2 	 0.999992 	 kamu
2 	 0.9999901 	 kamu
2 	 0.99999297 	 kamu
2 	 0.9999924 	 kamu
2 	 0.9999944 	 kamu
2

In [39]:
true_sentence_3 == sentence

False