In [6]:
import cv2
import mediapipe as mp
import numpy as np
import os
import time
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.callbacks import EarlyStopping


In [230]:
landmark_data =np.load("landmark_data.npy")
labels = np.load("labels.npy")

In [235]:
landmark_data = landmark_data / np.max(landmark_data)

# Encode labels as integers and convert to categorical
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
labels_categorical = to_categorical(labels_encoded)

In [270]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(landmark_data, labels_categorical, test_size=0.2, random_state=42)

In [5]:
# Build 1D CNN model
model = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(2),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-10-30 00:11:14.241416: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

es = EarlyStopping(patience = 5, restore_best_weights=True)

# Train the model
model.fit(X_train[..., np.newaxis], y_train, epochs=40, batch_size=32, validation_data=(X_test[..., np.newaxis], y_test), callbacks=es)
#model.save("asl_sign_language_model.h5")
save_model(model, 'asl_sign_language_model_tf_2.18.keras')

Epoch 1/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 8ms/step - accuracy: 0.4350 - loss: 1.8901 - val_accuracy: 0.9456 - val_loss: 0.2025
Epoch 2/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.8631 - loss: 0.4336 - val_accuracy: 0.9695 - val_loss: 0.1150
Epoch 3/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.9054 - loss: 0.3007 - val_accuracy: 0.9799 - val_loss: 0.0882
Epoch 4/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.9192 - loss: 0.2589 - val_accuracy: 0.9799 - val_loss: 0.0778
Epoch 5/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 10ms/step - accuracy: 0.9306 - loss: 0.2184 - val_accuracy: 0.9841 - val_loss: 0.0745
Epoch 6/40
[1m1633/1633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 9ms/step - accuracy: 0.9380 - loss: 0.1964 - val_accuracy: 0.9855 - val_loss: 0.0633
Epoch 

In [257]:
# Load the trained model

model = tf.keras.models.load_model("asl_sign_language_model_tf_2.18.keras")

In [169]:
def predict_image(directory):
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1)
    mp_drawing = mp.solutions.drawing_utils

    img = cv2.imread(directory)
    img_rbg =  cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    result = hands.process(img_rbg)

    sequence = []
    sequence_length = 1

    if result.multi_hand_landmarks:
        landmarks = []
        for lm in result.multi_hand_landmarks[0].landmark:
            landmarks.extend([lm.x, lm.y, lm.z])

        # Draw hand landmarks on the frame
        mp_drawing.draw_landmarks(
            img,
            result.multi_hand_landmarks[0],
            mp_hands.HAND_CONNECTIONS
        )

        # Append new frame landmarks to sequence
        sequence.append(landmarks)
        if len(sequence) > sequence_length:
            sequence.pop(0)

        if len(sequence) == sequence_length:
            sequence_input = np.array(sequence).flatten()[np.newaxis, ..., np.newaxis]
            prediction = model.predict(sequence_input)
            predicted_label_index = np.argmax(prediction)
            predicted_label = label_encoder.inverse_transform([predicted_label_index])
            confidence = prediction[0][predicted_label_index]
    return predicted_label


In [170]:
predict_image("/home/diana/code/Koriza274/sign_language_interpreter/basic_models/w.jpg")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


W0000 00:00:1730243970.293026  126915 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1730243970.320027  126915 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


array(['W'], dtype='<U5')

In [266]:
predict_image("/home/diana/code/Koriza274/sign_language_interpreter/raw_data/Test/I/3003.jpg")

W0000 00:00:1730247738.581106  153295 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


W0000 00:00:1730247738.792855  153300 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


array(['I'], dtype='<U5')

In [None]:
def adjust_brightness_contrast(image, brightness=40, contrast=1.0):
    # Convert to float to prevent clipping
    img = image.astype(np.float32)
    # Adjust brightness and contrast
    img = img * contrast + brightness
    # Clip to keep pixel values between 0 and 255 and convert back to uint8
    img = np.clip(img, 0, 255).astype(np.uint8)
    return img

In [267]:
def evaluate_model(test_data_dir):
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.7)


        labels1 = []
        landmark_data1 = []

        for letter in os.listdir(test_data_dir):

            letter_dir = os.path.join(test_data_dir, letter)
            for i, img_path in enumerate(os.listdir(letter_dir)):

                img = cv2.imread(os.path.join(letter_dir, img_path))
                img = adjust_brightness_contrast(img, 40, 1)

                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                results = hands.process(img_rgb)


                if results.multi_hand_landmarks:
                    landmarks = []
                    for lm in results.multi_hand_landmarks[0].landmark:
                        landmarks.extend([lm.x, lm.y, lm.z])
                    landmark_data1.append(landmarks)
                    labels1.append(letter)
        landmark_data1 = np.array(landmark_data1)
        labels1 = np.array(labels1)


        # Normalize landmarks between 0 and 1
        landmark_data1 = landmark_data1 / np.max(landmark_data)

        # Encode labels as integers and convert to categorical

        labels_encoded1 = label_encoder.transform(labels1)
        labels_categorical1 = to_categorical(labels_encoded1)
        landmark_data1 = np.reshape(landmark_data1,(-1,63,1))

        return model.evaluate(landmark_data1,labels_categorical1)


In [268]:
evaluate_model("/home/diana/code/Koriza274/sign_language_interpreter/raw_data/Test")

W0000 00:00:1730247772.632845  153542 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1730247772.820606  153542 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.2687 - loss: 15.7795 


[16.914533615112305, 0.30188679695129395]

In [271]:
model.evaluate(X_test,y_test)

[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9932 - loss: 0.0343


[0.029525579884648323, 0.9927241802215576]