In [1]:
import os
import numpy as np
import cv2
import mediapipe as mp
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout,Bidirectional
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping, LearningRateScheduler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import random
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam  # Add this import
# Initialize mediapipe
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils


In [2]:
# Initialize Mediapipe
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Function to process images with Mediapipe
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results

# Function to extract keypoints and normalize them
def extract_keypoints(results, image):
    height, width, _ = image.shape  # Get image dimensions
    pose = np.array([[res.x * width, res.y * height, res.z, res.visibility] for res in results.pose_landmarks.landmark]) if results.pose_landmarks else np.zeros((33, 4))
    face = np.array([[res.x * width, res.y * height, res.z] for res in results.face_landmarks.landmark]) if results.face_landmarks else np.zeros((468, 3))
    lh = np.array([[res.x * width, res.y * height, res.z] for res in results.left_hand_landmarks.landmark]) if results.left_hand_landmarks else np.zeros((21, 3))
    rh = np.array([[res.x * width, res.y * height, res.z] for res in results.right_hand_landmarks.landmark]) if results.right_hand_landmarks else np.zeros((21, 3))
    return np.concatenate([pose.flatten(), face.flatten(), lh.flatten(), rh.flatten()])

# Draw styled landmarks
def draw_styled_landmarks(image, results):
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(
            image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(80, 22, 10), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(80, 44, 121), thickness=2, circle_radius=2)
        )
    if results.left_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(121, 22, 76), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(121, 44, 250), thickness=2, circle_radius=2)
        )
    if results.right_hand_landmarks:
        mp_drawing.draw_landmarks(
            image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
            mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4),
            mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
        )


In [3]:
# Data Augmentation Function
def augment_data(image, results):
    # Random flipping
    if random.random() > 0.5:
        image = cv2.flip(image, 1)  # Flip horizontally
    return image, results


In [5]:
# Define paths and actions
DATA_PATH = os.path.join('MP_Data')
actions = np.array(['Hello', 'Thanks', 'I like it'])  # Define actions
no_sequences = 20
sequence_length = 30

# Create necessary directories
for action in actions:
    action_path = os.path.join(DATA_PATH, action)
    os.makedirs(action_path, exist_ok=True)


In [47]:

# Open video capture
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    for action in actions:
        for sequence in range(no_sequences):
            for frame_num in range(sequence_length):
                ret, frame = cap.read()

                if not ret:
                    print("Failed to grab frame")
                    break

                # Process image and make detections
                image, results = mediapipe_detection(frame, holistic)

                # Apply data augmentation (if needed)
                image, results = augment_data(image, results)

                # Draw landmarks
                draw_styled_landmarks(image, results)

                # Extract and save keypoints
                keypoints = extract_keypoints(results, frame)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), f'{frame_num}.npy')
                os.makedirs(os.path.dirname(npy_path), exist_ok=True)
                np.save(npy_path, keypoints)

                if frame_num == 0:
                    cv2.putText(image, f'STARTING COLLECTION: {action}', (120, 200), 
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)
                    cv2.waitKey(500)
                else:
                    cv2.putText(image, f'Collecting frames for {action}, Video {sequence}', (15, 12), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break


: 

In [12]:
# Prepare labels and sequences
label_map = {label: num for num, label in enumerate(actions)}
sequences, labels = [], []

# Load sequences and labels from the data folder
for action in actions:
    action_path = os.path.join(DATA_PATH, action)
    for sequence in np.array(os.listdir(action_path)).astype(int):
        window = []
        for frame_num in range(sequence_length):
            # Load keypoint data for each frame
            res = np.load(os.path.join(action_path, str(sequence), f"{frame_num}.npy"))
            window.append(res)
        sequences.append(window)
        labels.append(label_map[action])

X = np.array(sequences)
y = to_categorical(labels).astype(int)  # One-hot encode the labels

# Split data into training and test sets (stratified splitting to maintain class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, stratify=y, random_state=42)


In [13]:
# Define model
model = Sequential()
model.add(LSTM(512, return_sequences=True, activation='relu', input_shape=(sequence_length, 1662)))
model.add(Dropout(0.3))
model.add(LSTM(256, return_sequences=True, activation='relu'))
model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=False, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))  # Added dropout to avoid overfitting
model.add(Dense(64, activation='relu'))
model.add(Dense(actions.shape[0], activation='softmax'))  # Number of actions (classes)

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['categorical_accuracy'])


  super().__init__(**kwargs)


In [14]:
# Callbacks
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Learning rate scheduler to reduce the learning rate after 10 epochs
def lr_schedule(epoch, lr):
    return lr * 0.9 if epoch > 10 else lr

lr_callback = LearningRateScheduler(lr_schedule)


In [15]:
model.fit(
    X_train, y_train,
    epochs=2000,
    validation_data=(X_test, y_test),
    callbacks=[tb_callback, early_stopping, lr_callback]
)

Epoch 1/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 4s/step - categorical_accuracy: 0.3681 - loss: 1365.8162 - val_categorical_accuracy: 0.3333 - val_loss: 2181.9324 - learning_rate: 0.0010
Epoch 2/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step - categorical_accuracy: 0.3706 - loss: 1521.3599 - val_categorical_accuracy: 0.0000e+00 - val_loss: 6613.2969 - learning_rate: 0.0010
Epoch 3/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - categorical_accuracy: 0.2692 - loss: 3260.8313 - val_categorical_accuracy: 0.3333 - val_loss: 6725.5728 - learning_rate: 0.0010
Epoch 4/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - categorical_accuracy: 0.3342 - loss: 7255.4409 - val_categorical_accuracy: 0.6667 - val_loss: 1186.7737 - learning_rate: 0.0010
Epoch 5/2000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - categorical_accuracy: 0.4031 - loss: 8077.8994 - val_categ

<keras.src.callbacks.history.History at 0x26a6baf2810>

In [16]:
# Save model
MODEL_SAVE_PATH = os.path.join('saved_models', 'action_recognition_model.h5')
os.makedirs('saved_models', exist_ok=True)
model.save(MODEL_SAVE_PATH)
print(f"Model saved at: {MODEL_SAVE_PATH}")






Model saved at: saved_models\action_recognition_model.h5


In [None]:
# Load model (if needed)
 model = load_model(MODEL_SAVE_PATH)
 print("Model loaded successfully!")

In [17]:
# Test predictions
yhat = model.predict(X_test)
ytrue = np.argmax(y_test, axis=1)
yhat = np.argmax(yhat, axis=1)

# Confusion Matrix and Accuracy
print("Confusion Matrix:")
print(confusion_matrix(ytrue, yhat))
print(f"Accuracy: {accuracy_score(ytrue, yhat)}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [0 0 1]]
Accuracy: 1.0


In [20]:
# Prediction logic and visualization
threshold = 0.7  # Confidence threshold for predictions
predictions = []
sentence = []

# Start live predictions
cap = cv2.VideoCapture(0)
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            print("Failed to grab frame")
            break

        image, results = mediapipe_detection(frame, holistic)
        draw_styled_landmarks(image, results)

        # Extract keypoints
        keypoints = extract_keypoints(results, frame)
        predictions.append(keypoints)
        predictions = predictions[-30:]  # Keep only the last 30 frames

        if len(predictions) == 30:  # Only predict once enough frames are collected
            res = model.predict(np.expand_dims(predictions, axis=0))[0]
            predicted_action = actions[np.argmax(res)]
            confidence = res[np.argmax(res)]

            if confidence > threshold:
                # Update sentence only if the predicted action is different from the last one
                if len(sentence) == 0 or predicted_action != sentence[-1]:
                    sentence.append(predicted_action)

        if len(sentence) > 5:  # Limit sentence length to 5 actions
            sentence = sentence[-5:]

        # Display sentence on the screen
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        cv2.imshow('OpenCV Feed', image)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 213ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 271ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [18]:
print("labels in dataset : ",np.unique(labels))
print("labels in prediction : ",np.unique(yhat))

labels in dataset :  [0 1 2]
labels in prediction :  [0 1 2]
