Putting the two together. Being able to see both hands and faces and making predictions on them.

In [2]:
import cv2
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt
from keras.models import load_model
from keras.preprocessing.image import img_to_array


# Load the pre-trained models
gesture_classifier = load_model('rock_paper_scissors_best_model.keras')
emotion_classifier = load_model('emotional_model_best.keras')

gesture_labels = ['Rock', 'Paper', 'Scissors']
emotion_labels = ['anger', 'happy']

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, 
                       max_num_hands=2, 
                       min_detection_confidence=0.5, 
                       min_tracking_confidence=0.5
                       )

# Load pre-trained face detector (Haar cascade)
face_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Convert the frame to RGB and grayscale better for facial detection
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Process the frame for hand detection
    hand_results = hands.process(frame_rgb)
    
    # Detect FACES in the frame
    faces = face_classifier.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)

    if hand_results.multi_hand_landmarks:
        for hand_landmarks in hand_results.multi_hand_landmarks:
            # Get bounding box of the hand
            h, w, c = frame.shape
            bbox = []
            for landmark in hand_landmarks.landmark:
                bbox.append((int(landmark.x * w), int(landmark.y * h)))
            bbox = np.array(bbox)
            x_min, y_min = np.min(bbox, axis=0)
            x_max, y_max = np.max(bbox, axis=0)

            # Extract HAND region of intrest (ROI) and preprocess for the classifier
            hand_roi = frame[y_min:y_max, x_min:x_max]
            if hand_roi.size > 0:
                hand_roi = cv2.resize(hand_roi, (224, 224))  # Resize to the size the model needs
                hand_roi = cv2.cvtColor(hand_roi, cv2.COLOR_BGR2RGB)
                hand_roi = img_to_array(hand_roi)
                hand_roi = np.expand_dims(hand_roi, axis=0) / 255.0  # Normalize

                # Predict the HAND gesture
                prediction = gesture_classifier.predict(hand_roi)[0]
                label = gesture_labels[np.argmax(prediction)]

                # Draw the bounding box and label on the frame
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                cv2.putText(frame, label, (x_min, y_min - 10), cv2.FONT_HERSHEY_TRIPLEX, 1, (0, 255, 0), 2)

            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    # Extract FACE region of intrest (ROI) and preprocess for the classifier
    for (x, y, w, h) in faces:
        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 255), 2)
        face_roi = frame[y:y+h, x:x+w]
        if face_roi.size > 0:
            face_roi = cv2.resize(face_roi, (224, 224))
            face_roi = cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB)
            face_roi = img_to_array(face_roi)
            face_roi = np.expand_dims(face_roi, axis=0) / 255.0

            # Predict the EMOTION
            prediction = emotion_classifier.predict(face_roi)[0]
            label = emotion_labels[np.argmax(prediction)]
            cv2.putText(frame, label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    cv2.imshow('Gesture and Emotion Recognition', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms