In [19]:
import cv2
import numpy as np
import mediapipe as mp
from collections import deque
from hsemotion_onnx.facial_emotions import HSEmotionRecognizer
import time

# Initialize Mediapipe's Face Mesh (used here only to locate the face through
# its landmarks) and the HSEmotionRecognizer that classifies the cropped face.
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    max_num_faces=1, refine_landmarks=True, min_detection_confidence=0.5, min_tracking_confidence=0.5
)

# Load the emotion recognition model
# (the note "enet_b2_8.pt" was left by the author -- presumably an alternative
# model that was considered; confirm before switching to it)
model_name = 'enet_b0_8_best_vgaf'  # Replace with 'enet_b0_8_va_mtl' if needed
emotion_recognizer = HSEmotionRecognizer(model_name=model_name)

# Number of most recent per-frame score vectors that are averaged for smoothing
maxlen = 15
recent_scores = deque(maxlen=maxlen)

# Initialize video capture (0 for webcam or a path for a video file)
cap = cv2.VideoCapture(0)

# try/finally so the camera, the preview window and the Face Mesh instance are
# released even when the loop raises or the kernel is interrupted; otherwise the
# webcam stays locked until the kernel is restarted.
try:
    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            break

        start_time = time.time()

        # Face Mesh and the recognizer are both fed the RGB version of the frame
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_mesh.process(frame_rgb)

        if results.multi_face_landmarks:
            height, width, _ = frame.shape
            for face_landmarks in results.multi_face_landmarks:
                # Bounding box = extent of all landmarks (landmark coordinates
                # are scaled by the frame size to get pixel positions)
                x_min, y_min = width, height
                x_max, y_max = 0, 0
                for lm in face_landmarks.landmark:
                    x, y = int(lm.x * width), int(lm.y * height)
                    x_min, y_min = min(x_min, x), min(y_min, y)
                    x_max, y_max = max(x_max, x), max(y_max, y)

                # Clamp to the frame so the slice below stays in bounds
                x_min, y_min = max(0, x_min), max(0, y_min)
                x_max, y_max = min(width, x_max), min(height, y_max)

                # Extract face ROI
                face_img = frame_rgb[y_min:y_max, x_min:x_max]
                if face_img.size == 0:
                    continue

                # Predict emotions; only the raw scores are used, the per-frame
                # label is replaced by the smoothed one computed below
                _, scores = emotion_recognizer.predict_emotions(face_img, logits=True)
                recent_scores.append(scores)

                # Average recent scores for smoother predictions
                avg_scores = np.mean(recent_scores, axis=0)
                emotion_idx = np.argmax(avg_scores)
                predicted_emotion = emotion_recognizer.idx_to_class[emotion_idx]

                # Draw bounding box and emotion label; the label baseline is
                # clamped so the text stays visible when the face touches the
                # top edge of the frame
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                label_y = max(y_min - 10, 20)
                cv2.putText(frame, predicted_emotion, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

        # Display FPS (guarded: a coarse clock can report zero elapsed time)
        elapsed = time.time() - start_time
        fps = int(1 / elapsed) if elapsed > 0 else 0
        cv2.putText(frame, f"FPS: {fps}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        # Show the frame
        cv2.imshow('Facial Emotions', frame)

        # Break on 'q' key
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break
finally:
    # Release resources
    cap.release()
    cv2.destroyAllWindows()
    face_mesh.close()


ModuleNotFoundError: No module named 'hsemotion_onnx'

In [15]:
!pip uninstall mediapipe

^C


In [9]:
!pip install mediapipe==0.9.0

ERROR: Could not find a version that satisfies the requirement mediapipe==0.9.0 (from versions: 0.10.13, 0.10.14, 0.10.18, 0.10.20)
ERROR: No matching distribution found for mediapipe==0.9.0


Ho avuto grossi problemi di compatibilità con Mediapipe, quindi ho deciso di non utilizzarlo affatto: alla fine veniva usato solo per la face detection, che adesso faccio con Haar Cascade.

Here's the updated implementation using Haar Cascade for face detection and hsemotion_onnx for emotion recognition:

**Key Points**

- **Face detection with Haar Cascade:** The Haar Cascade is used for detecting faces. It is simple and lightweight, and it ships with OpenCV, which avoids the Mediapipe compatibility issues.
- **Emotion recognition:** Once a face is detected, the cropped face is passed to `HSEmotionRecognizer` for emotion classification.
- **Handling multiple faces:** If multiple faces are detected, the script processes each face separately and annotates each one with its detected emotion.
- **Recent scores buffer:** A `deque` is used to average emotion scores over the last `maxlen` frames for smoother emotion predictions. Note that this single buffer is shared by all detected faces, so the smoothing is only meaningful when one face is in view.
**Requirements**

Ensure you have the following installed:

- OpenCV: `pip install opencv-python`
- HSEmotion ONNX: `pip install hsemotion-onnx`. The model file is downloaded automatically the first time the recognizer is created.

Run this script, and it should work seamlessly without requiring Mediapipe. Let me know if you need further adjustments!


In [23]:
import cv2
import numpy as np
from collections import deque
from hsemotion_onnx.facial_emotions import HSEmotionRecognizer

# Initialize Haar Cascade for face detection (the XML file ships with OpenCV)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the file cannot be read; it just stays
# empty. Fail here with a clear message instead of deep inside the video loop.
if face_cascade.empty():
    raise IOError("Could not load Haar cascade 'haarcascade_frontalface_default.xml'")

# Initialize the emotion recognizer
# (the note "enet_b2_8.pt" was left by the author -- presumably an alternative
# model that was considered; confirm before switching to it)
model_name = 'enet_b0_8_best_vgaf'
emotion_recognizer = HSEmotionRecognizer(model_name=model_name)

# Buffer holding the score vectors of the last `maxlen` predictions; their mean
# is used to smooth the displayed emotion
maxlen = 15
recent_scores = deque(maxlen=maxlen)

def process_video(video_file=0):
    """Preview a video source with a smoothed emotion label on every detected face.

    Faces are located with the module-level ``face_cascade``. Each face crop is
    scored by the module-level ``emotion_recognizer``; the score vector is pushed
    into ``recent_scores`` and the label drawn is the arg-max of the mean of the
    vectors currently in that buffer. The preview closes when ESC is pressed or
    when the source stops delivering frames.

    Args:
        video_file: Source passed straight to ``cv2.VideoCapture``
            (0 for the webcam or a path for a video file).

    Note:
        ``recent_scores`` is a single buffer shared by all faces, so with more
        than one face in view the averaged scores of the faces are mixed.
    """
    cap = cv2.VideoCapture(video_file)

    # Start from an empty buffer so scores left over from a previous run do not
    # bias the first frames of this one.
    recent_scores.clear()

    # try/finally: release the camera and close the window even if the loop
    # raises or the kernel is interrupted.
    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            # Convert to grayscale for Haar Cascade
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Detect faces
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            for (x, y, w, h) in faces:
                face_bgr = frame[y:y+h, x:x+w]
                if face_bgr.size == 0:
                    continue

                try:
                    # OpenCV frames are BGR. Convert the crop to RGB before
                    # prediction, as the Mediapipe-based version of this
                    # script already did.
                    face_rgb = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2RGB)
                    _, scores = emotion_recognizer.predict_emotions(face_rgb, logits=True)
                    recent_scores.append(scores)

                    # Compute the average score across recent frames
                    avg_scores = np.mean(recent_scores, axis=0)
                    emotion_idx = np.argmax(avg_scores)
                    emotion_label = emotion_recognizer.idx_to_class[emotion_idx]

                    # Draw the box and the label; the label baseline is clamped
                    # so the text stays visible for faces at the top edge.
                    cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
                    label_y = max(y - 10, 20)
                    cv2.putText(frame, emotion_label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

                except Exception as e:
                    # Best effort: a failing face must not stop the preview
                    print(f"Error processing face: {e}")

            # Show the video frame with emotion labels
            cv2.imshow('Facial Emotions', frame)

            # Break on 'ESC' key press
            if cv2.waitKey(1) & 0xFF == 27:
                break
    finally:
        # Release resources
        cap.release()
        cv2.destroyAllWindows()

# Start the preview on the default source (0) when this cell/script is the
# entry point.
if __name__ == "__main__":
    process_video(video_file=0)


Downloading enet_b0_8_best_vgaf from https://github.com/HSE-asavchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/onnx/enet_b0_8_best_vgaf.onnx?raw=true


In [35]:
import cv2
import numpy as np
from collections import deque
from hsemotion_onnx.facial_emotions import HSEmotionRecognizer
import time

# Initialize Haar Cascade for face detection (the XML file ships with OpenCV)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the file cannot be read; it just stays
# empty. Fail here with a clear message instead of deep inside the video loop.
if face_cascade.empty():
    raise IOError("Could not load Haar cascade 'haarcascade_frontalface_default.xml'")

# Initialize the emotion recognizer
# Model names noted by the author (presumably the candidates that were tried;
# confirm availability before switching):
#   enet_b2_8(.pt)
#   enet_b0_8_best_vgaf
model_name = 'enet_b0_8_best_vgaf'
emotion_recognizer = HSEmotionRecognizer(model_name=model_name)

# Buffer holding the score vectors of the last `maxlen` predictions; their mean
# is used to smooth the displayed emotion
maxlen = 15
recent_scores = deque(maxlen=maxlen)

def process_video(video_file=0):
    """Preview a video source with smoothed emotion labels and an FPS counter.

    Faces are located with the module-level ``face_cascade``. Each face crop is
    scored by the module-level ``emotion_recognizer``; the score vector is pushed
    into ``recent_scores`` and the label drawn is the arg-max of the mean of the
    vectors currently in that buffer. The preview ends when 'q' is pressed, when
    the window is closed, or when the source stops delivering frames.

    Args:
        video_file: Source passed straight to ``cv2.VideoCapture``
            (0 for the webcam or a path for a video file).

    Returns:
        None. If the source cannot be opened an error line is printed and the
        function returns immediately.

    Note:
        ``recent_scores`` is a single buffer shared by all faces, so with more
        than one face in view the averaged scores of the faces are mixed.
    """
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    # Start from an empty buffer so scores left over from a previous run do not
    # bias the first frames of this one.
    recent_scores.clear()

    # Variables for FPS calculation (fps is measured frame-to-frame)
    prev_time = time.time()
    fps = 0

    # try/finally: release the camera and close the window even if the loop
    # raises or the kernel is interrupted.
    try:
        while True:
            success, frame = cap.read()
            if not success:
                break

            # Convert to grayscale for Haar Cascade
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Detect faces
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            for (x, y, w, h) in faces:
                face_bgr = frame[y:y+h, x:x+w]
                if face_bgr.size == 0:
                    continue

                try:
                    # OpenCV frames are BGR. Convert the crop to RGB before
                    # prediction, as the Mediapipe-based version of this
                    # script already did.
                    face_rgb = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2RGB)
                    _, scores = emotion_recognizer.predict_emotions(face_rgb, logits=True)
                    recent_scores.append(scores)

                    # Compute the average score across recent frames
                    avg_scores = np.mean(recent_scores, axis=0)
                    emotion_idx = np.argmax(avg_scores)
                    emotion_label = emotion_recognizer.idx_to_class[emotion_idx]

                    # Draw the box and the label; the label baseline is clamped
                    # so the text stays visible for faces at the top edge.
                    cv2.rectangle(frame, (x, y), (x+w, y+h), (255, 0, 0), 2)
                    label_y = max(y - 10, 20)
                    cv2.putText(frame, emotion_label, (x, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

                except Exception as e:
                    # Best effort: a failing face must not stop the preview
                    print(f"Error processing face: {e}")

            # Calculate and display FPS. A coarse clock can report the same
            # timestamp twice; keep the previous value instead of dividing by 0.
            curr_time = time.time()
            elapsed = curr_time - prev_time
            if elapsed > 0:
                fps = 1 / elapsed
            prev_time = curr_time
            fps_text = f"FPS: {fps:.2f}"
            cv2.putText(frame, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 255), 2)

            # Show the video frame with emotion labels
            cv2.imshow('Facial Emotions', frame)

            # Break on 'q' key press or window close
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            if cv2.getWindowProperty('Facial Emotions', cv2.WND_PROP_VISIBLE) < 1:
                break
    finally:
        # Release resources
        cap.release()
        cv2.destroyAllWindows()

# Start the preview on the default source (0) when this cell/script is the
# entry point.
if __name__ == "__main__":
    process_video(video_file=0)
