
Vision Transformer (ViT) for Facial Expression Recognition


### **Explanation**:
1. **Model and Processor**:
   - The model (`AutoModelForImageClassification`) and processor (`AutoImageProcessor`) are loaded from Hugging Face. 
   - The processor handles image preprocessing like resizing, normalization, and tensor conversion.
   
2. **Face Detection**:
   - OpenCV's Haar cascade is used to detect faces in the video frame.
   - Each detected face is cropped from the frame (`face_roi`) for emotion classification.

3. **Emotion Analysis**:
   - The face is converted into a PIL image and passed through the processor for preprocessing.
   - The preprocessed image is fed into the model to get the logits.
   - Softmax is applied to convert logits to probabilities, and the class with the highest probability is selected as the predicted emotion.

4. **Visualization**:
   - Bounding boxes are drawn around the detected faces.
   - The predicted emotion is displayed as a label near the bounding box.



In [None]:
import cv2
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torchvision.transforms import functional as F
from PIL import Image

# Load the pretrained ViT expression classifier and its matching image processor
processor = AutoImageProcessor.from_pretrained("trpakov/vit-face-expression")
model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression")

# Load OpenCV's Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# Start video capture from device index 0
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Not fatal: the first cap.read() below fails and the loop exits immediately,
    # but without this message the cell would end silently.
    print("Error: Could not open video source '0'.")

# try/finally guarantees the camera and the preview window are released even when
# the loop is interrupted (e.g. the notebook kernel is stopped) or raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

        for (x, y, w, h) in faces:
            # Crop the detected face from the colour frame
            face_roi = frame[y:y + h, x:x + w]

            try:
                # OpenCV frames are BGR; convert to RGB before building the PIL image
                face_pil = Image.fromarray(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
                inputs = processor(face_pil, return_tensors="pt")

                # Inference only: disable autograd so no graph is built per frame
                with torch.no_grad():
                    outputs = model(**inputs)
                    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
                predicted_class = probs.argmax().item()
                predicted_label = model.config.id2label[predicted_class]

                # Draw bounding box and label
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.putText(frame, predicted_label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            except Exception as e:
                # Best effort: a failure on one face must not stop the video loop
                print(f"Error analyzing face: {e}")

        # Display the video feed
        cv2.imshow("Emotion Detection", frame)

        # Exit on 'q' key or if the window is closed.
        # Mask to the low byte so the comparison with ord('q') ignores any high bits
        # in the waitKey() return value.
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q') or cv2.getWindowProperty("Emotion Detection", cv2.WND_PROP_VISIBLE) < 1:
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


Displaying the current **frames per second (FPS)** on the preview window: FPS can be estimated in OpenCV by measuring the time that elapses between consecutive frames. By recording a timestamp on every loop iteration and comparing it with the timestamp of the previous iteration, you can compute the FPS and draw it on the video stream.


### **Changes for FPS Calculation**:
1. **Track Time for FPS**:
   - We call `time.time()` once per loop iteration, right after a frame has been read, and compare it with the timestamp stored during the previous iteration.
   - The FPS is calculated as the inverse of the difference between the current time and the previous time (`1 / (curr_time - prev_time)`).

2. **Display FPS**:
   - We use `cv2.putText()` to draw the FPS value on the frame. The text is placed in the top-left corner of the window (`(10, 30)`).

3. **Frame Processing**:
   - For each frame, we calculate the FPS and update the `prev_time` for the next frame.


- FPS is calculated based on the time elapsed between frames. The faster the frames are processed, the higher the FPS will be. This gives you a real-time indication of how many frames are processed per second.
  


In [None]:
import cv2
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import time

# Load the pretrained ViT expression classifier and its matching image processor
processor = AutoImageProcessor.from_pretrained("trpakov/vit-face-expression")
model = AutoModelForImageClassification.from_pretrained("trpakov/vit-face-expression")

# Load OpenCV's Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# Start video capture from device index 0
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Not fatal: the first cap.read() below fails and the loop exits immediately,
    # but without this message the cell would end silently.
    print("Error: Could not open video source '0'.")

# FPS calculation variables.
# Seed with the current time (not 0) so the first reading is a real frame interval
# instead of "one frame since the Unix epoch", which would display as FPS: 0.00.
prev_time = time.time()

# try/finally guarantees the camera and the preview window are released even when
# the loop is interrupted (e.g. the notebook kernel is stopped) or raises.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Get the current time
        curr_time = time.time()

        # Calculate FPS (frames per second) as the inverse of the time between
        # two consecutive frames; guard against a zero interval on coarse clocks.
        elapsed = curr_time - prev_time
        fps = 1 / elapsed if elapsed > 0 else 0.0
        prev_time = curr_time

        # Convert frame to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

        for (x, y, w, h) in faces:
            # Crop the detected face from the colour frame
            face_roi = frame[y:y + h, x:x + w]

            try:
                # OpenCV frames are BGR; convert to RGB before building the PIL image
                face_pil = Image.fromarray(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
                inputs = processor(face_pil, return_tensors="pt")

                # Inference only: disable autograd so no graph is built per frame
                with torch.no_grad():
                    outputs = model(**inputs)
                    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
                predicted_class = probs.argmax().item()
                predicted_label = model.config.id2label[predicted_class]

                # Draw bounding box and label
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                cv2.putText(frame, predicted_label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            except Exception as e:
                # Best effort: a failure on one face must not stop the video loop
                print(f"Error analyzing face: {e}")

        # Display FPS on the frame (top-left corner)
        cv2.putText(frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Display the video feed with detected emotions
        cv2.imshow("Emotion Detection", frame)

        # Exit on 'q' key or if the window is closed.
        # Mask to the low byte so the comparison with ord('q') ignores any high bits
        # in the waitKey() return value.
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q') or cv2.getWindowProperty("Emotion Detection", cv2.WND_PROP_VISIBLE) < 1:
            break
finally:
    cap.release()
    cv2.destroyAllWindows()


Final version of the code. For additional details, refer to `Prototype/FER/Models/Info.md`.

In [2]:
import cv2
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import numpy as np
import time
import os
from collections import deque

# Hugging Face checkpoint shared by the image processor and the classifier
_MODEL_ID = "trpakov/vit-face-expression"
processor = AutoImageProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageClassification.from_pretrained(_MODEL_ID)

# Frontal-face Haar cascade that ships with OpenCV
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

# Sliding window holding the most recent score vectors; once it is full,
# appending a new entry discards the oldest one
maxlen = 15
recent_scores = deque([], maxlen)

def get_log_filename(base_name="logs/emotion_log"):
    """Return the first unused log path of the form ``<base_name>_<n>.txt``.

    The directory part of ``base_name`` is created if it does not exist, so the
    returned path can be opened for writing straight away. ``n`` starts at 0 and
    is incremented until a path is found that does not exist yet.

    Args:
        base_name: Path prefix of the log file, without counter or extension.

    Returns:
        The first candidate path that does not exist on disk. The file itself
        is not created here.
    """
    # Create the directory the log will actually live in (derived from base_name
    # rather than hard-coded), and skip it when base_name has no directory part,
    # because os.makedirs("") raises.
    log_dir = os.path.dirname(base_name)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    counter = 0
    while True:
        filename = f"{base_name}_{counter}.txt"
        if not os.path.exists(filename):
            return filename
        counter += 1

def _predict_smoothed_emotion(face_roi):
    """Classify one BGR face crop and return ``(label, averaged_scores)``."""
    # OpenCV frames are BGR; convert to RGB before building the PIL image
    face_pil = Image.fromarray(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
    inputs = processor(face_pil, return_tensors="pt")

    # Inference only: disable autograd so no graph is built per frame
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    recent_scores.append(probs.numpy())

    # Smooth the prediction by averaging over the buffered score vectors
    avg_scores = np.mean(recent_scores, axis=0).squeeze()
    predicted_class = int(np.argmax(avg_scores))
    return model.config.id2label[predicted_class], avg_scores


def process_video(video_file, show_preview=True):
    """Run facial-expression recognition on a video source and log the results.

    Faces are detected in every frame with the Haar cascade, each face crop is
    classified, and the class scores are averaged over the most recent
    predictions held in ``recent_scores``. That buffer is shared by all faces,
    so with several faces in view their scores are averaged together.

    One line per detected face is written to a new file obtained from
    ``get_log_filename()``: a ``Timestamp,Emotion,Score`` header, then the
    timestamp (two decimals), the predicted label and the list of averaged
    scores. A frame count and the average FPS are appended at the end and also
    printed.

    Args:
        video_file: ``"camera"`` to read from capture device 0, otherwise a
            value passed unchanged to ``cv2.VideoCapture`` (e.g. a file path).
            For the camera the logged timestamp is ``time.time()``; otherwise
            it is the current position in the video, in seconds.
        show_preview: When true, show the annotated frames in a window; the
            run can then be stopped with ``q`` or by closing the window.

    Returns:
        None. If the source cannot be opened, an error is printed and no log
        file is created.
    """
    # Determine video source
    video_source = 0 if video_file == "camera" else video_file

    # Open video capture
    cap = cv2.VideoCapture(video_source)
    if not cap.isOpened():
        print(f"Error: Could not open video source '{video_source}'.")
        cap.release()
        return

    # Start from an empty smoothing window so that scores left over from an
    # earlier call do not bias the first predictions of this run
    recent_scores.clear()

    # try/finally + with: the capture, the log file and the preview window are
    # released even if processing is interrupted or raises
    try:
        # Generate a unique log filename
        log_filename = get_log_filename()
        with open(log_filename, "w") as log_file:
            log_file.write("Timestamp,Emotion,Score\n")

            # Variables for FPS calculation
            frame_count = 0
            start_time = time.time()

            while True:
                success, frame = cap.read()
                if not success:
                    break

                # Convert to grayscale for face detection
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

                for (x, y, w, h) in faces:
                    try:
                        predicted_label, avg_scores = _predict_smoothed_emotion(frame[y:y + h, x:x + w])

                        if video_file == "camera":
                            # Live feed: wall-clock time
                            timestamp = time.time()
                        else:
                            # Video file: playback position, converted from ms to seconds
                            timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0

                        # Log emotion and score
                        log_file.write(f"{timestamp:.2f},{predicted_label},{avg_scores.tolist()}\n")

                        # Display the detected emotion on the video (if preview is enabled)
                        if show_preview:
                            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
                            cv2.putText(frame, predicted_label, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

                    except Exception as e:
                        # Best effort: a failure on one face must not stop the run
                        print(f"Error analyzing face: {e}")

                # Increment frame count
                frame_count += 1

                # Show the video frame with emotion labels (if preview is enabled)
                if show_preview:
                    cv2.imshow("ViT Expression Recognition", frame)

                    # Break on 'q' key press or window close
                    if cv2.waitKey(1) & 0xFF == ord('q'):
                        break
                    if cv2.getWindowProperty("ViT Expression Recognition", cv2.WND_PROP_VISIBLE) < 1:
                        break

            # Calculate average FPS over the whole run (guard against a zero interval)
            total_time = time.time() - start_time
            fps = frame_count / total_time if total_time > 0 else 0.0
            print(f"Processed {frame_count} frames in {total_time:.2f} seconds. FPS: {fps:.2f}")

            # Log the FPS
            log_file.write(f"\nProcessed {frame_count} frames in {total_time:.2f} seconds.\n")
            log_file.write(f"FPS: {fps:.2f}\n")
    finally:
        # Release resources
        cap.release()
        if show_preview:
            cv2.destroyAllWindows()
            # NOTE(review): presumably here so the GUI event loop can process the
            # window close on some platforms - confirm before removing
            cv2.waitKey(1)

if __name__ == '__main__':
    # Analyse the bundled example clip; set video_file to "camera" to use the live feed instead
    process_video(
        video_file='../ExampleVideos/cicciogamer89.mp4',
        show_preview=True,
    )


Processed 963 frames in 189.03 seconds. FPS: 5.09
