# Emotion detection on a video file

https://www.youtube.com/watch?v=44elcDAJWyo

### Imports

In [33]:
from collections import defaultdict
import cv2
from PIL import Image
import matplotlib.pyplot as plt

import torch
from torchvision import transforms
from ultralytics import YOLO

from emotions_utils import IDX_TO_CLASS

In [2]:
# Path of the input video that is opened with cv2.VideoCapture.
VIDEO_PATH: str = "emotions_video.mp4"
# Checkpoint file that is passed to torch.load to restore the emotion classifier.
EMOTION_MODEL_PATH: str = "emotion_model.pth"
# Side length (pixels) of the square every frame is resized to before face detection.
FRAME_SIZE: int = 640

Emotion classification and face detection models

In [3]:
from DDAMFN.networks.DDAM import DDAMNet

# Build the DDAMNet emotion classifier and restore its trained weights.
classification_model = DDAMNet(pretrained=False)
# Deserialize onto the CPU first: this works no matter which device (or CUDA
# index) the checkpoint was saved from, and avoids holding a second copy of the
# weights on the GPU while they are copied into the model.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
checkpoint = torch.load(EMOTION_MODEL_PATH, map_location="cpu")
classification_model.load_state_dict(checkpoint['model_state_dict'])
# Inference only: put the module in evaluation mode before moving it to the GPU.
classification_model.eval()
classification_model = classification_model.to("cuda")

In [4]:
# Preprocessing applied to every face crop before it is fed to the classifier:
# resize to 112x112, convert to a tensor, then normalize each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics --
# confirm they match what the emotion model was trained with.
classification_transforms = transforms.Compose(
    [
        transforms.Resize((112, 112)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Softmax over dimension 1 of the classifier output.
softmax = torch.nn.Softmax(dim=1)

def classify_emotions(frame):
    """Return the emotion probabilities for a single face crop.

    Args:
        frame: Image accepted by ``classification_transforms`` (the caller
            passes a PIL image of the cropped face).

    Returns:
        CPU tensor holding the softmax of the classifier's first output.
        Presumably of shape (1, num_classes) -- TODO confirm against DDAMNet.
    """
    # Follow whatever device the model lives on instead of hard-coding one.
    device = next(classification_model.parameters()).device
    batch = classification_transforms(frame).unsqueeze(0).to(device)

    # Pure inference: skip building the autograd graph instead of detaching
    # the result afterwards.
    with torch.no_grad():
        emotions = classification_model(batch)[0]

    return softmax(emotions.cpu())

Face detection model source: https://github.com/akanametov/yolov8-face

In [29]:
# Load the YOLOv8 face-detection weights and move the detector to the GPU.
detection_model = YOLO("yolov8n-face.pt").to("cuda")

In [30]:
def detect_faces(frame, verbose=False):
    """Detect and track faces in one video frame.

    Args:
        frame: Image passed straight to ``detection_model.track``.
        verbose: Forwarded to the detector to toggle its console output.

    Returns:
        Whatever ``detection_model.track`` returns for this frame (the caller
        takes element ``[0]`` and reads its ``boxes``).
    """
    # This is called once per frame, so the tracker must be kept alive between
    # calls (persist=True); otherwise it is re-created every time and track ids
    # are not stable from one frame to the next.
    return detection_model.track(frame, conf=0.5, persist=True, verbose=verbose)

Main algorithm

In [31]:
# Text style of the emotion label that is drawn on each frame with cv2.putText.
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 1
# The frame is treated as BGR (it is converted with COLOR_BGR2RGB before
# classification), so (0, 0, 255) is red.
COLOR = (0, 0, 255)
THICKNESS = 2 

In [34]:
# Open the input video and fail fast when OpenCV cannot open it.
cam = cv2.VideoCapture(VIDEO_PATH)
if not cam.isOpened():
    raise Exception("Can not read input")

# Properties of the input stream; the output video reuses its resolution and
# frame rate.
INITIAL_RES = tuple(
    int(cam.get(prop))
    for prop in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT)
)
FPS = cam.get(cv2.CAP_PROP_FPS)
FPS_COUNT = cam.get(cv2.CAP_PROP_FRAME_COUNT)

# XVID-encoded AVI writer for the annotated frames.
fourcc = cv2.VideoWriter_fourcc(*'XVID')
writer = cv2.VideoWriter('output.avi', fourcc, FPS, INITIAL_RES)

In [35]:
# Process the video frame by frame: detect/track faces, classify the emotion of
# every face, draw the result and append the annotated frame to the output video.
frame_count = 0
# Per track id: the most recent (center_x, center_y) positions of that face,
# capped at roughly one second worth of frames.
track_history = defaultdict(list)

try:
    while True:
        success, frame = cam.read()
        if not success:
            # read() also fails when the stream is simply exhausted, and the
            # reported frame count is not reliable enough to tell the two
            # apart -- only treat it as an error when nothing was read at all.
            if frame_count == 0:
                raise Exception("Error during reading the frames")
            break
        frame_count += 1

        frame = cv2.resize(frame, (FRAME_SIZE, FRAME_SIZE))
        # Convert once per frame and before anything is drawn, so that face
        # crops never contain boxes/labels drawn for previously handled faces.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        faces = detect_faces(frame, verbose=False)[0]
        for bbox in faces.boxes:
            x1, y1, x2, y2 = (int(value) for value in bbox.xyxy[0])
            # Clamp to the frame and skip degenerate boxes: an empty crop
            # cannot be turned into an image for the classifier.
            x1, y1 = max(x1, 0), max(y1, 0)
            x2, y2 = min(x2, FRAME_SIZE), min(y2, FRAME_SIZE)
            if x2 <= x1 or y2 <= y1:
                continue

            # The id is missing when the tracker has no track for this box.
            if bbox.id is not None:
                center_x, center_y, _, _ = bbox.xywh[0]
                track = track_history[bbox.id.int().item()]
                track.append((center_x, center_y))
                if len(track) > FPS:
                    track.pop(0)

            region_of_interest = Image.fromarray(rgb[y1:y2, x1:x2])
            emotions = classify_emotions(region_of_interest)
            predicted_emotion = emotions.argmax().item()

            # Class index -> probability, ordered from most to least likely.
            distribution = dict(
                sorted(
                    enumerate(emotions.flatten().tolist()),
                    key=lambda item: item[1],
                    reverse=True,
                )
            )

            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0))
            cv2.putText(frame, IDX_TO_CLASS[predicted_emotion],
                        (x1, y1), FONT, FONT_SCALE, COLOR, THICKNESS)

        # Back to the input resolution, which is what the writer was opened with.
        frame = cv2.resize(frame, INITIAL_RES)

        writer.write(frame)
        cv2.imshow("Camera video", frame)
        if cv2.waitKey(1) == ord("q"):
            break
finally:
    # Always finalize the output file and free the capture/window, even when
    # processing stops because of an error.
    writer.release()
    cam.release()
    cv2.destroyAllWindows()