## Demo Emotion Recognition: audio + video
### Libraries and parameters

In [1]:
# Utilities
import os
import subprocess
import numpy as np
import keras
from keras_cv.layers import RandomCutout
# Audio and video manipulation
import moviepy.editor as mp
import cv2
import librosa
from joblib import load
import tkinter as tk
from PIL import ImageTk, Image

You do not have pycocotools installed, so KerasCV pycoco metrics are not available. Please run `pip install pycocotools`.
You do not have pyococotools installed, so the `PyCOCOCallback` API is not available.
You do not have Waymo Open Dataset installed, so KerasCV Waymo metrics are not available.


In [2]:
# Labels dictionary
# `emotions_tras` translates the emotion code parsed from a file name (minus one,
# see where `label` is computed) into the class index used as key of `emotions`.
# NOTE(review): the file-name coding scheme is presumably that of the dataset the
# models were trained on -- confirm against the dataset documentation.
emotions_tras = {1: 1, 2: 4, 3: 5, 4: 0, 5: 3, 6: 2, 7: 6}
# Class index (position in the models' output vector) -> emotion name.
emotions = {0: 'angry', 1: 'calm', 2: 'disgust', 3: 'fear', 4: 'happy', 5: 'sad', 6: 'surprise'}

# Paths
dataset_path = "./Examples/"
haar_path = './Other/haarcascade_frontalface_default.xml'
parameters_path = './Other/audio_test/std_scaler.bin'
models_video_path = "./Models/Video_Stream/"
models_audio_path = "./Models/Audio_Stream/"
# Player used to show the selected video. Set the VLC_PATH environment variable
# to point at your own vlc executable; otherwise the original default is used.
vlc_path = os.environ.get("VLC_PATH") or "D:/Program Files/VLC/vlc.exe"

# Audio video parameters
height_targ = 112  # height (pixels) of the face crops fed to the video model
width_targ = 112   # width (pixels) of the face crops fed to the video model
sr = 48000         # sampling rate (Hz) used when exporting the audio track

### Create Window App

In [3]:
# Selection window: asks the user for the index of the example video to analyse.
# The chosen index is stored in the global `example` (an int) once validated.
root = tk.Tk()

root.title("Selection Emotions")

canvas1 = tk.Canvas(root, bg="#EEDFCC", width=650, height=500)
canvas1.pack()

label1 = tk.Label(root, text='App For Multimodal Emotion Recogntion', bg="#EEDFCC")
label1.config(font=('helvetica', 23, 'bold', 'italic'))
canvas1.create_window(325, 40, window=label1)

label2 = tk.Label(root, text='Demonstration of fusion model (video + audio) for each of 7 emotions.', bg="#EEDFCC")
label2.config(font=('Times', 13))
canvas1.create_window(325, 100, window=label2)

label3 = tk.Label(root, text='Number from 0 to 6:', bg="#EEDFCC")
label3.config(font=('helvetica', 0))
canvas1.create_window(325, 225, window=label3)

label4 = tk.Label(root, text='0. Calm\n1. Happy\n2. Sad\n3. Angry\n4. Fear\n5. Disgust\n6 Surprise', bg="#EEDFCC")
label4.config(font=('Times', 13))
canvas1.create_window(325, 310, window=label4)

# `example` stays None until a valid choice is confirmed, so a stale value from
# a previous run of this cell can never be picked up by the following cells.
example = None
max_choice = len(emotions) - 1  # the menu above lists one entry per emotion

# The entry is placed through the canvas only; a widget can be managed by a
# single geometry manager, so the former extra `pack()` call was redundant.
example_entry = tk.Entry(root)
canvas1.create_window(325, 150, window=example_entry)


def display_text():
    """Validate the typed number, store it in `example` and close the window.

    The entry must contain an integer between 0 and `max_choice` (inclusive).
    On invalid input the window stays open and the prompt label turns into an
    error hint, so the user can correct the value.
    """
    global example
    try:
        choice = int(example_entry.get())
    except ValueError:
        choice = None
    if choice is None or not 0 <= choice <= max_choice:
        label3.config(text=f'Please enter a whole number from 0 to {max_choice}:', fg='red')
        return
    example = choice
    root.destroy()


button1 = tk.Button(root, text='Select Emotion', command=display_text, font=('helvetica', 12, 'bold'), relief="groove", bg="#CDAA7D")
canvas1.create_window(325, 185, window=button1)

# Keep a reference to the PhotoImage in `img`: Tk does not hold one itself.
img = ImageTk.PhotoImage(Image.open("./Other/UTH-logo-english.png").resize((120,120)))
label5 = tk.Label(root, image = img, bg="#EEDFCC")
canvas1.create_window(100, 430, window=label5)

label6 = tk.Label(root, text="Marios-Chrysostomos Askitis: 2760\nFreris Leonardos: 2696", bg="#EEDFCC")
label6.config(font=('Times', 11))
canvas1.create_window(325, 440, window=label6)

label7 = tk.Label(root, text="Autumn 2023", bg="#EEDFCC")
label7.config(font=('Times', 10))
canvas1.create_window(590, 470, window=label7)

# Blocks until the window is closed (by a valid selection or by the user).
root.mainloop()

Map the emotion code found in the selected file name to the models' label index in order to obtain the ground-truth label.

In [4]:
# os.listdir returns entries in arbitrary order; sort them so that the index
# chosen in the selection window always refers to the same file.
fn = sorted(os.listdir(dataset_path))
filename = os.path.join(dataset_path, fn[example])
# The third '-'-separated field of the file name holds the emotion code;
# (code - 1) is translated to the models' class index through `emotions_tras`.
label = emotions_tras[int(fn[example].split('-')[2]) - 1]

Play the selected emotion video.

In [5]:
# Open the selected clip in VLC; '--play-and-exit' is passed so the player is
# asked to quit after playback. subprocess.call waits for the process and
# returns its exit code, which is kept in `player`.
vlc_command = [vlc_path, filename, '--play-and-exit']
player = subprocess.call(vlc_command)


### Data preparation
#### Video

Extract frames of the video.

In [6]:
# Sample frames from the selected video, crop the detected face from each one
# and collect the normalised grayscale crops in `frames`.
cap = cv2.VideoCapture(filename)
if not cap.isOpened():
    raise IOError(f"Could not open video file: {filename}")
haar_cascade = cv2.CascadeClassifier(haar_path)
frames = []
count = 0
skip = 3  # a frame is sampled when `count` is a multiple of `skip` (after the first 20)

try:
    # Loop through all frames
    while True:
        # Capture frame; stop as soon as the stream is exhausted
        ret, frame = cap.read()
        if not ret:
            break
        if count % skip == 0 and count > 20:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # detect and crop face
            faces = haar_cascade.detectMultiScale(frame, scaleFactor=1.12, minNeighbors=9)
            if len(faces) != 1:
                # `count` is deliberately left unchanged here (as in the original
                # code), so the very next frame is tried instead.
                # NOTE(review): this shifts the sampling grid; it is kept because
                # changing it would alter which frames reach the model -- confirm.
                continue
            x, y, w, h = faces[0]
            face = frame[y:y + h, x:x + w]

            # cv2.resize expects the target size as (width, height); resize with a
            # 5-pixel margin on every side, then crop the margin away.
            face = cv2.resize(face, (width_targ + 10, height_targ + 10))
            face = face[5:-5, 5:-5]
            face = face/255.  # scale pixel values to [0, 1]
            frames.append(face)
        count += 1
finally:
    # Always free the underlying video handle, even if processing fails.
    cap.release()

frames = np.array(frames)
num_frames = len(frames)
labels = [label] * num_frames
print('shape frames:', frames.shape)

shape frames: (32, 112, 112)


#### Audio

We extract the audio track from the clip and save it as a .wav file in the "Other" folder.

In [7]:
# Export the audio track of the selected clip as a .wav file at `sr` Hz.
audiofile = mp.AudioFileClip(filename).set_fps(sr)
try:
    audiofile.write_audiofile("./Other/example.wav")
finally:
    # Release the reader that MoviePy keeps open on the source file.
    audiofile.close()

MoviePy - Writing audio in ./Other/example.wav


                                                                 

MoviePy - Done.




We load the audio file and trim the silence. Then we pad it with zeros so that it has the length the model requires.

In [8]:
# Number of samples the signal must have before feature extraction.
target_len = 93696

# NOTE(review): no `sr` is passed to librosa.load, so the signal is returned at
# librosa's default rate (available in `sampling_rate`), not at `sr` -- confirm
# this matches how the audio model was trained.
y, sampling_rate = librosa.load("./Other/example.wav")
# Remove leading/trailing parts that are more than 30 dB below the peak.
y_trimmed, _ = librosa.effects.trim(y, top_db = 30)

missing = target_len - len(y_trimmed)
if missing > 0:
    # Too short: pad with zeros, split as evenly as possible on both sides.
    start_pad = missing // 2
    end_pad = missing - start_pad
    y_final = np.pad(y_trimmed, (start_pad, end_pad), mode = 'constant')
elif missing < 0:
    # Too long: the pad widths would be negative and np.pad would raise, so keep
    # the central `target_len` samples instead (mirrors the symmetric padding).
    start_crop = (-missing) // 2
    y_final = y_trimmed[start_crop:start_crop + target_len]
else:
    y_final = y_trimmed

We extract the mel spectrogram features, load the scaler parameters, and transform the result to the required shape.

In [9]:
# Mel power spectrogram of the fixed-length signal, converted to decibels.
mel_power = librosa.feature.melspectrogram(
    y = y_final,
    sr = 48000,
    n_fft = 1024,
    n_mels = 128,
    fmin = 50,
    fmax = 24000,
)
mel = librosa.power_to_db(mel_power)

# Apply the scaler stored on disk (loaded with joblib).
scaler = load(parameters_path)
mel = scaler.transform(mel)

# Add a trailing axis and a leading axis around the 2-D spectrogram,
# i.e. (rows, cols) -> (1, rows, cols, 1).
mel = mel[np.newaxis, :, :, np.newaxis]
mel.shape

(1, 128, 184, 1)

### Load models
#### Video

In [10]:
# Load the video model: the first entry (in sorted order) of the video models
# folder. os.listdir alone returns entries in arbitrary order, which would make
# the choice platform-dependent when the folder holds more than one model.
models_list = sorted(os.listdir(models_video_path))
model_video = keras.models.load_model(os.path.join(models_video_path, models_list[0]))
# model_video.summary()

#### Audio

In [11]:
# Load the audio model: the first entry (in sorted order) of the audio models
# folder. os.listdir alone returns entries in arbitrary order, which would make
# the choice platform-dependent when the folder holds more than one model.
models_list = sorted(os.listdir(models_audio_path))
model_audio = keras.models.load_model(os.path.join(models_audio_path, models_list[0]))
# model_audio.summary()

### Predictions
#### Video

In [12]:
# One prediction row per extracted frame; averaging over the first axis
# collapses them into a single score vector for the whole clip.
pred = model_video.predict(frames)
pred_video = pred.mean(axis=0)
pred_video



array([1.6348714e-04, 5.4474822e-05, 6.1063957e-03, 9.1054225e-01,
       1.1759001e-02, 1.0859572e-02, 6.0514912e-02], dtype=float32)

#### Audio

In [13]:
# The audio model receives a single spectrogram (batch of one); averaging over
# the first axis removes the batch dimension and yields one score vector.
pred = model_audio.predict(mel)
pred_audio = pred.mean(axis=0)
pred_audio



array([3.0805247e-06, 5.4812928e-05, 3.0281662e-03, 9.9640155e-01,
       1.4702437e-06, 5.1068712e-04, 2.3837696e-07], dtype=float32)

#### Global

In [14]:
# Late fusion: element-wise mean of the video and audio score vectors.
# (The previous code summed them while the comment said "mean"; the argmax is
# identical, but the mean keeps the scores on the same scale as the inputs.)
pred_global = (pred_video + pred_audio) / 2

In [15]:
# Report the winning class of each stream, of the fusion, and the ground truth.
report_rows = (
    ('Video prediction:\t', pred_video.argmax()),
    ('Audio prediction:\t', pred_audio.argmax()),
    ('Global prediction:\t', pred_global.argmax()),
    ('Ground truth:\t\t', label),
)
for caption, class_index in report_rows:
    print(caption, emotions[class_index])

Video prediction:	 fear
Audio prediction:	 fear
Global prediction:	 fear
Ground truth:		 fear


In [30]:
# Print Results
root = tk.Tk()

# root window title and dimension
root.title("Results")

canvas1 = tk.Canvas(root, bg="#EEDFCC", width=650, height=500)
canvas1.pack()

label1 = tk.Label(root, text=f'Video prediction:\t{emotions[pred_video.argmax()]}')
label2 = tk.Label(root, text=f'Audio prediction:\t{emotions[pred_audio.argmax()]}')
label3 = tk.Label(root, text=f'Fusion prediction:\t{emotions[pred_global.argmax()]}')
label4 = tk.Label(root, text=f'Actual Label:\t{emotions[label]}')

label1.config(font=('Times', 18), bg="#EEDFCC")
label2.config(font=('Times', 18), bg="#EEDFCC")
label3.config(font=('Times', 18), bg="#EEDFCC")
label4.config(font=('Times', 18, 'bold'), bg="#EEDFCC")

canvas1.create_window(325, 75, window=label1)
canvas1.create_window(325, 105, window=label2)
canvas1.create_window(325, 145, window=label3)
canvas1.create_window(325, 210, window=label4)

button1 = tk.Button(text='Close', command=lambda: root.destroy(), font=('helvetica', 12, 'bold'), relief="groove", bg="#CDAA7D")
canvas1.create_window(325, 300, window=button1)

img = ImageTk.PhotoImage(Image.open("./Other/UTH-logo-english.png").resize((120,120)))
label5 = tk.Label(root, image = img, bg="#EEDFCC")
canvas1.create_window(100, 430, window=label5)

label6 = tk.Label(root, text="Marios-Chrysostomos Askitis: 2760\nFreris Leonardos: 2696", bg="#EEDFCC")
label6.config(font=('Times', 11))
canvas1.create_window(325, 440, window=label6)

label7 = tk.Label(root, text="Autumn 2023", bg="#EEDFCC")
label7.config(font=('Times', 10))
canvas1.create_window(590, 470, window=label7)

root.mainloop()