In [1]:
# Mount Google Drive at /content/drive; the model files and the demo audio
# folder used further down are read from / written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the required libraries.
!pip install tensorflow opencv-contrib-python youtube-dl moviepy pydot
!pip install git+https://github.com/TahaAnwar/pafy.git#egg=pafy
!pip install opencv-python
!pip install mediapipe
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting youtube-dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.0 MB/s 
Installing collected packages: youtube-dl
Successfully installed youtube-dl-2021.12.17
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pafy
  Cloning https://github.com/TahaAnwar/pafy.git to /tmp/pip-install-cw4z05ks/pafy_54d2ac1ebffb448eb0eec2b4055b23f7
  Running command git clone -q https://github.com/TahaAnwar/pafy.git /tmp/pip-install-cw4z05ks/pafy_54d2ac1ebffb448eb0eec2b4055b23f7
Building wheels for collected packages: pafy
  Building wheel for pafy (setup.py) ... [?25l[?25hdone
  Created wheel for pafy: filename=pafy-0.5.5-py2.py3-none-any.whl size=35706 sha256=21504576475bd80cf9bac111d39af5eb504537236557f4b983bc928b0ac63c84
  Stored in directory: /tmp/pip-ephem-wheel-cache-

In [1]:
## Python
import os
import random
import sys

import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt

## Package
import glob 
import keras
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import gradio as gr
from moviepy.editor import VideoFileClip
import cv2
import mediapipe as mp
import joblib

In [2]:
# Extract facial landmarks to crop the face in each sampled frame and prepare the crops for testing
def extract_frames(video, seq_len=30):
  """Sample frames evenly from a clip and return 64x64 face crops.

  Args:
    video: clip object exposing `duration` and `get_frame(t)`
      (a moviepy `VideoFileClip` in this notebook).
    seq_len: number of evenly spaced timestamps to sample (default 30).

  Returns:
    A list of 64x64 images, one per detected face per sampled frame, in
    time order. Frames in which Face Mesh finds no face are skipped, so the
    list may hold fewer than `seq_len` items (or be empty).
  """
  frames = []
  duration = video.duration
  if not duration or seq_len <= 0:
    # Nothing to sample (also avoids dividing by a zero duration).
    return frames

  # Index-based timestamps give exactly `seq_len` samples; a float-step
  # np.arange can emit one extra sample because of rounding.
  times = [k * duration / seq_len for k in range(seq_len)]

  # Landmark indices whose coordinates delimit the crop box.
  top_idx, bottom_idx, left_idx, right_idx = 10, 152, 234, 454

  def clamp(value, upper):
    # Landmarks may fall outside the image; keep slice bounds inside it.
    return min(max(value, 0), upper)

  mp_face_mesh = mp.solutions.face_mesh
  # One Face Mesh instance for the whole clip, released on exit.
  # static_image_mode=True runs detection on every frame, as the previous
  # fresh-instance-per-frame code effectively did.
  with mp_face_mesh.FaceMesh(static_image_mode=True) as face_mesh:
    for t in times:
      # Channel order is swapped here and swapped back for Face Mesh below;
      # the crop is taken from `image` (the once-swapped array).
      # NOTE(review): presumably get_frame returns RGB, making `image` BGR — confirm.
      image = cv2.cvtColor(video.get_frame(t), cv2.COLOR_BGR2RGB)
      rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

      result = face_mesh.process(rgb_image)
      if not result.multi_face_landmarks:
        # multi_face_landmarks is None when no face is detected.
        continue

      height, width, _ = image.shape
      for facial_landmarks in result.multi_face_landmarks:
        points = facial_landmarks.landmark
        top = clamp(int(points[top_idx].y * height), height)
        bottom = clamp(int(points[bottom_idx].y * height), height)
        left = clamp(int(points[left_idx].x * width), width)
        right = clamp(int(points[right_idx].x * width), width)
        if bottom <= top or right <= left:
          # Degenerate box: cv2.resize would fail on an empty crop.
          continue

        cropped = image[top:bottom, left:right]
        # Resize the crop to the fixed 64x64 size.
        frames.append(cv2.resize(cropped, (64, 64)))

  return frames

In [3]:
# Module-level list for MFCC feature vectors; it lives for the whole kernel
# session, so anything appended to it persists across calls.
mfccs=[]
# Emotion labels; a prediction is mapped to a label by indexing this list
# with the argmax of the model output.
class_names = ['neutral','calm', 'happy','sad','angry', 'fearful' ]

# Audio emotion recognition
def audio_sys(audio):
  """Predict the emotion expressed in one audio file.

  Args:
    audio: audio source passed straight to `librosa.load`
      (a file path in this notebook).

  Returns:
    (predic_audio, audio_label): the model output for this single clip
    (one row) and the `class_names` entry at its argmax.
  """
  # Load the audio model.
  audio_model = keras.models.load_model('/content/drive/MyDrive/modelssaved/audio/audiotest (1).h5')
  # Load the audio signal.
  signal, rate = librosa.load(audio, res_type='kaiser_fast')
  # 40 MFCCs averaged over time -> one feature vector for the clip.
  mfcc_mean = np.mean(librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=40).T, axis=0)
  # Build a batch holding ONLY this clip. Accumulating features in a
  # module-level list made every call after the first re-score all earlier
  # clips and report the label of the first one.
  batch = np.expand_dims(np.asarray([mfcc_mean]), axis=2)
  # Predict the emotion.
  predic_audio = audio_model.predict(batch)
  audio_label = class_names[predic_audio.argmax(axis=1)[0]]
  return predic_audio, audio_label

def visual_sys(video):
  """Predict the emotion shown in a video clip from its face crops.

  Args:
    video: clip object forwarded to `extract_frames`.

  Returns:
    (predic_video, video_label): the model output for the clip and the
    `class_names` entry at its argmax.

  Raises:
    ValueError: if no face crop could be extracted from the clip.
  """
  seq_len = 30  # number of frames fed to the video model per clip
  frames = list(extract_frames(video))
  if not frames:
    raise ValueError("No face could be detected in the video; cannot predict an emotion.")
  if len(frames) < seq_len:
    # Some sampled frames had no detectable face. Repeat the last crop so
    # the model still receives a full-length sequence instead of failing
    # with an IndexError.
    frames.extend([frames[-1]] * (seq_len - len(frames)))
  totalframes = np.asarray(frames[:seq_len])

  video_model = keras.models.load_model('/content/drive/MyDrive/modelssaved/video/modelvideotest.h5')
  # Add a leading batch axis: the model is given a batch of one clip.
  predic_video = video_model.predict(np.expand_dims(totalframes, axis=0))
  video_label = class_names[predic_video.argmax(axis=1)[0]]
  return predic_video, video_label

def fusion_sys(audio_prediction,video_prediction):
  """Fuse the audio and video model outputs into one emotion label.

  The two prediction arrays become six-column frames ("OV_*" for video,
  "OU_*" for audio), are joined side by side (video columns first), and are
  passed to the fusion model loaded with joblib. The first predicted value
  is used as an index into `class_names`.
  """
  n_classes = 6
  video_columns = ["OV_" + str(k) for k in range(n_classes)]
  audio_columns = ["OU_" + str(k) for k in range(n_classes)]

  video_frame = pd.DataFrame(video_prediction, columns=video_columns)
  audio_frame = pd.DataFrame(audio_prediction, columns=audio_columns)
  fused_features = pd.concat([video_frame, audio_frame], axis=1, join='inner')

  mlp = joblib.load('/content/drive/My Drive/fusion/fusion_model/MLP/expert1')
  predicted_index = mlp.predict(fused_features)[0]
  return class_names[predicted_index]

def audio_visual_sys(webcam=None, file=None):
  """Run audio, video and fusion emotion recognition on one video.

  Args:
    webcam: path of a webcam recording, or None.
    file: path of an uploaded video, or None. Used only when `webcam`
      is None.

  Returns:
    (audio_label, video_label, final_predict): the labels from the audio
    model, the video model and the fusion model.

  Raises:
    ValueError: if neither input is given, or the video has no audio track.
  """
  video = webcam if webcam is not None else file
  if video is None:
    # Previously this fell through to an unbound local variable.
    raise ValueError("Please record a webcam video or upload a video file.")

  audio = "/content/drive/MyDrive/demo_audio/audio.wav"
  clip = VideoFileClip(video)
  try:
    if clip.audio is None:
      raise ValueError("The video has no audio track to analyse.")
    # The audio model reads from a file, so the track is written out first.
    clip.audio.write_audiofile(audio)
    audio_prediction, audio_label = audio_sys(audio)
    video_prediction, video_label = visual_sys(clip)
  finally:
    # Release the clip's reader resources even when a step above fails.
    clip.close()

  final_predict = fusion_sys(audio_prediction, video_prediction)
  return audio_label, video_label, final_predict

# Gradio front end: two optional video inputs (webcam recording or uploaded
# file, both delivered as file paths) and three text outputs, one per label
# returned by audio_visual_sys.
demo = gr.Interface(
    fn=audio_visual_sys,
    inputs=[
        gr.Video(source=video_source, type="filepath", optional=True)
        for video_source in ("webcam", "upload")
    ],
    outputs=[
        gr.outputs.Textbox(label=box_label)
        for box_label in (
            "Audio Output Class",
            "Video Output Class",
            "fusion Output Class",
        )
    ],
    title="Audio Visual Emotion Recognition System",
)

# debug=True keeps this cell running so errors and logs stay visible;
# inline=True renders the app inside the notebook.
demo.launch(inline=True, debug=True)
#YAF_haze_neutral.wav

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://49581.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 56/56 [00:00<00:00, 1199.95it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 53/53 [00:00<00:00, 1412.47it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 80/80 [00:00<00:00, 1390.58it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 80/80 [00:00<00:00, 1385.28it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 80/80 [00:00<00:00, 1334.19it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 80/80 [00:00<00:00, 1179.86it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 53/53 [00:00<00:00, 1543.37it/s]

[MoviePy] Done.





[MoviePy] Writing audio in /content/drive/MyDrive/demo_audio/audio.wav


100%|██████████| 80/80 [00:00<00:00, 1523.44it/s]

[MoviePy] Done.





Keyboard interruption in main thread... closing server.


(<gradio.routes.App at 0x7f139306e950>,
 'http://127.0.0.1:7860/',
 'https://49581.gradio.app')