# Emotion Recognition from Video

In [None]:
# Optional one-time environment setup; uncomment the lines below to run them.
# The first line pins torch / torchvision / numpy; the other two download and
# install the spaCy `en_vectors_web_lg` 2.1.0 vectors package.
# !pip install torch==1.2.0 torchvision==0.4.0   numpy==1.18.1 #if necessary
# !wget https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz -O en_vectors_web_lg-2.1.0.tar.gz
# !pip install -q en_vectors_web_lg-2.1.0.tar.gz

In [None]:
# Work from the project checkout so that the relative paths used below and the
# local `utils` / `model_LA` imports resolve.
# NOTE(review): hardcoded Google Drive location -- presumably needs Drive to be
# mounted in Colab first; adjust when running elsewhere.
%cd /content/drive/MyDrive/projects/mosei_umons

# Pick up edits to imported project modules without restarting the kernel.
%reload_ext autoreload
%autoreload 2

/content/drive/MyDrive/projects/mosei_umons


In [None]:
import re
import glob
import pickle
import os
import torch
import numpy as np
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import tokenize, create_dict, sent_to_ix, cmumosei_2, cmumosei_7, pad_feature
from model_LA import Model_LA

# Run on the first CUDA device when one is available, otherwise on the CPU.
# (torch is already imported with the other imports at the top of this cell.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Root folder that checkpoint, video and transcript paths are joined onto.
working_dir = "."
# Location of the CMU-MOSEI data, relative to the project folder.
data_dir = "data/CMU_MOSEI"

In [None]:
# load model

ckpts_path = os.path.join(working_dir, 'ckpt')
model_name = "Model_LA_e"
# Listing sorted checkpoints: reverse lexicographic order of the file names, so
# ckpts[0] is the 'best*' file whose name sorts last.
# NOTE(review): presumably the name encodes the score so that this is the best
# checkpoint -- confirm against the training script's naming scheme.
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name,'best*')), reverse=True)
if not ckpts:
    raise FileNotFoundError(
        "No 'best*' checkpoint found in " + os.path.join(ckpts_path, model_name)
    )

# Deserialize the checkpoint once; it holds both the training args and the weights.
checkpoint = torch.load(ckpts[0], map_location=device)

# Load original args
args = compute_args(checkpoint['args'])
pretrained_emb = np.load("train_glove.npy")
# NOTE: pickle.load can execute arbitrary code -- only load a trusted token_to_ix.pkl.
with open("token_to_ix.pkl", "rb") as token_file:
    token_to_ix = pickle.load(token_file)
state_dict = checkpoint['state_dict']

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
# Left as the last expression so the notebook shows the key-matching report.
net.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Bundled sample clip and its transcript file, both resolved against working_dir.
video_path, transcript_path = (
    os.path.join(working_dir, relative_path)
    for relative_path in (
        'data/video/03bSnISJMiM_1.mp4',
        'data/transcripts/03bSnISJMiM_1.txt',
    )
)
# No inline transcript by default; the preprocessing cell reads transcript_path
# whenever this is still None.
transcript = None

### Record video

In [None]:
from IPython.display import display, Javascript,HTML
from google.colab.output import eval_js
from base64 import b64decode

def record_video(filename):
  """Record a webcam + microphone clip in the browser and save it to `filename`.

  Injects a small recorder UI into the Colab output: a live (muted) preview, a
  "Start Recording" button and, once recording, a "Stop Recording" button. When
  recording stops, the clip is sent back to Python base64-encoded and written
  to disk unchanged.

  Args:
    filename: Path of the file to write. The bytes are whatever container the
      browser's MediaRecorder produced (VP9 WebM when that type is supported),
      regardless of the extension given here.

  Returns:
    None. On success a confirmation line is printed; on any failure the error
    message is printed instead of being raised.
  """
  js=Javascript("""
    async function recordVideo() {
      const options = { mimeType: "video/webm; codecs=vp9" };
      const div = document.createElement('div');
      const capture = document.createElement('button');
      const stopCapture = document.createElement("button");

      capture.textContent = "Start Recording";
      capture.style.background = "orange";
      capture.style.color = "white";

      stopCapture.textContent = "Stop Recording";
      stopCapture.style.background = "red";
      stopCapture.style.color = "white";
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';

      const stream = await navigator.mediaDevices.getUserMedia({audio:true, video: true});

      // Use the browser's default recording format when VP9 WebM is unavailable.
      const recorder = MediaRecorder.isTypeSupported(options.mimeType)
        ? new MediaRecorder(stream, options)
        : new MediaRecorder(stream);
      document.body.appendChild(div);
      div.appendChild(video);

      video.srcObject = stream;
      video.muted = true;

      await video.play();

      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      await new Promise((resolve) => {
        capture.onclick = resolve;
      });
      recorder.start();
      capture.replaceWith(stopCapture);

      await new Promise((resolve) => stopCapture.onclick = resolve);
      // Attach the handler before stop() so the final chunk cannot be missed.
      const recDataPromise = new Promise((resolve) => recorder.ondataavailable = resolve);
      recorder.stop();
      const recData = await recDataPromise;
      const arrBuff = await recData.data.arrayBuffer();

      // release the camera AND the microphone, then remove the recorder UI
      stream.getTracks().forEach((track) => track.stop());
      div.remove();

      let binaryString = "";
      const bytes = new Uint8Array(arrBuff);
      bytes.forEach((byte) => {
        binaryString += String.fromCharCode(byte);
      })
    return btoa(binaryString);
    }
  """)
  try:
    display(js)
    # Blocks until the user has pressed "Stop Recording" in the output area.
    data=eval_js('recordVideo()')
    binary=b64decode(data)
    with open(filename,"wb") as video_file:
      video_file.write(binary)
    print(f"Finished recording video at:{filename}")
  except Exception as err:
    # Best effort: report the problem (e.g. camera permission denied) without
    # interrupting the notebook.
    print(str(err))

In [None]:
# Record a clip and make it -- rather than the bundled sample -- the input of the
# preview and preprocessing cells below.
recorded_path = "test.mp4"
record_video(recorded_path)
# record_video only prints on failure, so check that a file is really there
# before pointing the pipeline at it; otherwise keep the sample clip and its
# transcript file. (A file left over from an earlier run also counts.)
if os.path.exists(recorded_path):
    video_path = recorded_path
    transcript = "oh my god that is amazing!" # what you said in video

<IPython.core.display.Javascript object>

Finished recording video at:test.mp4


### Preview video

In [None]:
from IPython.display import HTML
from base64 import b64encode
# Embed the clip as a base64 data URL so it can be played inline in the output.
# The file is read inside a `with` block so the handle is closed right away.
with open(video_path,'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

## Data preprocessing

In [None]:
# text
def clean(w):
    """Normalise one word of the transcript.

    Lower-cases `w`, deletes the punctuation characters . , ' ! ? " ( ) * # : ;
    and then turns every '-' and '/' into a single space.

    Returns the normalised string, which is '' when `w` consisted only of
    deleted punctuation.
    """
    lowered = w.lower()
    normalised = re.sub(r"([.,'!?\"()*#:;])", '', lowered)
    for separator in ('-', '/'):
        normalised = normalised.replace(separator, ' ')
    return normalised

# Use the inline transcript when one was given; otherwise read the transcript
# file (closing the handle as soon as it has been read).
if transcript is None:
    with open(transcript_path, 'r') as transcript_file:
        text = transcript_file.read()
else:
    text = transcript
# Clean every whitespace-separated word once and drop those that clean to ''.
s = [token for token in map(clean, text.split()) if token != '']

# Sound
_, mel, mag = load_spectrograms(video_path)

# Fixed sequence lengths taken from the args stored in the checkpoint.
l_max_len = args.lang_seq_len
a_max_len = args.audio_seq_len
v_max_len = args.video_seq_len
L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
A = pad_feature(mel, a_max_len)
# NOTE(review): the "video" input is built from the mel spectrogram as well, not
# from visual features -- presumably a placeholder because this model does not
# use the third input; confirm against Model_LA.
V = pad_feature(mel, v_max_len)
# print shapes
print("Processed text shape: ", L.shape)
print("Processed audio shape: ", A.shape)
print("Processed video shape: ", V.shape)



Processed text shape:  (60,)
Processed audio shape:  (60, 80)
Processed video shape:  (60, 80)


## Prediction

In [None]:
# Put the network in evaluation mode (same effect as net.train(False)).
net.eval()
# Add a leading batch axis of size 1 to each preprocessed input.
x = np.expand_dims(L,axis=0)
y = np.expand_dims(A,axis=0)
z = np.expand_dims(V,axis=0)
x, y, z = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device), torch.from_numpy(z).float().to(device)
# Inference only: skip autograd bookkeeping, which also lets the result be
# converted to numpy directly instead of going through the legacy `.data`.
with torch.no_grad():
    pred = net(x, y, z).cpu().numpy()
print(pred)

[[ 1.0882487 -2.0272958 -2.84339   -2.2552867 -3.6238117 -4.0526347]]


In [None]:
# Emotion names, paired positionally with the entries of pred[0].
# NOTE(review): this order must match the label order used at training time --
# confirm against the dataset code.
label_to_ix = ['happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
# An emotion is reported as present when its score is greater than 0.
is_present = pred[0] > 0
result_dict = {label: flag for label, flag in zip(label_to_ix, is_present)}
result_dict

{'angry': False,
 'disgust': False,
 'fear': False,
 'happy': True,
 'sad': False,
 'surprise': False}