In [1]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd  # To play sound in the notebook
import pyaudio
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from matplotlib.animation import FuncAnimation
from keras.models import load_model
import noisereduce as nr
import math
import os, sys
import random
from threading import Thread

# Plotting configuration for the whole notebook.
# Qt5Agg is a GUI backend, so figures open in their own window rather than
# inline; the live animation below relies on an interactive backend.
matplotlib.use('Qt5Agg')
plt.style.use('dark_background')
# Same two settings as plt.rc('figure', titlesize=16) / plt.rc('axes', labelsize=12),
# applied after the style sheet so they override it.
plt.rcParams.update({
    'figure.titlesize': 16,
    'axes.labelsize': 12,
})

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [4]:
class emotion_detector:
    """Classify speaker gender from a microphone stream or an audio file.

    A worker thread produces audio segments and runs the Keras model on them,
    while a matplotlib ``FuncAnimation`` on the calling thread redraws a bar
    (current label) and the waveform of the most recent segment.

    Despite the class name, the prediction path visible here is binary:
    the first model output is rounded and mapped through ``self.genders``.

    Parameters
    ----------
    model_path : str
        Path handed to ``keras.models.load_model``.
    sample_rate : int, default 22050
        Sample rate used to open the microphone stream.
    threshold : float, default 122
        Cut-off used by :meth:`silence` (see that method for what is measured).
    """

    # Legacy label list, not referenced anywhere in this class:
    # ['angry', 'calm', 'disgust', 'fear', 'happy', 'sad', 'surprise']
    def __init__(self, model_path, sample_rate=22050, threshold=122):
        self.MODEL = load_model(model_path)
        self.genders = {0: 'Female', 1: 'Male'}
        self.THRESHOLD = threshold
        self.FORMAT = pyaudio.paFloat32
        self.CHANNELS = 1
        self.RATE = sample_rate
        self.sr = self.RATE
        self.CHUNK = 1024 * 4
        self.COLORS = ['lightpink', 'green']
        self.gender = "Silence"
        self.predictions = [0]
        # Every segment is padded (or cut) to this many samples before
        # feature extraction.
        self.LENGTH = 121212
        self.AUDIO = pyaudio.PyAudio()
        self.FRAME_LENGTH = 2048
        self.HOP_LENGTH = 512
        self.RECORD_SECONDS = 2.6
        # Placeholder waveform shown until the first real segment arrives.
        # The sample count is RECORD_SECONDS * sr; np.arange(RECORD_SECONDS)
        # on its own would yield only three samples.
        placeholder_samples = int(self.RECORD_SECONDS * self.sr)
        self.audio = (np.sin(np.pi * np.arange(placeholder_samples) * 54 /
                             self.sr)).astype(np.float32)

        self.SCALER = StandardScaler()
        self.FIG, self.AXES = plt.subplots(1,
                                           2,
                                           figsize=(14, 8),
                                           tight_layout=True)
        # Defined up front so stop_stream() is safe even when no stream or
        # thread was ever created (e.g. file mode, or closing the window
        # before start_stream()).
        self.STREAM = None
        self.main_thread = None
        self.stop_flag = False
        # Closing the figure window ends the recording loop.
        self.FIG.canvas.mpl_connect('close_event', self.stop_stream)
        self.gen_color = {
            'Female': 'lightpink',
            'Male': 'green',
            'Silence': 'white'
        }

    def __analyser(self, frame):
        """Animation callback: redraw the label bar and the latest waveform.

        ``frame`` is the frame counter supplied by ``FuncAnimation``; it is
        not used.
        """
        label = self.gender
        colour = self.gen_color[label]
        bar_ax, wave_ax = self.AXES[0], self.AXES[1]

        bar_ax.clear()
        wave_ax.clear()

        bar_ax.set_ylim(0, 1)
        wave_ax.set_ylim(-1, 1)

        # Title this detector's own figure; plt.suptitle would act on whatever
        # figure happens to be current, which may be a different one.
        self.FIG.suptitle("\n\n\n" + label, va='center', fontweight="bold")

        bar_ax.bar(label, 1, color=colour)

        librosa.display.waveshow(y=self.audio,
                                 sr=self.sr,
                                 ax=wave_ax,
                                 color=colour)

    def __extract_features(self, audio, sr):
        """Turn one audio segment into the model's input tensor.

        Steps: peak-normalise, trim leading/trailing silence (30 dB), force
        the length to exactly ``self.LENGTH`` samples, denoise, then compute
        RMS energy, mel spectrogram and 40 MFCCs. Each feature matrix is
        standardised independently and the three are joined on the feature
        axis.

        Returns
        -------
        numpy.ndarray
            Array with a leading batch axis of size 1; the last axis holds the
            RMS, mel and MFCC columns side by side.
        """
        normalized = librosa.util.normalize(audio)

        # Trim silence from the beginning and the end.
        trimmed, _ = librosa.effects.trim(y=normalized, top_db=30)

        # Cut overlong input first: np.pad rejects a negative pad width.
        trimmed = trimmed[:self.LENGTH]
        fixed_length = np.pad(trimmed, (0, self.LENGTH - len(trimmed)),
                              'constant')

        # Noise reduction, using the sample rate of this segment rather than
        # the microphone rate.
        denoised = nr.reduce_noise(y=fixed_length, sr=sr)

        # Energy - Root Mean Square
        rms = librosa.feature.rms(y=denoised,
                                  frame_length=self.FRAME_LENGTH,
                                  hop_length=self.HOP_LENGTH).T

        mel = librosa.feature.melspectrogram(y=denoised,
                                             sr=sr,
                                             n_fft=self.FRAME_LENGTH,
                                             hop_length=self.HOP_LENGTH).T

        # MFCC
        mfcc = librosa.feature.mfcc(y=denoised,
                                    sr=sr,
                                    n_mfcc=40,
                                    hop_length=self.HOP_LENGTH).T

        # The scaler is re-fitted per feature matrix, i.e. each segment is
        # standardised against its own statistics.
        scaled = [
            self.SCALER.fit_transform(matrix)[np.newaxis, ...]
            for matrix in (rms, mel, mfcc)
        ]

        # Concatenate along the feature axis (order: rms, mel, mfcc).
        return np.concatenate(scaled, axis=2)

    def __gender(self, audio_features):
        """Run the model on one feature tensor.

        Returns
        -------
        tuple
            ``(class_index, label)`` where ``class_index`` is the rounded
            first model output and ``label`` comes from ``self.genders``.
        """
        # use_multiprocessing only applies to generator/Sequence inputs, so it
        # is not passed for an in-memory array.
        predictions = self.MODEL.predict(audio_features)
        print(predictions)

        prediction = int(predictions.round()[0][0])
        print(prediction)

        return prediction, self.genders[prediction]

    def list_devices(self):
        """Print the input devices of host API 0 and read a device id.

        Returns
        -------
        int
            The id typed by the user on standard input.
        """
        print("----------------------record device list---------------------")
        info = self.AUDIO.get_host_api_info_by_index(0)
        numdevices = info.get('deviceCount')
        for i in range(0, numdevices):
            device = self.AUDIO.get_device_info_by_host_api_device_index(0, i)
            # Only devices that can record are listed.
            if device.get('maxInputChannels') > 0:
                print("Input Device id ", i, " - ", device.get('name'))

        print("-------------------------------------------------------------")

        index = int(input())
        return index

    def silence(self, audio):
        """Return True when the mean of ``audio`` is below ``self.THRESHOLD``.

        The mean is taken over the elements of ``audio`` as given and is
        printed for calibration. The live loop passes the raw recorded bytes
        (not decoded float samples), so in that path this is the mean byte
        value. NOTE(review): the default threshold of 122 appears tuned to
        that byte-level statistic; confirm before changing either.

        Empty input is treated as silence instead of dividing by zero.
        """
        if len(audio) == 0:
            return True
        level = sum(audio) / len(audio)
        print(level)
        return level < self.THRESHOLD

    def __classify_segment(self, segment, sr):
        """Publish ``segment`` for plotting and update the current prediction."""
        self.audio = segment
        features = self.__extract_features(segment, sr)
        self.predictions, self.gender = self.__gender(features)

    def __start(self, file=None, device_index=None):
        """Worker-thread body: classify a file, or record and classify live.

        File mode splits the decoded audio into consecutive pieces of at most
        ``self.LENGTH`` samples (the last piece is the remainder) and
        classifies each. Live mode records ``RECORD_SECONDS`` at a time from
        ``device_index`` until ``self.stop_flag`` is set.
        """
        if file is not None:
            try:
                audio_data, self.sr = librosa.load(file)
                ipd.display(ipd.Audio(data=audio_data, rate=self.sr))
                # Non-overlapping windows; the final window is whatever is
                # left over and is never empty.
                for offset in range(0, len(audio_data), self.LENGTH):
                    self.__classify_segment(
                        audio_data[offset:offset + self.LENGTH], self.sr)
                    print(self.predictions, self.gender)

            except Exception as e:
                # Best effort: report the problem and let the thread finish.
                print(e)

        else:
            if device_index is None:
                print("Missing Device Index Or File !")
                # Ends only this worker thread, as sys.exit() would here.
                return
            print("recording via index " + str(device_index))

            self.STREAM = self.AUDIO.open(format=self.FORMAT,
                                          channels=self.CHANNELS,
                                          rate=self.RATE,
                                          input=True,
                                          input_device_index=device_index,
                                          frames_per_buffer=self.CHUNK)
            self.sr = self.RATE
            # Number of CHUNK-sized reads that make up one recording window.
            n = int(self.RATE / self.CHUNK * self.RECORD_SECONDS)
            print(n)
            while not self.stop_flag:
                recorded = [
                    self.STREAM.read(self.CHUNK, exception_on_overflow=False)
                    for _ in range(n)
                ]
                self.audio = np.frombuffer(b''.join(recorded),
                                           dtype=np.float32)
                # Only the last four chunks decide whether the window counts
                # as silent.
                if self.silence(b''.join(recorded[-4:])):
                    self.gender = "Silence"
                else:
                    self.__classify_segment(self.audio, self.RATE)

        print("Main Thread Terminated !")

    def start_stream(self, file=None, device_index=None):
        """Start the worker thread and show the live figure.

        Pass ``file`` to classify an audio file, or ``device_index`` to record
        from a microphone. ``plt.show()`` is called after the animation is
        created.
        """
        print("Stream Started !")
        self.main_thread = Thread(target=self.__start,
                                  args=(file, device_index))
        self.main_thread.start()
        self.anim = FuncAnimation(fig=self.FIG,
                                  func=self.__analyser,
                                  interval=1)
        plt.show()

    def stop_stream(self, event=None):
        """Stop recording and release audio resources.

        Also used as the figure's ``close_event`` handler, hence the unused
        ``event`` argument. Safe to call when no worker thread or microphone
        stream exists (file mode, or before ``start_stream``).
        """
        self.stop_flag = True

        worker = getattr(self, 'main_thread', None)
        if worker is not None and worker.is_alive():
            worker.join()

        stream = getattr(self, 'STREAM', None)
        if stream is not None:
            stream.stop_stream()
            stream.close()
            self.STREAM = None

        self.AUDIO.terminate()
        print("Stream Stoped !")

In [5]:
# Live demo: load the gender classifier, let the user type an input-device id
# (list_devices() blocks on stdin), then record and classify until the figure
# window is closed, which triggers stop_stream().
ed = emotion_detector('Models/Gender_Classifier.hdf5')
device_index = ed.list_devices()
ed.start_stream(device_index=device_index)

----------------------record device list---------------------
Input Device id  0  -  MacBook Pro Microphone
Input Device id  2  -  Kishan’s AirPods Pro 🦁
Input Device id  4  -  Microsoft Teams Audio
-------------------------------------------------------------


 2


Stream Started !
recording via index 2
13
67.68107604980469


Exception ignored in: <function WeakMethod.__new__.<locals>._cb at 0x17fabe820>
Traceback (most recent call last):
  File "/Users/kishan/opt/miniforge3/lib/python3.9/weakref.py", line 58, in _cb
    if self._alive:
AttributeError: 'NoneType' object has no attribute '_alive'


123.36097717285156


2022-08-03 20:46:02.768512: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-03 20:46:02.838977: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-03 20:46:03.188466: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[[4.503583e-05]]
0
123.63334655761719
[[3.936874e-05]]
0
123.60038757324219
[[3.5700294e-05]]
0
124.08818054199219
[[0.9999902]]
1
123.91012573242188
[[0.99999297]]
1
123.651123046875
[[0.99998736]]
1
123.93876647949219
[[7.356672e-05]]
0
124.43962097167969
[[5.1164145e-05]]
0
124.26443481445312
[[0.9999788]]
1
123.59437561035156
[[4.0985557e-05]]
0
123.82223510742188
[[6.8277586e-05]]
0
124.0076904296875
[[0.99999166]]
1
124.01226806640625
[[0.99998856]]
1
124.09262084960938
[[0.99999154]]
1
124.28144836425781
[[0.9999815]]
1
123.86201477050781
[[0.99999416]]
1
123.89878845214844
[[0.99999]]
1
124.01156616210938
[[0.9999807]]
1
125.19181823730469
[[0.00137052]]
0
124.42427062988281
[[4.2331376e-05]]
0
124.76402282714844
[[0.9999869]]
1
125.18119812011719
[[0.9999747]]
1
124.34933471679688
[[0.9999881]]
1
123.77301025390625
[[0.00011504]]
0
124.24180603027344
[[0.99998915]]
1
Main Thread Terminated !
Stream Stoped !


In [31]:
RAVDESS = "Data/RAVDESS/audio_speech_actors_01-24/"
TESS = "Data/TESS/"


def list_audio_files(root, extensions=('.wav',)):
    """Return the sorted paths of all audio files found under ``root``.

    Walks ``root`` recursively and keeps only regular files whose name ends
    with one of ``extensions`` (case-insensitive). Directories, hidden files
    such as ``.DS_Store`` and other non-audio entries are skipped, so every
    returned path can be handed straight to an audio loader. Sorting makes
    the result independent of filesystem listing order.

    A missing ``root`` yields an empty list and a printed warning, so the
    cell still runs on a machine without the dataset.
    """
    if not os.path.isdir(root):
        print("Warning: dataset directory not found: " + root)
        return []

    wanted = tuple(ext.lower() for ext in extensions)
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            if name.startswith('.'):
                continue
            if name.lower().endswith(wanted):
                found.append(os.path.join(dirpath, name))
    return sorted(found)


# TESS files first, then RAVDESS (one sub-folder per actor).
datafiles = list_audio_files(TESS) + list_audio_files(RAVDESS)

In [32]:
# Creates a separate 1x2 figure.
# NOTE(review): `fig` and `ax` are not referenced by any later cell visible in
# this notebook, and emotion_detector builds its own figure in __init__ —
# confirm whether this cell is still needed.
fig, ax = plt.subplots(1, 2, figsize=(14, 8))

In [33]:
# Rebinds `ed` to a detector built from a different model file.
# NOTE(review): the class defined in this notebook maps the rounded first model
# output to Female/Male, whereas the output recorded under the next cell shows
# seven values and the label "Surprise" — presumably produced by another
# version of the class; confirm which definition this model expects.
ed = emotion_detector('Models/SER.hdf5')

In [43]:
# File demo: pick one dataset file at random, show its path, and classify it.
# No random seed is set, so a re-run selects a different file.
file = random.choice(datafiles)
print(file)
ed.start_stream(file=file)

Data/TESS/YAF_wire_ps.wav
Stream Started !




[1.19470767e-09 2.49837882e-07 1.92914385e-07 3.56488476e-06
 7.52380083e-06 7.50631557e-09 9.99988437e-01] Surprise
Main Thread Terminated !


In [16]:
# Close the current matplotlib figure.
plt.close()