<a href="https://colab.research.google.com/github/JigarJoshi04/Speech_Emotion_Extractor/blob/master/Emotion_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install soundfile

Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1


In [0]:
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split

In [0]:
int2emotion = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}


In [0]:
AVAILABLE_EMOTIONS = {
    "angry",
    "sad",
    "neutral",
    "happy"
}

In [0]:
def extract_feature(file_name, **kwargs):
    """
    Extract feature from audio file `file_name`
        Features supported:
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result


In [0]:
def load_data(test_size=0.2):
    X, y = [], []
    for file in glob.glob("/content/drive/My Drive/Ravdess database/Actor_*/*.wav"):
        print(file)
        # get the base name of the audio file
        if(file == "/content/drive/My Drive/Ravdess database/Actor_02/03-01-01-01-01-01-02.wav"):
          break
        basename = os.path.basename(file)
        # get the emotion label
        emotion = int2emotion[basename.split("-")[2]]
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to data
        X.append(features)
        y.append(emotion)
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [0]:
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
import pickle

In [21]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-01-01-02-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-03-01-01-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-04-01-01-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-03-01-02-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-01-02-02-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-01-01-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-02-01-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-03-02-02-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-03-01-02-02-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-03-01-01-02-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-01-01-01-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-01-02-01-01.wav
/content/drive/My Drive/Ravdess database/Actor_01/03-01-02-02-01-02-01.wav
/content/drive/My Drive/R

In [22]:
print("[+] Number of training samples:", X_train.shape[0])
print("[+] Number of testing samples:", X_test.shape[0])
print("[+] Number of features:", X_train.shape[1])

[+] Number of training samples: 46
[+] Number of testing samples: 16
[+] Number of features: 180


In [0]:
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08, 
    'hidden_layer_sizes': (300,), 
    'learning_rate': 'adaptive', 
    'max_iter': 500, 
}

In [0]:
model = MLPClassifier(**model_params)

In [25]:
print("[*] Training the model...")
model.fit(X_train, y_train)

[*] Training the model...




MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [0]:
y_pred = model.predict(X_test)

In [0]:
def my_load_data(test_size=1):
    X, y = [], []
    # for file in glob.glob("/content/drive/My Drive/Ravdess database/Actor_*/*.wav"):
    file = "/content/drive/My Drive/Ravdess database/Actor_08/03-01-01-01-01-01-08.wav"
    print(file)
        # # get the base name of the audio file
        # if(file == "/content/drive/My Drive/Ravdess database/Actor_02/03-01-01-01-01-01-02.wav"):
        #   break
    basename = os.path.basename(file)
        # # get the emotion label
    emotion = int2emotion[basename.split("-")[2]]
        # we allow only AVAILABLE_EMOTIONS we set
    # if emotion not in AVAILABLE_EMOTIONS:
    #     continue
    # extract speech features
    features = extract_feature(file, mfcc=True, chroma=True, mel=True)
    # add to data
    X.append(features)
    y.append(emotion)
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [91]:
my_test_feature = []
my_test_label =[]
file = "/content/drive/My Drive/Ravdess database/Actor_08/03-01-01-01-01-01-08.wav"
print(file)
basename = os.path.basename(file)
# # get the emotion label
emotion = int2emotion[basename.split("-")[2]]
features = extract_feature(file, mfcc=True, chroma=True, mel=True)
my_test_feature.append(features)
my_test_label.append(emotion)
print(my_test_label)

/content/drive/My Drive/Ravdess database/Actor_08/03-01-01-01-01-01-08.wav
['neutral']


In [92]:
my_y_pred =model.predict(my_test_feature)
print(my_y_pred)

['neutral']


In [89]:
my_X_test, my_y_test = my_load_data(test_size=0.9)

/content/drive/My Drive/Ravdess database/Actor_08/03-01-01-01-01-01-08.wav


ValueError: ignored

In [27]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 81.25%


In [0]:
if not os.path.isdir("result"):
    os.mkdir("result")

pickle.dump(model, open("result/mlp_classifier.model", "wb"))

In [34]:
# !apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
pip install pyaudio

Collecting pyaudio
  Using cached https://files.pythonhosted.org/packages/ab/42/b4f04721c5c5bfc196ce156b3c768998ef8c0ae3654ed29ea5020c749a6b/PyAudio-0.2.11.tar.gz
Building wheels for collected packages: pyaudio
  Building wheel for pyaudio (setup.py) ... [?25l[?25hdone
  Created wheel for pyaudio: filename=PyAudio-0.2.11-cp36-cp36m-linux_x86_64.whl size=51621 sha256=d39d0436e006ca3b05c82f824c361bf7d4d9a7d78de518d88a6ef82b50c9b8d4
  Stored in directory: /root/.cache/pip/wheels/f4/a8/a4/292214166c2917890f85b2f72a8e5f13e1ffa527c4200dcede
Successfully built pyaudio
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.11


In [0]:
import pyaudio
import wave
from sys import byteorder
from array import array
from struct import pack
from sklearn.neural_network import MLPClassifier


In [0]:
THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 16000
SILENCE = 30

In [0]:
def is_silent(snd_data):
    "Returns 'True' if below the 'silent' threshold"
    return max(snd_data) < THRESHOLD

In [0]:
def normalize(snd_data):
    "Average the volume out"
    MAXIMUM = 16384
    times = float(MAXIMUM)/max(abs(i) for i in snd_data)

    r = array('h')
    for i in snd_data:
        r.append(int(i*times))
    return r

In [0]:
def trim(snd_data):
    "Trim the blank spots at the start and end"
    def _trim(snd_data):
        snd_started = False
        r = array('h')

        for i in snd_data:
            if not snd_started and abs(i)>THRESHOLD:
                snd_started = True
                r.append(i)

            elif snd_started:
                r.append(i)
        return r

    # Trim to the left
    snd_data = _trim(snd_data)

    # Trim to the right
    snd_data.reverse()
    snd_data = _trim(snd_data)
    snd_data.reverse()
    return snd_data

In [0]:
def add_silence(snd_data, seconds):
    "Add silence to the start and end of 'snd_data' of length 'seconds' (float)"
    r = array('h', [0 for i in range(int(seconds*RATE))])
    r.extend(snd_data)
    r.extend([0 for i in range(int(seconds*RATE))])
    return r

In [0]:
def record():
    """
    Record a word or words from the microphone and 
    return the data as an array of signed shorts.
    Normalizes the audio, trims silence from the 
    start and end, and pads with 0.5 seconds of 
    blank sound to make sure VLC et al can play 
    it without getting chopped off.
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
        input=True, output=True,
        frames_per_buffer=CHUNK_SIZE)

    num_silent = 0
    snd_started = False

    r = array('h')

    while 1:
        # little endian, signed short
        snd_data = array('h', stream.read(CHUNK_SIZE))
        if byteorder == 'big':
            snd_data.byteswap()
        r.extend(snd_data)

        silent = is_silent(snd_data)

        if silent and snd_started:
            num_silent += 1
        elif not silent and not snd_started:
            snd_started = True

        if snd_started and num_silent > SILENCE:
            break

    sample_width = p.get_sample_size(FORMAT)
    stream.stop_stream()
    stream.close()
    p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r

In [0]:
def record_to_file(path):
    "Records from the microphone and outputs the resulting data to 'path'"
    sample_width, data = record()
    data = pack('<' + ('h'*len(data)), *data)

    wf = wave.open(path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()

In [44]:
pip install speechrecognition

Collecting speechrecognition
[?25l  Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)
[K     |████████████████████████████████| 32.8MB 93kB/s 
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.8.1


In [46]:
import speech_recognition as sr
print("hey")
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print("hey")
    print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))
    print("i'M OKAY")

hey


In [86]:
pip install pyalsaaudio

Collecting pyalsaaudio
[?25l  Downloading https://files.pythonhosted.org/packages/52/b6/44871791929d9d7e11325af0b7be711388dfeeab17147988f044a41a6d83/pyalsaaudio-0.8.4.tar.gz (315kB)
[K     |████████████████████████████████| 317kB 2.7MB/s 
[?25hBuilding wheels for collected packages: pyalsaaudio
  Building wheel for pyalsaaudio (setup.py) ... [?25l[?25hdone
  Created wheel for pyalsaaudio: filename=pyalsaaudio-0.8.4-cp36-cp36m-linux_x86_64.whl size=57826 sha256=6210b5ef48c5c54f7ed1ea62cef394c54ed34126cbb7a71071125aa1b605c496
  Stored in directory: /root/.cache/pip/wheels/90/74/b8/362c8d8e9fefe2fc2a31881c272053eccae6fd22ecd461f672
Successfully built pyalsaaudio
Installing collected packages: pyalsaaudio
Successfully installed pyalsaaudio-0.8.4


In [0]:
from ctypes import *
from contextlib import contextmanager
import pyaudio

ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p)

def py_error_handler(filename, line, function, err, fmt):
    pass

c_error_handler = ERROR_HANDLER_FUNC(py_error_handler)

@contextmanager
def noalsaerr():
    asound = cdll.LoadLibrary('libasound.so')
    asound.snd_lib_error_set_handler(c_error_handler)
    yield
    asound.snd_lib_error_set_handler(None)

In [50]:
pip install playsound pyaudio pydub ffmpeg-python

Collecting playsound
  Downloading https://files.pythonhosted.org/packages/f5/16/10d897b0a83fb4b05b03a63d7a2667ab75f857f67f7062fd447dd3f49bf7/playsound-1.2.2-py2.py3-none-any.whl
Collecting pydub
  Downloading https://files.pythonhosted.org/packages/79/db/eaf620b73a1eec3c8c6f8f5b0b236a50f9da88ad57802154b7ba7664d0b8/pydub-0.23.1-py2.py3-none-any.whl
Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: playsound, pydub, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 playsound-1.2.2 pydub-0.23.1


In [67]:
!pip install -q https://github.com/pyannote/pyannote-audio/tarball/develop

  Building wheel for pyannote.audio (setup.py) ... [?25l[?25hdone
[31mERROR: chainer 6.5.0 has requirement typing-extensions<=3.6.6, but you'll have typing-extensions 3.7.4.2 which is incompatible.[0m


In [66]:
pip install typing-extensions==3.6.6

Collecting typing-extensions==3.6.6
  Downloading https://files.pythonhosted.org/packages/62/4f/392a1fa2873e646f5990eb6f956e662d8a235ab474450c72487745f67276/typing_extensions-3.6.6-py3-none-any.whl
[31mERROR: pyannote-core 3.7.1 has requirement typing-extensions>=3.7.4.1, but you'll have typing-extensions 3.6.6 which is incompatible.[0m
[31mERROR: pyannote-audio 0+unknown has requirement typing-extensions>=3.7.4, but you'll have typing-extensions 3.6.6 which is incompatible.[0m
Installing collected packages: typing-extensions
  Found existing installation: typing-extensions 3.7.4.2
    Uninstalling typing-extensions-3.7.4.2:
      Successfully uninstalled typing-extensions-3.7.4.2
Successfully installed typing-extensions-3.6.6


In [74]:
from pyannote.audio.features import RawAudio
from IPython.display import Audio
from scipy.io import wavfile
import wave
from scipy.io import wavfile
from pyannote.core import Segment, notebook
if __name__ == "__main__":
    
    
    # make notebook visualization zoom on 600s < t < 660s time range
    EXCERPT = Segment(0, 5)
    # load the saved model (after training)
    model = pickle.load(open("result/mlp_classifier.model", "rb"))
    print("Please talk")
    filename = "test.wav"
    print("Recording...")
    audio, sr = get_audio()
    

# load audio waveform, crop excerpt, and play it
    DEMO_FILE = {'uri': 'output.wav', 'audio': audio}
    waveform = RawAudio(sample_rate=16000).crop(DEMO_FILE,EXCERPT)
    # Audio(data=waveform.squeeze(), rate=16000, autoplay=True)
    demo_new = wavfile.write('result.wav',16000,waveform)
    Audio(data=waveform.squeeze,filename ='result.wav',rate=16000,autoplay=True)
    print(demo_new)
    
# THRESHOLD = 500
# CHUNK_SIZE = 1024
# FORMAT = pyaudio.paInt16
# RATE = 16000
# SILENCE = 30
    # channels =1
    # record_seconds =5
    # with noalsaerr():
    #   p= pyaudio.PyAudio()
    
    # print(p.get_default_input_device_info())
    # print(p.get_device_count())
    # stream = p.open(format=FORMAT,
    #             channels=channels,
    #             rate=RATE,
    #             input=True,
    #             output=True,
    #             frames_per_buffer=CHUNK_SIZE)
    # stream.read()
    # frames = []
    # print("Recording...")
    # for i in range(int(44100 / chunk * record_seconds)):
    #   data = stream.read(chunk)
    #   # if you want to hear your voice while recording
    #   # stream.write(data)
    #   frames.append(data)
    # print("Finished recording.")
    # # stop and close stream
    # stream.stop_stream()
    # stream.close()
    # # terminate pyaudio object
    # p.terminate()
    # # save audio file
    # # open the file in 'write bytes' mode
    # wf = wave.open(filename, "wb")
    # # set the channels
    # wf.setnchannels(channels)
    # # set the sample format
    # wf.setsampwidth(p.get_sample_size(FORMAT))
    # # set the sample rate
    # wf.setframerate(sample_rate)
    # # write the frames as bytes
    # wf.writeframes(b"".join(frames))
    # # close the file
    # wf.close()














    # # record the file (start talking)
    # record_to_file(filename)
    # # extract features and reshape it
    # features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
    # # predict
    # result = model.predict(features)[0]
    # # show the result !
    # print("result:", result)

Please talk
Recording...


TypeError: ignored

In [57]:
!pip install ffmpeg-python



In [0]:
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  binary = b64decode(data.split(',')[1])
  
  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  
  riff_chunk_size = len(output) - 8
  # Break up the chunk size into four bytes, held in b.
  q = riff_chunk_size
  b = []
  for i in range(4):
      q, r = divmod(q, 256)
      b.append(r)

  # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
  riff = output[:4] + bytes(b) + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [59]:
audio, sr = get_audio()