# **Source code to detect emotion from an audio clip**
Course Project for ME781 \\
Submitted by : Team 8 \\
### Team Members: Priyanka, Kartikey, Jasleen, Jigmat, Neha

Submitted on : 29-11-2022

In [None]:
# Import Libraries
import glob
import os

import numpy as np
import soundfile
import librosa

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

!pip install soundfile

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install gradio

In [None]:
# Mount Google Drive so that files stored there are reachable from this runtime
from google.colab import drive

# force_remount=True asks for a fresh mount even if the drive is already mounted
drive.mount(mountpoint='/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Extract features from audio file
# Features considered here: MFCC, Chroma, MEL Spectrogram Frequency, Contrast, Tonnetz
# These features are the dataset for machine learning model

def extract_feature(file_name, **kwargs):
    """Summarise one audio file as a flat feature vector.

    Every requested feature is computed frame by frame with librosa, averaged
    over time, and the per-feature means are concatenated in a fixed order:
    MFCC, chroma, mel spectrogram, spectral contrast, tonnetz.

    Args:
        file_name: Audio file to read; passed straight to ``soundfile.SoundFile``.
        **kwargs: Boolean switches ``mfcc``, ``chroma``, ``mel``, ``contrast``
            and ``tonnetz``. A feature is included only when its switch is
            truthy; a missing switch counts as off.

    Returns:
        1-D ``numpy.ndarray`` with the concatenated time-averaged features
        (empty when no switch is enabled).
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    # Read what we need and release the file handle before the slow feature
    # extraction starts.
    with soundfile.SoundFile(file_name) as audio_clip:
        signal = audio_clip.read(dtype="float32")
        sample_rate = audio_clip.samplerate

    # soundfile returns multi-channel audio as (frames, channels), which the
    # librosa calls below would misread as many 2-sample signals. Down-mix to
    # mono so stereo recordings/uploads yield the same feature layout.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # The magnitude STFT is only consumed by chroma and spectral contrast.
    sound_fourier = None
    if use_chroma or use_contrast:
        sound_fourier = np.abs(librosa.stft(signal))

    # Seeding with an empty float64 array keeps the result dtype (and the
    # empty-result case) the same as stacking onto `np.array([])` step by step.
    pieces = [np.array([])]

    if use_mfcc:
        pieces.append(np.mean(librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40).T, axis=0))
    if use_chroma:
        pieces.append(np.mean(librosa.feature.chroma_stft(S=sound_fourier, sr=sample_rate).T, axis=0))
    if use_mel:
        # The audio must be passed as `y=`: librosa 0.10 made it keyword-only,
        # so the positional form raises a TypeError there.
        pieces.append(np.mean(librosa.feature.melspectrogram(y=signal, sr=sample_rate).T, axis=0))
    if use_contrast:
        pieces.append(np.mean(librosa.feature.spectral_contrast(S=sound_fourier, sr=sample_rate).T, axis=0))
    if use_tonnetz:
        pieces.append(np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(signal), sr=sample_rate).T, axis=0))

    return np.hstack(pieces)

In [None]:
# Emotion lookup table: two-digit emotion code -> label, covering all emotions
# in the RAVDESS dataset. The code is the third dash-separated field of a
# clip's file name (see load_data).
emotion_directory = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Subset of emotions considered for the present model; clips carrying any
# other label are skipped when the dataset is loaded.
emotion_allow = {
    "neutral",
    "happy",
    "sad",
    "angry",
}

# Function to load entire data and split into datasets for training and testing the model
def load_data(test_size=0.25, data_glob="/content/gdrive/MyDrive/ME781_Project/Dataset/Actor_*/*.wav"):
    """Build the feature matrix from the audio clips and split it for training/testing.

    The emotion label of a clip is looked up in ``emotion_directory`` using the
    third dash-separated field of its file name; clips whose emotion is not in
    ``emotion_allow`` are skipped.

    Args:
        test_size: Forwarded to ``train_test_split``.
        data_glob: Glob pattern selecting the ``.wav`` files to load. Defaults
            to the project's dataset folder on the mounted Google Drive.

    Returns:
        The ``train_test_split`` result for features and labels, i.e.
        ``X_train, X_test, y_train, y_test`` (split seeded with
        ``random_state=7``).

    Raises:
        ValueError: If no clip could be loaded (wrong path, drive not mounted,
            or every file failed), instead of an opaque error from the split.
    """
    X, y = [], []

    # Sorted so the sample order -- and therefore the seeded split -- does not
    # depend on the order in which the filesystem happens to list the files.
    for file in sorted(glob.glob(data_glob)):
        clipname = os.path.basename(file)  # get audio file name

        try:
            emotion = emotion_directory[clipname.split("-")[2]]  # get label of emotion
        except (IndexError, KeyError):
            # A file name that does not follow the expected pattern must not
            # abort the whole load; report it and move on.
            print('FAIL', file, '- cannot read emotion code from file name')
            continue

        if emotion not in emotion_allow:
            continue

        # Only clips with an allowed emotion reach this point.
        try:
            features = extract_feature(file, mfcc=True, chroma=True, mel=True, contrast=True, tonnetz=True)
        except Exception as error:
            # One unreadable clip used to silently drop every remaining file;
            # now only the offending clip is skipped, and it is named.
            print('FAIL', file, '-', error)
            continue

        X.append(features)
        y.append(emotion)

    if not X:
        raise ValueError(
            "No usable audio clips found for pattern %r - check that the drive "
            "is mounted and the dataset path is correct." % (data_glob,)
        )

    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)  # return training and testing data

In [None]:
# Load the dataset and hold out 25% of the clips for testing
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

# Axis 0 of each matrix indexes samples, axis 1 indexes features
feature_count = X_train.shape[1]
train_count = X_train.shape[0]
test_count = X_test.shape[0]

print("Number of features:", feature_count)
print("Number of training samples = ", train_count)
print("Number of testing samples = ", test_count)


Number of features: 193
Number of training samples =  1003
Number of testing samples =  335


Random Forest

In [None]:
# Random forest of 120 trees; random_state is fixed so reruns build the same forest.
# `classifier` is reused later by the Gradio app, so the name must stay.
classifier = RandomForestClassifier(n_estimators=120, random_state=0)
classifier.fit(X_train, y_train)          # train on the training split
Prediction = classifier.predict(X_test)   # predict labels for the test split

# Assess the predictions against the true test labels
var_accuracy = accuracy_score(y_true=y_test, y_pred=Prediction)
var_confusion = confusion_matrix(y_test, Prediction)

print("Accuracy Score: ", var_accuracy)
print("Confusion Matrix: ", var_confusion, sep="\n")
print("Classification Report:", classification_report(y_test, Prediction), sep="\n")


Accuracy Score:  0.7850746268656716
Confusion Matrix: 
[[84 16  0  4]
 [13 78  0  5]
 [ 0  3 32 15]
 [ 3 11  2 69]]
Classification Report:
              precision    recall  f1-score   support

       angry       0.84      0.81      0.82       104
       happy       0.72      0.81      0.76        96
     neutral       0.94      0.64      0.76        50
         sad       0.74      0.81      0.78        85

    accuracy                           0.79       335
   macro avg       0.81      0.77      0.78       335
weighted avg       0.80      0.79      0.79       335



Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C = 1
fitted_model = SVC(kernel='linear', C=1)
fitted_model.fit(X_train, y_train)          # train on the training split
Prediction = fitted_model.predict(X_test)   # predict labels for the test split

# Assess the predictions against the true test labels
var_accuracy = accuracy_score(y_true=y_test, y_pred=Prediction)
var_confusion = confusion_matrix(y_test, Prediction)

print("Accuracy Score: ", var_accuracy)
print("Confusion Matrix: ", var_confusion, sep="\n")
print("Classification Report:", classification_report(y_test, Prediction), sep="\n")

Accuracy Score:  0.7373134328358208
Confusion Matrix: 
[[83 15  1  5]
 [ 9 79  3  5]
 [ 3  6 24 17]
 [ 4 11  9 61]]
Classification Report:
              precision    recall  f1-score   support

       angry       0.84      0.80      0.82       104
       happy       0.71      0.82      0.76        96
     neutral       0.65      0.48      0.55        50
         sad       0.69      0.72      0.71        85

    accuracy                           0.74       335
   macro avg       0.72      0.70      0.71       335
weighted avg       0.74      0.74      0.73       335



Decision Tree based Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree limited to depth 7; random_state is fixed for repeatable runs
fitted_model = DecisionTreeClassifier(max_depth=7, random_state=0)
fitted_model.fit(X_train, y_train)          # train on the training split
Prediction = fitted_model.predict(X_test)   # predict labels for the test split

# Assess the predictions against the true test labels
var_accuracy = accuracy_score(y_true=y_test, y_pred=Prediction)
var_confusion = confusion_matrix(y_test, Prediction)

print("Accuracy Score: ", var_accuracy)
print("Confusion Matrix: ", var_confusion, sep="\n")
print("Classification Report:", classification_report(y_test, Prediction), sep="\n")

Accuracy Score:  0.6208955223880597
Confusion Matrix: 
[[76 19  3  6]
 [24 55  6 11]
 [ 1  4 26 19]
 [11  8 15 51]]
Classification Report:
              precision    recall  f1-score   support

       angry       0.68      0.73      0.70       104
       happy       0.64      0.57      0.60        96
     neutral       0.52      0.52      0.52        50
         sad       0.59      0.60      0.59        85

    accuracy                           0.62       335
   macro avg       0.61      0.61      0.61       335
weighted avg       0.62      0.62      0.62       335



# Deployed App using Gradio

The best-performing algorithm, the Random Forest classifier, was used to deploy the trained model for classifying any input audio clip.

In [None]:
import gradio as gr
import wave
import matplotlib.pyplot as plt

def emotion_predict(input):
  """Classify the emotion of an audio clip with the trained random forest.

  Args:
    input: Path of the audio file to classify. It is empty (None) when the
      button is clicked before any audio has been recorded or uploaded.
      The parameter name is kept as-is for compatibility with callers.

  Returns:
    A display string for the predicted emotion (label plus emoji), or a
    short hint when no audio was provided.
  """
  # Without this guard an empty submission crashes while opening the file.
  if not input:
    return 'No audio received. Please record or upload a clip first.'

  input_features = extract_feature(input, mfcc=True, chroma=True, mel=True, contrast=True, tonnetz=True)

  # predict() returns one label per row; exactly one row is passed in, so take
  # that single label instead of comparing the whole array to a string.
  rf_prediction = classifier.predict(input_features.reshape(1, -1))[0]

  emotion_display = {
    'happy': 'Happy 😎',
    'neutral': 'Neutral 😐',
    'sad': 'Sad 😢',
    'angry': 'Angry 😤',
  }
  # A label outside the table is shown as-is rather than being reported as
  # "Angry", which would be wrong if the model is trained on more emotions.
  return emotion_display.get(rf_prediction, str(rf_prediction).capitalize())
  

def plot_fig(input):
  """Draw the waveform of a PCM .wav file.

  Args:
    input: Path of the .wav file to plot. It is empty (None) when the button
      is clicked before any audio has been submitted.

  Returns:
    A matplotlib Figure showing amplitude against time in seconds, or None
    when no audio was provided.

  Raises:
    ValueError: If the file's sample width is not 1, 2 or 4 bytes.
  """
  if not input:
    return None

  # Local import: a Figure built directly (not through plt.figure) is not
  # registered in pyplot's global figure list, so it can be garbage-collected
  # after rendering instead of piling up on every button click.
  from matplotlib.figure import Figure

  # Sample width in bytes -> numpy dtype (8-bit WAV data is unsigned).
  sample_types = {1: np.uint8, 2: np.int16, 4: np.int32}

  # The context manager closes the file even if reading fails.
  with wave.open(input, 'rb') as wav:
    sampleRate = wav.getframerate()
    channel_count = wav.getnchannels()
    sample_width = wav.getsampwidth()
    raw = wav.readframes(wav.getnframes())

  if sample_width not in sample_types:
    raise ValueError("Unsupported sample width: %d bytes" % sample_width)

  samples = np.frombuffer(raw, dtype=sample_types[sample_width])

  if channel_count > 1:
    # Multi-channel frames are interleaved; plotting them as one stream would
    # stretch the time axis by the channel count. Average the channels.
    usable = len(samples) - len(samples) % channel_count
    samples = samples[:usable].reshape(-1, channel_count).mean(axis=1)

  Time = np.linspace(0, len(samples) / sampleRate, num=len(samples))

  # The size is set on the figure itself, so it applies from the first call
  # on and no global matplotlib settings are modified.
  fig = Figure(figsize=(50, 15))
  axes = fig.subplots()
  axes.set_title("Waveform Of the Audio", fontsize=25)
  axes.tick_params(axis='both', labelsize=15)
  axes.set_xlabel("Time (s)", fontsize=25)
  axes.set_ylabel("Amplitude", fontsize=25)
  axes.plot(Time, samples, color='red')

  return fig


# Build the Gradio UI: a title/description followed by two tabs (microphone
# recording and file upload). Each tab has an audio input, a collapsible
# waveform-visualization panel, and a button that shows the detected emotion.
with gr.Blocks() as app:
  gr.Markdown(
        """
    # Speech Emotion Detector 🎵😍
    This application classifies inputted audio 🔊 according to the verbal emotion into four categories:
    1. Happy 😎
    2. Neutral 😐
    3. Sad 😢
    4. Angry 😤
    """
  )
  # Tab 1: audio recorded from the microphone
  with gr.Tab("Record Audio"):
    # The value of this component is what the click handlers below receive
    # as their `input` argument.
    record_input = gr.Audio(source="microphone", type="filepath")
        
    # Collapsed by default (open=False)
    with gr.Accordion("Audio Visualization", open=False):
      gr.Markdown(
          """
      ### Visualization will work only after Audio has been submitted
      """
      )    
      plot_record = gr.Button("Display Audio Signal")
      plot_record_c = gr.Plot(label='Waveform Of the Audio')
    
    record_button = gr.Button("Detect Emotion")
    record_output = gr.Text(label = 'Emotion Detected')

  # Tab 2: audio supplied as an uploaded file
  with gr.Tab("Upload Audio File"):
    gr.Markdown(
        """
    ## Uploaded Audio should be of .wav format
    """
    )

    upload_input = gr.Audio(type="filepath")

    # Same visualization panel as in the recording tab
    with gr.Accordion("Audio Visualization", open=False):
      gr.Markdown(
          """
      ### Visualization will work only after Audio has been submitted
      """
      )
      plot_upload = gr.Button("Display Audio Signal")
      plot_upload_c = gr.Plot(label='Waveform Of the Audio')

    upload_button = gr.Button("Detect Emotion")
    upload_output = gr.Text(label = 'Emotion Detected')
    
  # Wire the buttons to their callbacks. Both tabs share the same two
  # functions; only the input and output components differ.
  record_button.click(emotion_predict, inputs=record_input, outputs=record_output)
  upload_button.click(emotion_predict, inputs=upload_input, outputs=upload_output)
  plot_record.click(plot_fig, inputs=record_input, outputs=plot_record_c)
  plot_upload.click(plot_fig, inputs=upload_input, outputs=plot_upload_c)

# Start serving the app defined above
app.launch()

### The same app was also deployed to a permanent hosting site at this link:

https://huggingface.co/spaces/workspace/Speech-Emotion-Detector