<a href="https://colab.research.google.com/github/Jyothika2406/speech-emotion-Recognition/blob/main/Speech_Emotion_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import librosa
import soundfile
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical



In [None]:
import os
from google.colab import drive

# The dataset lives on Google Drive, so Drive has to be mounted before the
# directory below exists. Mounting here keeps the notebook runnable from top
# to bottom on a fresh runtime; calling mount again later is harmless (Colab
# just reports that the drive is already mounted).
drive.mount('/content/drive')

# Root folder of the RAVDESS audio files (one Actor_XX sub-folder per speaker).
Root = "/content/drive/MyDrive/speech-emotion-recognition-ravdess-data"
os.chdir(Root)

In [None]:
# Mount Google Drive at /content/drive so the files stored there are reachable
# through the normal file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the current working directory (set by os.chdir(Root) above). A bare
# `ls` is not Python: it only works through IPython's automagic.
ls

[0m[01;34mActor_01[0m/  [01;34mActor_04[0m/  [01;34mActor_07[0m/  [01;34mActor_10[0m/  [01;34mActor_13[0m/  [01;34mActor_16[0m/  [01;34mActor_19[0m/  [01;34mActor_22[0m/
[01;34mActor_02[0m/  [01;34mActor_05[0m/  [01;34mActor_08[0m/  [01;34mActor_11[0m/  [01;34mActor_14[0m/  [01;34mActor_17[0m/  [01;34mActor_20[0m/  [01;34mActor_23[0m/
[01;34mActor_03[0m/  [01;34mActor_06[0m/  [01;34mActor_09[0m/  [01;34mActor_12[0m/  [01;34mActor_15[0m/  [01;34mActor_18[0m/  [01;34mActor_21[0m/  [01;34mActor_24[0m/


In [None]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
#Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    """Build a 1-D feature vector for a single audio file.

    Every enabled feature family is computed with librosa, averaged over
    time, and appended to the result in the fixed order
    MFCC -> chroma -> mel.

    Args:
        file_name: Path of an audio file that ``soundfile`` can open.
        mfcc: If truthy, append the time-averaged MFCCs (``n_mfcc=40``).
        chroma: If truthy, append the time-averaged chromagram computed
            from the magnitude STFT of the signal.
        mel: If truthy, append the time-averaged mel spectrogram.

    Returns:
        A 1-D float64 ``numpy.ndarray`` with the concatenated features. It is
        empty when all three flags are falsy.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns a (frames, channels) array for multi-channel files,
    # while the librosa calls below expect a mono signal. Average the
    # channels so stereo recordings yield the same feature layout as mono.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # Collect the per-family vectors under their own names instead of
    # rebinding the boolean flag parameters to arrays.
    parts = []
    if mfcc:
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(coefficients.T, axis=0))
    if chroma:
        # The STFT magnitude is only needed for the chromagram.
        magnitude = np.abs(librosa.stft(signal))
        chromagram = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
        parts.append(np.mean(chromagram.T, axis=0))
    if mel:
        mel_spectrogram = librosa.feature.melspectrogram(y=signal, sr=sample_rate)
        parts.append(np.mean(mel_spectrogram.T, axis=0))

    # The leading empty float64 array keeps the result float64 and makes the
    # "no feature requested" case return an empty vector.
    return np.hstack([np.array([])] + parts)

In [None]:
# RAVDESS emotion codes: a two-digit code ('01'..'08') mapped to its label.
# load_data reads the code from the third dash-separated field of a file name.
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'],
        start=1,
    )
}

# Subset of emotions kept for training and evaluation; files carrying any
# other label are skipped by load_data.
observed_emotions = [
    'calm',
    'happy',
    'fearful',
    'disgust',
]

In [None]:
#Load the data and extract features for each sound file
def load_data(test_size=0.2, data_glob="/content/drive/MyDrive/speech-emotion-recognition-ravdess-data/Actor_*/*.wav"):
    """Extract features for every observed-emotion file and split them.

    Args:
        test_size: Passed straight to ``train_test_split``.
        data_glob: Glob pattern selecting the audio files. The default is the
            notebook's Google Drive dataset location.

    Returns:
        Whatever ``train_test_split`` returns for the feature matrix and the
        label list, i.e. ``x_train, x_test, y_train, y_test``.

    Raises:
        ValueError: If no matching file carries one of ``observed_emotions``
            (for example because Drive is not mounted).
        KeyError: If a file name's emotion code is not a key of ``emotions``.
        IndexError: If a file name has fewer than three dash-separated fields.
    """
    x, y = [], []
    # glob yields matches in arbitrary, file-system dependent order. Sorting
    # makes the seeded split below reproducible across runs and machines.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        # The third dash-separated field of the file name is the emotion code.
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)
    if not x:
        # Fail early with an actionable message instead of an opaque error
        # from the splitter.
        raise ValueError(
            "No audio files with an observed emotion matched {!r}".format(data_glob)
        )
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [None]:
# Extract features for the whole dataset and hold out 25% of it for testing
x_train, x_test, y_train, y_test = load_data(test_size=0.25)

In [None]:
# Peek at the training feature matrix (one row per audio file)
x_train

array([[-6.55448364e+02,  5.97599335e+01,  1.32214022e+01, ...,
         2.52298167e-04,  2.12290252e-04,  1.47128798e-04],
       [-5.73134216e+02,  3.18485775e+01, -8.54380703e+00, ...,
         1.03158131e-03,  6.40605518e-04,  2.01104325e-04],
       [-6.30929138e+02,  4.73286591e+01, -1.07452383e+01, ...,
         7.66765152e-06,  6.40912731e-06,  4.30359569e-06],
       ...,
       [-5.54125488e+02,  4.68293228e+01, -6.71562970e-01, ...,
         6.37523553e-05,  2.96597063e-05,  1.21428966e-05],
       [-6.46940125e+02,  4.78219986e+01,  2.81606340e+00, ...,
         1.75995883e-05,  1.19097103e-05,  4.78700076e-06],
       [-5.53571289e+02,  3.47704163e+01, -2.40211463e+00, ...,
         6.17825834e-04,  3.78832250e-04,  1.81637995e-04]])

In [None]:
# Report how many samples ended up in the training and the test split
n_train, n_test = len(x_train), len(x_test)
print((n_train, n_test))

(576, 192)


In [None]:
# Length of one feature vector (number of columns of the feature matrix)
n_features = x_train.shape[1]
print(f'Features extracted: {n_features}')

Features extracted: 180


In [None]:
#Initialize the Multi Layer Perceptron Classifier
# random_state pins the weight initialisation and batch shuffling so that a
# rerun trains the same model; without it the accuracy, F1 and prediction
# cells below cannot be reproduced. The seed matches the one used for the
# train/test split in load_data.
# NOTE(review): per the scikit-learn docs, learning_rate only takes effect
# with solver='sgd', and no solver is set here - confirm the intent.
model = MLPClassifier(
    alpha=0.001,
    batch_size=128,
    epsilon=1e-08,
    hidden_layer_sizes=(500, 300),
    activation='tanh',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=9,
)

In [None]:
# Fit the classifier on the training features and their emotion labels
model.fit(x_train, y_train)

In [None]:
# Predict an emotion label for every held-out test sample
y_pred = model.predict(x_test)

In [None]:
# Show the predicted labels for the test set
y_pred

array(['fearful', 'fearful', 'calm', 'fearful', 'fearful', 'calm',
       'fearful', 'fearful', 'disgust', 'happy', 'happy', 'fearful',
       'calm', 'happy', 'calm', 'disgust', 'calm', 'happy', 'calm',
       'fearful', 'calm', 'fearful', 'disgust', 'disgust', 'fearful',
       'calm', 'happy', 'calm', 'disgust', 'happy', 'fearful', 'fearful',
       'happy', 'calm', 'happy', 'disgust', 'happy', 'calm', 'calm',
       'disgust', 'fearful', 'happy', 'calm', 'happy', 'fearful', 'calm',
       'disgust', 'disgust', 'calm', 'fearful', 'fearful', 'fearful',
       'disgust', 'happy', 'fearful', 'calm', 'happy', 'calm', 'happy',
       'disgust', 'fearful', 'calm', 'happy', 'disgust', 'happy', 'calm',
       'calm', 'fearful', 'disgust', 'disgust', 'happy', 'calm', 'calm',
       'fearful', 'happy', 'happy', 'calm', 'calm', 'disgust', 'calm',
       'disgust', 'disgust', 'fearful', 'calm', 'disgust', 'happy',
       'disgust', 'disgust', 'fearful', 'disgust', 'calm', 'fearful',
       'fea

In [None]:
# Fraction of test samples whose predicted emotion equals the true label
accuracy = accuracy_score(y_test, y_pred)

# Report it as a percentage with two decimals
print("Accuracy: {:.2f}%".format(100 * accuracy))

Accuracy: 79.17%


In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Per-class F1: average=None returns one score per emotion instead of a
# single aggregate value.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

array([0.74242424, 0.36666667, 0.69306931, 0.59340659])

In [None]:
import pandas as pd

# Put true and predicted labels side by side to inspect individual mistakes
df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
df.head(20)

Unnamed: 0,Actual,Predicted
0,fearful,happy
1,fearful,happy
2,calm,calm
3,happy,fearful
4,disgust,fearful
5,calm,calm
6,fearful,fearful
7,happy,calm
8,disgust,disgust
9,calm,calm


In [None]:
import pickle

# Persist the trained classifier to disk so it can be reloaded later
# without retraining.
with open('ModelForPrediction.sav', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
filename = 'ModelForPrediction.sav'

# Load the persisted classifier. The context manager closes the file handle,
# which a bare pickle.load(open(...)) leaves open.
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# NOTE(review): the third field of this file name is emotion code 01
# ('neutral'), which is not in observed_emotions, so the model was never
# trained on this class and cannot predict it correctly. Pick a file with an
# observed emotion for a meaningful smoke test.
feature = extract_feature("/content/drive/MyDrive/speech-emotion-recognition-ravdess-data/Actor_01/03-01-01-01-01-01-01.wav", mfcc=True, chroma=True, mel=True)

# predict expects a 2-D (n_samples, n_features) input; wrap the single vector.
feature = feature.reshape(1, -1)

prediction = loaded_model.predict(feature)
prediction

array(['fearful'], dtype='<U7')