# Q2. Extract the MFCC features with 40 components from those 500 audios and build simple logistic regression. Comment on the model output performance(Try to adjust the MFCC components).

In [7]:
import pandas as pd
import numpy as np
import soundfile
import librosa
import os, glob, pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [8]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-08-01-01-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-01-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-07-02-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-07-01-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-02-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-06-02-02-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-04-01-02-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-01-01-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-02-01-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-03-01-01-01-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-07-01-02-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_02/03-01-05-01-01-02-02.wav
/kaggle/input/ravdess-emotional-speech-audio/Actor_0

Extracting Features

In [9]:
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft=np.abs(librosa.stft(X))
    result=np.array([])
    mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    result=np.hstack((result, mfccs))
    #chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    #result=np.hstack((result, chroma))
    #mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate,n_mels=128).T,axis=0)
    #result=np.hstack((result, mel))
    return result

Dictionary of emotions

In [10]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

def gender(g):
    if int(g[0:2]) % 2 == 0:
        return 'female'
    else:
        return 'male'

Function to load the data

In [11]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in tqdm(glob.glob("../input/ravdess-emotional-speech-audio/Actor_*/*.wav")):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]] + '_' + gender(file_name.split("-")[-1])
        feature=extract_feature(file)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=1)

Splitting data into train and test

In [12]:
X_train, X_val, y_train, y_val = load_data()

100%|██████████| 1440/1440 [08:15<00:00,  2.91it/s]


Shapes of the entities

In [13]:
print((X_train.shape[0], X_val.shape[0]))

(1152, 288)


Scaling the data

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

Features extracted

In [15]:
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 40


Model Selection

In [16]:
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression( max_iter = 500)
clf.fit(X_train,y_train)
print(clf.score(X_train, y_train))

0.7265625


Make predictions on validation set

In [17]:
y_pred=clf.predict(X_val)
print(clf.score(X_val, y_val))
mfcc_pred_acc=clf.score(X_val, y_val)

0.5868055555555556


## As the training accuracy is much higher than prediction accuracy it is evident that model is overfitting. 

Comparing side-by-side with actual

In [18]:
df=pd.DataFrame({'Actual': y_val, 'Predicted':y_pred})
df

Unnamed: 0,Actual,Predicted
0,fearful_female,sad_female
1,happy_male,happy_male
2,fearful_male,fearful_male
3,surprised_female,surprised_female
4,angry_female,angry_female
...,...,...
283,fearful_male,disgust_male
284,angry_female,angry_female
285,happy_male,happy_male
286,happy_female,disgust_female


# Q3. Extract the mel spectrogram features with 128 bands from those 500 audios and build a simple logistic regression. Comment on the model output performance.

In [19]:
def extract_feature_mel(file_name):
    X, sample_rate = librosa.load(file_name)
    stft=np.abs(librosa.stft(X))
    result=np.array([])
    #mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    #result=np.hstack((result, mfccs))
    #chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    #result=np.hstack((result, chroma))
    mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate,n_mels=128).T,axis=0)
    result=np.hstack((result, mel))
    return result

In [20]:
def load_data_mel(test_size=0.3):
    x,y=[],[]
    for file in tqdm(glob.glob("../input/ravdess-emotional-speech-audio/Actor_*/*.wav")):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]] + '_' + gender(file_name.split("-")[-1])
        feature=extract_feature_mel(file)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=1)

In [21]:
X_train, X_val, y_train, y_val = load_data_mel()

  0.000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
  3.7270675e-07  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
  1.5924206e-07  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
 -3.9743118e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  if __name__ == "__main__":
  9.2822061e-09  0.0000000e+00] as keyword args. From version 0.10 passing these as positional argum

In [22]:
print((X_train.shape[0], X_val.shape[0]))

(1008, 432)


In [23]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [24]:
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 128


In [25]:
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression( max_iter = 500)
clf.fit(X_train,y_train)
print(clf.score(X_train, y_train))

0.49107142857142855


In [26]:
y_pred=clf.predict(X_val)
print(clf.score(X_val, y_val))
mel_pred_acc=clf.score(X_val, y_val)

0.3125


## ## As the training accuracy is much higher than prediction accuracy it is evident that model is overfitting. 

In [27]:
df=pd.DataFrame({'Actual': y_val, 'Predicted':y_pred})
df

Unnamed: 0,Actual,Predicted
0,fearful_female,calm_female
1,happy_male,calm_female
2,fearful_male,disgust_male
3,surprised_female,surprised_female
4,angry_female,angry_female
...,...,...
427,sad_female,calm_female
428,surprised_female,surprised_female
429,fearful_female,surprised_female
430,angry_female,sad_female


In [29]:
print(f"\n Prediction Accuracy of MFCC:{mfcc_pred_acc} \n Prediction Accuracy of Mel 128 Bands: {mel_pred_acc}")


 Prediction Accuracy of MFCC:0.5868055555555556 
 Prediction Accuracy of Mel 128 Bands: 0.3125


# Q4. Compare model results of MFCC and Mel Spectrogram

**Looking at the above prediction accuracies, it is clearly evident that MFCC outperforms the mel spectrogram with 128 bands.
**MFCC is a combination of short term power spectrum of any sound so clarity leads to a better perfomance when compared to Mel spectrogram.**
**The MFCC is a bit more decorrelarated, which can be beneficial with linear models like Gaussian Mixture Models.**