<a href="https://colab.research.google.com/github/Iasonaspg/ser-repo/blob/main/SER_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!ls  # Check if required cuda 9.0 amd64-deb file is downloaded
!dpkg -i cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!ls /var/cuda-repo-9-0-local | grep .pub
!apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
!apt-get update
!sudo apt-get install cuda-9.0

In [18]:
# Install thundersvm, whose SVC is imported by the grid-search cell further down
!pip install thundersvm



In [1]:
# Mount Google Drive under /content/drive; the dataset directory used by the later
# os.chdir calls lives below this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!python -m pip install -U pip
!pip uninstall librosa
!pip install librosa

import numpy as np
import scipy
import librosa
import matplotlib.pyplot as plt
from IPython.display import Audio, display

In [None]:
# Show which librosa version the notebook is running against
print(librosa.__version__)

0.8.0


In [4]:
import os
# Work from the dataset root: the wav loaders below build paths relative to the
# current directory. The listing shows one folder per emotion.
os.chdir('/content/drive/My Drive/Emotions/')
!ls 

anger						      fear
Br_CSV						      feature_test_vector.csv
cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb    feature_train_vector.csv
cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb.1  happiness
disgust						      sadness


In [5]:
# load audio files

from pathlib import Path

# define STFT global parameters
fs = 22050
n_fft, hop = 1024, 256

# Functions require you to be in the root data directory
def _load_wav_dir(dir_name, split, fs):
  """Load every ./<dir_name>/<split>/*.wav file with sampling rate `fs`.

  Returns a list holding the first element returned by librosa.load for each file.
  """
  # NOTE(review): glob order is filesystem-dependent. The feature cells further down
  # append precomputed brightness columns to these signals frame by frame, so the
  # order presumably has to match the one used when those CSV files were produced
  # — confirm before sorting or otherwise reordering.
  pattern = './' + dir_name + '/' + split + '/*.wav'
  # sr is passed by keyword: newer librosa releases reject it as a positional argument
  return [ librosa.load(p, sr=fs)[0] for p in Path().glob(pattern) ]

def get_train_wav_dir(dir_name,fs=44100):
  """Return the loaded signals of all wav files in ./<dir_name>/train."""
  return _load_wav_dir(dir_name, 'train', fs)

def get_test_wav_dir(dir_name,fs=44100):
  """Return the loaded signals of all wav files in ./<dir_name>/test."""
  return _load_wav_dir(dir_name, 'test', fs)



emotions = ["sadness","happiness","anger","fear","disgust"]

# train_wav / test_wav hold one entry per emotion, in the order of `emotions`.
# Each entry is the list of numpy arrays loaded from that emotion's wav files (one array per file).
train_wav = [get_train_wav_dir(name, fs) for name in emotions]
test_wav = [get_test_wav_dir(name, fs) for name in emotions]




In [6]:
# Read brightness extracted features that were saved in csv files

from numpy import genfromtxt

os.chdir('/content/drive/My Drive/Emotions/Br_CSV/')

# br_train / br_test hold one numpy array per emotion, in the order of `emotions`.
# Each array is read from br_<emotion>_<train|test>.csv; it has one row per brightness
# feature and one column per frame, covering all wav files of that emotion.
def _read_brightness(emotion, split):
  """Read one brightness CSV and replace its non-finite entries in place via np.nan_to_num."""
  values = genfromtxt('br_' + emotion + '_' + split + '.csv', delimiter=',')
  np.nan_to_num(values, copy=False)
  return values

br_train = [_read_brightness(emotion, 'train') for emotion in emotions]
br_test = [_read_brightness(emotion, 'test') for emotion in emotions]



# Report the shape of the training brightness array of each of the five emotions
for emotion_idx in range(5):
  print(br_train[emotion_idx].shape)


(2, 32013)
(2, 28208)
(2, 25665)
(2, 27131)
(2, 34553)


In [7]:
# Feature extraction of audio files

from librosa.feature import spectral

# Gets a list of wav signals and returns a numpy array of features x nFrames_total and a list of len(input) containing the nFrames per signal
def _frame_features(signal):
  """Return the features x nFrames matrix of a single wav signal.

  Rows, in order: spectral centroid, 30% rolloff, 50% rolloff, zero crossing rate,
  MFCC coefficients 1, 3, 4 and 6, RMS energy.
  """
  # The audio is always passed as y=...: newer librosa releases no longer accept it positionally.
  stft_args = dict(sr=fs, n_fft=n_fft, hop_length=hop, center=False)
  spectral_centroid = spectral.spectral_centroid(y=signal, **stft_args)
  spectral_rolloff = spectral.spectral_rolloff(y=signal, roll_percent=0.30, **stft_args)
  spectral_rolloff50 = spectral.spectral_rolloff(y=signal, roll_percent=0.50, **stft_args)
  zero_crossing_rate = spectral.zero_crossing_rate(y=signal, frame_length=n_fft, hop_length=hop, center=False)

  mfcc = librosa.feature.mfcc(y=signal, n_mfcc=7, **stft_args)
  # Only four of the seven computed coefficients are kept
  desired_mfcc = mfcc[[1, 3, 4, 6], :]
  rms = librosa.feature.rms(y=signal, frame_length=n_fft, hop_length=hop, center=False)
  return np.concatenate((spectral_centroid,spectral_rolloff,spectral_rolloff50,zero_crossing_rate,desired_mfcc,rms),axis=0)

def extract_features(wav):
  """Extract frame-level features from a list of wav signals.

  Parameters:
    wav: non-empty list of audio time series (one numpy array per file).

  Returns:
    feature_vector: features x nFrames_total array, the per-file matrices placed
      side by side in input order.
    nFrames: list of len(wav) with the number of frames contributed by each signal.

  Raises:
    ValueError: if `wav` is empty.
  """
  if len(wav) == 0:
    raise ValueError("extract_features needs at least one wav signal")
  per_file = [_frame_features(signal) for signal in wav]
  nFrames = [features.shape[1] for features in per_file]
  # Concatenate once at the end instead of re-copying the growing matrix for every file
  feature_vector = np.concatenate(per_file, axis=1)
  return feature_vector, nFrames

# Create the feature matrix while working emotion by emotion. In each loop all the samples of the same emotion are used for feature extraction.
# Every per-emotion block stacks, row-wise: the extracted features, the brightness rows
# read from CSV, and one final row holding the emotion index as class label.
test_nframes = []
train_nframes = []
train_blocks = []
test_blocks = []
for label in range(len(train_wav)):
  train_feat_vec, train_nframe = extract_features(train_wav[label])
  # Frame counts are recorded for every emotion, the first one included, so that
  # train_nframes has one entry per emotion just like test_nframes.
  train_nframes.append(train_nframe)
  label_row = np.array([[label] * train_feat_vec.shape[1]])
  train_blocks.append(np.concatenate((train_feat_vec,br_train[label],label_row),axis=0))

  test_feat_vec, test_nframe = extract_features(test_wav[label])
  test_nframes.append(test_nframe)
  label_row = np.array([[label] * test_feat_vec.shape[1]])
  test_blocks.append(np.concatenate((test_feat_vec,br_test[label],label_row),axis=0))

# Join the per-emotion blocks side by side in a single pass
feature_train_vector = np.concatenate(train_blocks,axis=1)
feature_test_vector = np.concatenate(test_blocks,axis=1)


print(feature_test_vector.shape)
#print(happy_train_feat_vec.shape)
print('=')
print(feature_train_vector.shape)
#print(feature_vector[:,0:2])

(12, 25205)
=
(12, 147570)


In [None]:
# Create some spectrograms
import librosa.display
# Return to the dataset root so the relative wav paths below resolve
os.chdir('/content/drive/My Drive/Emotions/')
# sr is passed by keyword: newer librosa releases reject it as a positional argument
y,_ = librosa.load("./sadness/train/trimmed_s01 (6).wav",sr=fs)
y1,_ = librosa.load("./happiness/train/trimmed_h01 (6).wav",sr=fs)

# gets wav time series as nparray and emotion name as string and creates a spectrogram
def get_spectr(wav,em):
  """Plot the magnitude spectrogram of `wav` with centroid and rolloff curves on top.

  Parameters:
    wav: audio time series (numpy array).
    em: emotion name, used as the prefix of the figure title.

  Prints the shape of the magnitude spectrogram and shows the figure; returns None.
  """
  S, _ = librosa.magphase(librosa.stft(y=wav, n_fft=n_fft, hop_length=hop, center=False))
  print(S.shape)
  cent = spectral.spectral_centroid(S=S, sr=fs)
  rol30 = spectral.spectral_rolloff(S=S, sr=fs,roll_percent=0.30)
  rol50 = spectral.spectral_rolloff(S=S, sr=fs,roll_percent=0.50)

  # All three curves have one value per STFT frame, so one time axis serves them all.
  # hop_length has to be the hop used for the STFT: times_like and specshow otherwise
  # fall back to their own default hop and mislabel the time axis.
  times = librosa.times_like(cent, sr=fs, hop_length=hop)
  fig, ax = plt.subplots(figsize=(10,7))
  img = librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.min), sr=fs, hop_length=hop, y_axis='log', x_axis='time', ax=ax)
  ax.plot(times, cent.T, label='Spectral centroid', color='w')
  ax.plot(times, rol30.T, label='Spectral rolloff 30', color='#95eddf')
  ax.plot(times, rol50.T, label='Spectral rolloff 50', color='b')
  fig.colorbar(img, ax=ax, format="%+2.f dB")
  ax.legend(loc='upper right')
  ax.set(title= em + ' log Power spectrogram')
  plt.show()

# One spectrogram per example recording loaded above
for signal, emotion_name in ((y, "sad"), (y1, "happy")):
  get_spectr(signal, emotion_name)

In [8]:
# Statistical description of data
import pandas as pd
# Keep the 11 feature rows (the last row of the matrix holds the class label) and
# transpose so that each row is a frame and each column a feature.
feat_train = feature_train_vector[:11].T

pd_feat_train = pd.DataFrame(data=feat_train)
pd_feat_train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0
mean,2466.849859,1094.105888,1796.321196,0.13534,69.904337,8.923558,-5.913831,-12.793342,0.074399,0.840143,0.667034
std,1568.838165,1447.979581,1888.746918,0.147026,63.95129,29.482764,27.270888,20.12815,0.060985,0.199235,0.237168
min,26.587444,0.0,0.0,0.0,-235.525085,-107.76487,-149.226013,-100.508789,8.1e-05,0.0,0.0
25%,1408.982815,322.998047,538.330078,0.045898,37.315628,-10.931899,-22.989209,-25.982828,0.025422,0.77551,0.51232
50%,1956.965492,495.263672,990.527344,0.076172,79.371826,7.661649,-5.786995,-11.890461,0.06168,0.90774,0.697565
75%,2956.028462,968.994141,2217.919922,0.150391,113.570723,27.577981,11.082222,1.120398,0.108945,0.98008,0.864457
max,8398.838795,8161.083984,9259.277344,0.805664,251.672791,170.622467,115.711357,81.680939,0.427108,0.99995,0.99992


In [None]:
# Save features as csv file (written to the current working directory)
for csv_name, matrix in (("feature_train_vector.csv", feature_train_vector),
                         ("feature_test_vector.csv", feature_test_vector)):
  np.savetxt(csv_name, np.asarray(matrix), delimiter=",")

In [9]:
# Normalize, standardize and conduct PCA

from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

# Rows 0-10 of the feature matrices are features, row 11 is the class label.
# X_train is sliced here directly so this cell does not depend on the statistics cell.
X_train = feature_train_vector[0:11,:].T
Y_train = feature_train_vector[11,:].T

X_test = feature_test_vector[0:11,:].T
Y_test = feature_test_vector[11,:].T

# The standardizer is fitted on the training frames only and then applied to both sets.
# It keeps its own name so the fitted object stays available after the Normalizer is created.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

normalizer = Normalizer()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

pd_feat_train_norm = pd.DataFrame(X_train_norm)

# PCA on the standardized training data, keeping all 11 components
pca = decomposition.PCA(n_components=11)
pca_train = pca.fit_transform(X_train_scaled)

pd_feat_train_scal = pd.DataFrame(X_train_scaled)
pd_feat_train_scal.describe()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0,147570.0
mean,-2.869991e-16,-6.434622e-16,4.729865e-16,-5.696105e-16,-5.230951e-16,-8.887807e-16,-7.677087e-17,-8.388868e-16,-9.458826e-17,-1.869193e-15,-4.68924e-15
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-1.555464,-0.7556112,-0.9510683,-0.9205255,-4.775986,-3.957866,-5.255152,-4.357864,-1.218636,-4.21685,-2.812513
25%,-0.6743019,-0.5325424,-0.6660476,-0.6083447,-0.5095881,-0.6734621,-0.6261415,-0.6552779,-0.8031023,-0.3244069,-0.6523449
50%,-0.3250087,-0.4135723,-0.4266303,-0.4024382,0.1480427,-0.04280175,0.004650964,0.04485676,-0.2085652,0.3392825,0.1287302
75%,0.3118105,-0.08640465,0.2232169,0.1023648,0.6828093,0.6327251,0.6232327,0.6912601,0.5664651,0.7023717,0.8324229
max,3.781148,4.880595,3.951287,4.559244,2.842305,5.484542,4.459906,4.693655,5.783569,0.8021033,1.403593


In [None]:
# Print explained variance

# Share of the variance explained by each principal component
print(pca.explained_variance_ratio_,"\n")
# Absolute feature weights of the first three principal components
for component_idx in range(3):
  print(abs(pca.components_[component_idx]),"\n")


[0.45840578 0.18285486 0.10734736 0.08733904 0.07297653 0.04679342
 0.01788306 0.01146488 0.00714047 0.00568712 0.00210748] 

[0.43135714 0.41790639 0.42857404 0.4202851  0.39750623 0.03552982
 0.07027021 0.0510116  0.1106557  0.18163777 0.25868778] 

[0.10097535 0.05969985 0.08862746 0.08859157 0.10942133 0.43934074
 0.43077967 0.15283736 0.34541637 0.49820341 0.43484918] 

[0.03662048 0.10306708 0.05314632 0.10428999 0.03417959 0.38416389
 0.52940227 0.62420826 0.08425853 0.32375252 0.20920597] 



In [None]:
# Use thundersvm SVM
from joblib import dump
from thundersvm import SVC
from sklearn.model_selection import GridSearchCV
import time

#clf = SVC(C=15.0,gamma=5.0,kernel='rbf')
#clf.fit(X_train_scaled, Y_train)

# Grid over C and gamma for an RBF-kernel SVC, searched with GridSearchCV's default settings
parameters = {'kernel':['rbf'], 'C':[1, 10, 20, 30], 'gamma':['auto',0.1,1,5,10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
# The timestamp is kept in `start`: binding it to `str` would shadow the builtin
# for the rest of the kernel session.
start = time.time()
clf.fit(X_train_scaled, Y_train)
print(time.time()-start)

# dump(clf, './svm_gridsearch.joblib') 


In [None]:
# Show the estimator selected by the grid search fitted in the previous cell
clf.best_estimator_

In [None]:
import os
from joblib import dump, load

# Reload a previously saved grid search when one exists in the current directory.
# The matching dump in the grid-search cell is commented out, so on a fresh run the
# file may be missing; in that case the reload is skipped instead of aborting the run.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you created yourself.
svm_gridsearch_path = './svm_gridsearch.joblib'
if os.path.exists(svm_gridsearch_path):
  clf = load(svm_gridsearch_path)
else:
  print("No saved grid search found at", svm_gridsearch_path, "- skipping reload")

In [None]:
# Use libsvm and save the model
from joblib import dump, load
from sklearn import svm
import time

clf = svm.SVC(C=10.0)
# The timestamp is kept in `start`: binding it to `str` would shadow the builtin
# for the rest of the kernel session.
start = time.time()
clf.fit(X_train_scaled, Y_train)
print(time.time()-start)

# Persist the fitted model in the current working directory
dump(clf, './svm_c10.joblib') 

In [12]:
# Classification report using the mode(label) for each wav file
from sklearn import metrics
from statistics import mode, mean

def acc_mode(clf,X_test):
  """Print a per-wav-file classification report for `clf`.

  The rows of X_test are consumed in the order described by the global
  `test_nframes`: one inner list per emotion, one frame count per wav file.
  Each file is assigned the most frequent of its frame-level predictions; its
  true label is the position of its emotion in `test_nframes`.

  Parameters:
    clf: fitted classifier exposing predict().
    X_test: 2-D array of frame features, files stored back to back.

  Returns None; the report (3 digits) is printed.
  """
  from statistics import StatisticsError, mean, mode

  y_pred = []
  y_true = []

  row = 0
  for label, frames_per_file in enumerate(test_nframes):
    for n_frames in frames_per_file:
      pred = clf.predict(X_test[row:row+n_frames,:])
      try:
        val = mode(pred)
      except StatisticsError:
        # Python < 3.8 raises when there is no unique mode; fall back to the rounded
        # mean of the predicted labels. Any other error is no longer swallowed.
        val = round(mean(pred))
      y_pred.append(val)
      y_true.append(label)
      row = row + n_frames

  # Print the precision and recall, among other metrics
  print(metrics.classification_report(y_true, y_pred, digits=3))


In [None]:
# Per-file classification report for the SVM currently bound to `clf`, on the standardized test set
acc_mode(clf,X_test_scaled)

              precision    recall  f1-score   support

           0      0.486     0.944     0.642        18
           1      0.778     0.412     0.538        17
           2      0.786     0.611     0.688        18
           3      0.800     0.235     0.364        17
           4      0.520     0.722     0.605        18

    accuracy                          0.591        88
   macro avg      0.674     0.585     0.567        88
weighted avg      0.671     0.591     0.570        88



In [None]:
# Compute SVM mean accuracy score regarding whole wav files
# NOT USED
from sklearn.metrics import accuracy_score

# Frame-level accuracy is computed separately for every wav file and then averaged
# over files (each file weighs the same, whatever its number of frames).
accuracy_list = []

row = 0
for frames_per_file in test_nframes:
  for n_frames in frames_per_file:
    pred = clf.predict(X_test_scaled[row:row+n_frames,:])
    # normalize is passed by keyword: current scikit-learn rejects it as a positional argument
    acc = accuracy_score(Y_test[row:row+n_frames], pred, normalize=True)
    accuracy_list.append(acc)
    row = row + n_frames

print("Mean accuracy: ", np.mean(accuracy_list))

In [None]:
# SVM Classification report framewise
# NOT USED
from sklearn import metrics
# Report computed over individual frames rather than whole wav files
framewise_report = metrics.classification_report(Y_test, clf.predict(X_test_scaled), digits=3)
print(framewise_report)

In [13]:
# Implement Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time

clf_lgr = LogisticRegression(C=10,random_state=0,solver='sag',max_iter=50)
# The timestamp is kept in `start`: binding it to `str` would shadow the builtin
# for the rest of the kernel session.
start = time.time()
clf_lgr.fit(X_train_scaled, Y_train)
print("Log Reg train time",time.time()-start,"s\n")

# Per-file report; the former stand-alone predict() call was dropped because its result was discarded
acc_mode(clf_lgr,X_test_scaled)

Log Reg train time 2.384178876876831 s

              precision    recall  f1-score   support

           0      0.533     0.889     0.667        18
           1      0.500     0.118     0.190        17
           2      0.526     0.556     0.541        18
           3      0.500     0.176     0.261        17
           4      0.517     0.833     0.638        18

    accuracy                          0.523        88
   macro avg      0.515     0.514     0.459        88
weighted avg      0.516     0.523     0.465        88



In [14]:
# Implement Decision Tree
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so repeated runs build the same tree
dtree = DecisionTreeClassifier(criterion='entropy', random_state=0)
# The timestamp is kept in `start`: binding it to `str` would shadow the builtin
# for the rest of the kernel session.
start = time.time()
dtree.fit(X_train_scaled, Y_train)
print("time: ",time.time()-start)
acc_mode(dtree,X_test_scaled)

time:  7.249677896499634
              precision    recall  f1-score   support

           0      0.548     0.944     0.694        18
           1      0.636     0.412     0.500        17
           2      0.737     0.778     0.757        18
           3      0.667     0.235     0.348        17
           4      0.571     0.667     0.615        18

    accuracy                          0.614        88
   macro avg      0.632     0.607     0.583        88
weighted avg      0.631     0.614     0.586        88



In [15]:
# Implement k-NN
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbour classifier fitted on the standardized training frames
# (fit returns the estimator itself, so construction and fitting are chained)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, Y_train)

# Per-file classification report on the standardized test set
acc_mode(knn, X_test_scaled)

              precision    recall  f1-score   support

           0      0.531     0.944     0.680        18
           1      0.500     0.529     0.514        17
           2      0.571     0.667     0.615        18
           3      1.000     0.176     0.300        17
           4      0.786     0.611     0.688        18

    accuracy                          0.591        88
   macro avg      0.678     0.586     0.559        88
weighted avg      0.676     0.591     0.563        88



In [None]:
# Implement ensemble using the above models
def ensemble_acc(clf,X,Y):
  """Print a per-wav-file classification report for a majority-vote ensemble.

  For every frame the classifiers vote and the most frequent label wins; for every
  file the most frequent frame label becomes the file's prediction. Files are laid
  out in X as described by the global `test_nframes` (one inner list per emotion,
  one frame count per wav file), and a file's true label is the position of its
  emotion in that list.

  Parameters:
    clf: non-empty sequence of fitted classifiers exposing predict(); any number
      of classifiers is supported.
    X: 2-D array of frame features, files stored back to back.
    Y: unused; kept so existing calls keep working.

  Raises:
    ValueError: if `clf` is empty.

  Returns None; the report (3 digits) is printed.
  """
  from statistics import StatisticsError, mode

  if len(clf) == 0:
    raise ValueError("ensemble_acc needs at least one classifier")

  y_pred = []
  y_true = []

  row = 0
  for label, frames_per_file in enumerate(test_nframes):
    for n_frames in frames_per_file:
      file_rows = X[row:row+n_frames,:]
      # One vector of frame predictions per classifier
      votes_per_model = [model.predict(file_rows) for model in clf]
      pred = []
      for frame_votes in zip(*votes_per_model):
        try:
          val = mode(frame_votes)
        except StatisticsError:
          # No unique winner (raised by Python < 3.8): trust the first classifier
          val = frame_votes[0]
        pred.append(val)
      try:
        val = mode(pred)
      except StatisticsError:
        # No unique mode over the file's frames: fall back to the rounded mean label
        val = round(np.mean(pred))
      y_pred.append(val)
      y_true.append(label)
      row = row + n_frames

  # Print the precision and recall, among other metrics
  print(metrics.classification_report(y_true, y_pred, digits=3))

# Per-file report for the vote of the SVM in `clf`, the decision tree and the k-NN model
ensemble_acc([clf,dtree,knn],X_test_scaled,Y_test)

              precision    recall  f1-score   support

           0      0.486     0.944     0.642        18
           1      0.667     0.353     0.462        17
           2      0.706     0.667     0.686        18
           3      0.800     0.235     0.364        17
           4      0.591     0.722     0.650        18

    accuracy                          0.591        88
   macro avg      0.650     0.584     0.560        88
weighted avg      0.648     0.591     0.564        88

