In [417]:
import soundfile
import os
import glob
import numpy as np
from sklearn.model_selection import train_test_split
import librosa
import warnings; warnings.filterwarnings('ignore')

In [408]:
emotions ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

### Data for binary classification


In [409]:
def load_extract_features(data_path):

    '''
    load_extract_features() is a function that is used to load all the audio files one at a time, compute their features and return the features as well as the target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Returns:
    1. Features
    2. Binary Target Values
    '''
    final_features,target_emotions, binary_label = [],[], []
    count = 0
    
    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.
        
        name = os.path.basename(i)
        #We split the name of the file to understand the emotion associated with the file.
        split = name.split("-")
        #We know that the third identifier is associated with the emotion of the audio file. Hence, we use [2] as it represents the third identifier.
        emotion = emotions[split[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(0)
        elif emotion in ['angry', 'fearful']:
            binary_label.append(1)
        else:
            continue
        
        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate
            
            #Below is the code to extract the Mel spectrogram features
            #128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr / 2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,):
                melspectrogram = np.zeros(128)
            
            #Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)
            
            #12 is the number of pitch classes
            if chromagram.shape != (12,):
                chromagram = np.zeros(12)
                
            features=np.array([])
            features=np.hstack((chromagram, melspectrogram))
        
            final_features.append(features)
            target_emotions.append(emotion)
            
            count += 1
            if count % 100 == 0:
                print("Processed Audio File Number: ", count)
    
    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

In [410]:
#Please change the path below to the path of the folder saved on your computer.
data_path = './Audio_Speech_Actors_01-24'
X, binary_label = load_extract_features(data_path)

Processed Audio File Number:  100
Processed Audio File Number:  200
Processed Audio File Number:  300
Processed Audio File Number:  400
Processed Audio File Number:  500
Processed Audio File Number:  600
Processed Audio File Number:  700


In [411]:
X_aug = np.hstack((X, np.ones((np.shape(X)[0], 1))))
binary_label[binary_label == 0] = -1
X_train, X_test, Y_train, Y_test = train_test_split(
    X_aug, binary_label, test_size=0.3, random_state=12345)

In [412]:
REG = 1 # regularization constant
hinge = lambda x, y, weight : np.maximum(0, 1-(y * (weight @ x)))
def hinge_sum(x, y, weight):
    sum = 0.0
    for i in range(np.shape(x)[0]):
        sum += hinge(x[i], y[i], weight)
    return (sum/np.shape(x)[0])
cost = lambda weight : REG * 0.5 * np.dot(weight, weight) # weight cost function

def compute_gradient(x, y, weight):
    grad = np.zeros(np.shape(X_train)[1])
    for i in range(np.shape(x)[0]):
        if(abs(hinge(x[i], y[i], weight)) <= 0.0001):
            grad += REG * weight
        else:
            grad += (REG * weight - y[i]*x[i])
    return grad/np.shape(x)[0]

In [413]:
def train_SVM(X_train, Y_train):
    weight = np.zeros(np.shape(X_train)[1])
    init_cost = hinge_sum(X_train, Y_train, weight) + cost(weight)
    new_cost = -10000
    GRAD_DESC_REG = 0.01 # Step size 
    i = 0
    while(new_cost < init_cost-0.00001):
        if i != 0:
            init_cost = new_cost
        grad = compute_gradient(X_train, Y_train, weight)
        weight = weight - GRAD_DESC_REG * grad
        new_cost = hinge_sum(X_train, Y_train, weight) + cost(weight)
        i += 1
    print("Gradient descent completed in", i, "iterations")
    return weight


In [414]:
def predict(x, weight):
    return -1 if (weight @ x <= 0) else 1

weight = train_SVM(X_train, Y_train)
train_error = 0
for i in range(np.shape(X_train)[0]):
    pred = predict(X_train[i], weight)
    if pred != Y_train[i]:
        train_error += 1

print("Accuracy of classifier on training data:", 1-(train_error/np.shape(X_train)[0]))

test_error = 0
for i in range(np.shape(X_test)[0]):
    pred = predict(X_test[i], weight)
    if pred != Y_test[i]:
        test_error += 1

print("Accuracy of classifier on testing data:", 1-(test_error/np.shape(X_test)[0]))

Gradient descent completed in 76 iterations
Accuracy of classifier on training data: 0.6629422718808193
Accuracy of classifier on testing data: 0.6753246753246753


In [415]:
aver = np.mean(X, axis=0)
cov_X = np.cov(X, rowvar=False)
eig_val, eig_vec = np.linalg.eig(cov_X)
ord = eig_val.argsort()[::-1]
eig_val = eig_val[ord]
eig_vec = eig_vec[:, ord]
sum_eig = np.sum(eig_val)
trans_mat = []
total = 0
i = 0
while(total/sum_eig < 0.99):
    trans_mat.append(eig_vec[:, i])
    total += eig_val[i]
    i += 1
trans_mat = np.array(trans_mat)
reduced_X = X @ trans_mat.T
reduced_X = np.hstack((reduced_X, np.ones((np.shape(X)[0], 1))))
print("Dimension of data reduced to", np.shape(reduced_X), 
      "while preserving approximately 99% of information")
X_train, X_test, Y_train, Y_test = train_test_split(
    reduced_X, binary_label, test_size=0.3, random_state=12345)

Dimension of data reduced to (768, 24) while preserving approximately 99% of information


In [416]:
weight = train_SVM(X_train, Y_train)
train_error = 0
for i in range(np.shape(X_train)[0]):
    pred = predict(X_train[i], weight)
    if pred != Y_train[i]:
        train_error += 1

print("Accuracy of reduced classifier on training data:", 
      1-(train_error/np.shape(X_train)[0]))

test_error = 0
for i in range(np.shape(X_test)[0]):
    pred = predict(X_test[i], weight)
    if pred != Y_test[i]:
        test_error += 1

print("Accuracy of reduced classifier on testing data:", 
      1-(test_error/np.shape(X_test)[0]))

Gradient descent completed in 30 iterations
Accuracy of reduced classifier on training data: 0.7188081936685289
Accuracy of reduced classifier on testing data: 0.7402597402597403
