In [1]:
import soundfile
import os, pickle
import numpy as np
from pathlib import Path
from glob import glob
import torch
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import librosa
from model import LSTM_fixed_len as model
from model import Trainer
from sklearn.metrics import f1_score

In [2]:
# Prefer the GPU this notebook was developed on ('cuda:3'), but fall back to the
# first GPU, or to the CPU, so the notebook still runs on machines that do not
# expose four CUDA devices.
if torch.cuda.is_available():
    device = torch.device('cuda:3' if torch.cuda.device_count() > 3 else 'cuda:0')
else:
    device = torch.device('cpu')

In [4]:
# Build the trainer: it receives the target device and the folder used for
# model dumps.
model_dump = "project_models/"
trainer = Trainer(
    device,
    dump_folder=model_dump,
)

In [5]:
# def extract_feature(file_name, mfcc, chroma, mel):
#     with soundfile.SoundFile(file_name) as sound_file:
#         X, sample_rate = librosa.load(file_name)
#         if chroma:
#             stft=np.abs(librosa.stft(X))
#         result=np.array([])
#         if mfcc:
#             mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
#             result=np.hstack((result, mfccs))
#         if chroma:
#             chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, chroma))
#         if mel:
#             mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, mel))
#     return result

In [6]:
def extract_feature(file_name, pre_emphasis=0.97, top_db=20):
    """Compute frame-level MFCC features for one audio file.

    Adapted from
    https://github.com/terranivium/speech-emotion-recognition/blob/master/speech_emotion_recognition.ipynb

    Pipeline: read the samples as float32, down-mix to mono, apply a
    first-order pre-emphasis filter, drop silent stretches, then compute
    40 MFCCs with a 500 ms Hamming window and a 100 ms hop.

    Args:
        file_name: Path of the audio file, passed to ``soundfile.SoundFile``.
        pre_emphasis: Coefficient ``a`` of the filter
            ``y[t] = x[t] - a * x[t-1]``.
        top_db: Threshold passed to ``librosa.effects.split``; audio this
            many dB below the peak is treated as silence.

    Returns:
        numpy.ndarray of shape ``(39, n_frames)``: the MFCC matrix with
        coefficient 0 removed.

    Raises:
        ValueError: If the file contains no samples.
    """
    # Only the read needs the open handle; close the file before the DSP work.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # Multi-channel reads come back as (frames, channels). Without a down-mix
    # the np.append below would flatten the array and interleave the channels
    # into one corrupted signal.
    if X.ndim > 1:
        X = X.mean(axis=1)
    if X.size == 0:
        raise ValueError(f"{file_name}: audio file contains no samples")

    # pre-emphasis: boost high frequencies, keeping the first sample unchanged
    X = np.append(X[0], X[1:] - pre_emphasis * X[:-1])

    # remove silence: keep only the non-silent intervals, in order
    intervals = librosa.effects.split(X, top_db=top_db)
    if len(intervals) > 0:
        X = np.concatenate([X[start:end] for start, end in intervals], axis=0)

    # extract features: 500 ms analysis window, 100 ms hop
    hop_length = int(0.100 * sample_rate)
    n_fft = int(0.500 * sample_rate)
    # NOTE(review): fmax is fixed at 8 kHz, which assumes sample_rate >= 16 kHz
    # -- confirm against the dataset's actual sample rate.
    mfccs = librosa.feature.mfcc(y=X,
                                 sr=sample_rate,
                                 n_mfcc=40,
                                 n_mels=40,
                                 power=2.0,
                                 window='hamming',
                                 fmin=0,
                                 fmax=8000,
                                 hop_length=hop_length,
                                 n_fft=n_fft,
                                 win_length=n_fft,
                                 center=True)
    # Drop coefficient 0, leaving 39 coefficients per frame.
    return mfccs[1:]

In [7]:
# Emotion names keyed by 0-based class index (the list position is the label id).
emotions = dict(enumerate([
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]))

In [8]:
def load_data(data_dir="Audio_Speech_Actors_01-24"):
    """Load MFCC features and emotion labels for every ``Actor_*/*.wav`` file.

    The label is parsed from the third dash-separated field of the file name
    (a 1-based numeric emotion code) and shifted to be 0-based, matching the
    keys of the ``emotions`` mapping.

    Args:
        data_dir: Dataset root containing the ``Actor_*`` sub-folders.

    Returns:
        Tuple ``(X, y)``: ``X`` is a list with one ``extract_feature`` result
        per file and ``y`` is the parallel list of integer labels.

    Raises:
        FileNotFoundError: If no matching ``.wav`` file is found.
    """
    # Sort the paths: glob order depends on the filesystem, and the seeded
    # train/test split downstream is only reproducible for a fixed order.
    wav_files = sorted(Path(data_dir).glob("Actor_*/*.wav"))
    if not wav_files:
        raise FileNotFoundError(
            f"No .wav files found under {str(data_dir)!r} (expected Actor_*/*.wav)"
        )

    X, y = [], []
    for file in wav_files:
        # int() already handles the zero-padded code ("08" -> 8); stripping
        # zeros by hand would mangle codes such as "10".
        emotion = int(file.name.split("-")[2]) - 1
        X.append(extract_feature(file))
        y.append(emotion)
    return X, y

In [9]:
# Extract features for the whole dataset: `x` is a list with one MFCC matrix
# per file, `y` the parallel list of 0-based emotion labels.
x, y = load_data()

  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]
  n_fft, y.shape[-1]


In [10]:
# Longest coefficient row (in frames) across every file; 0 when there is no data.
max_len = max((len(features) for row in x for features in row), default=0)

In [11]:
# Display the longest per-coefficient frame count found across all files.
max_len

26

In [12]:
# Right-pad every coefficient row with zeros up to `max_len` frames so that all
# files share the same shape.
X = [
    [list(features) + [0] * (max_len - len(features)) for features in row]
    for row in x
]

In [13]:
# Sanity check: number of files, coefficients per file, padded frames per row.
print(*(len(level) for level in (X, X[0], X[0][0])))

1440 39 26


In [14]:
# Number of labels; load_data appends one label per feature matrix, so this
# should equal len(X).
len(y)

1440

In [15]:
# Convert the padded nested lists into a single tensor.
# Note: this rebinds `x`, which until now held the raw feature list returned by
# load_data().
x = torch.Tensor(X)

In [16]:
# 60/20/20 train/validation/test split: hold out 20% for testing, then take
# 25% of the remaining 80% as the validation set. A fixed random_state keeps
# the shuffling repeatable for a given input order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=9)

In [17]:
# Make sure every split is a tensor: features via torch.Tensor, class labels
# via torch.LongTensor.
x_train, x_test, x_val = (torch.Tensor(split) for split in (x_train, x_test, x_val))
y_train, y_test, y_val = (torch.LongTensor(labels) for labels in (y_train, y_test, y_val))

In [18]:
# Move the feature tensors of all three splits onto the selected device.
x_train, x_test, x_val = (split.to(device) for split in (x_train, x_test, x_val))

In [19]:
# Move the label tensors of all three splits onto the selected device.
y_train, y_test, y_val = (labels.to(device) for labels in (y_train, y_test, y_val))

In [20]:
#x_train,x_test,y_train,y_test = x_train.to(device), x_test.to(device), y_train.to(device), y_test.to(device)

In [21]:
# Layout is (files, MFCC coefficients, padded frames).
# NOTE(review): confirm the imported model expects this layout rather than
# (files, frames, coefficients) -- the model definition is not visible here.
x_train.shape

torch.Size([864, 39, 26])

In [22]:
# Dimension 1 is the MFCC-coefficient axis: 40 coefficients were computed and
# coefficient 0 is dropped in extract_feature, leaving 39.
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 39


In [23]:
# One output unit per emotion class.
output_size = len(emotions)

# Candidate configurations, one tuple per model, in the order:
# (learning_rate, hidden_size, number_layers, batch_size, model)
set_hyperparameters = [
    dict(zip(("learning_rate", "hidden_size", "number_layers", "batch_size", "model"), config))
    for config in (
        (0.001, 100, 3, 25, 'model1'),
        (0.002, 300, 5, 100, 'model2'),
        (0.001, 32, 10, 80, 'model3'),
        (0.0005, 100, 3, 25, 'model4'),
        (0.001, 250, 3, 32, 'model5'),
    )
]

In [None]:
# Train one model per hyperparameter configuration, passing both the training
# and the validation split to the trainer.
# NOTE(review): presumably each run is saved under `model_dump` using the
# configuration's "model" name -- confirm in Trainer.train_model.
for hp in set_hyperparameters:
    trainer.train_model(model, x_train, y_train, x_val, y_val, hp, output_size)

In [None]:
# Predict on the held-out test split with one specific saved model.
# NOTE(review): "model5" is hard-coded as the best model; presumably it was
# picked by hand from the validation results -- confirm, and keep the folder in
# sync with `model_dump`.
best_model_path = "project_models/model5.pt"
y_pred = trainer.predict(x_test, model, best_model_path)

In [None]:
# Weighted F1 on the test split. scikit-learn works on NumPy arrays, and
# `y_test` was moved to `device` earlier; a CUDA tensor cannot be converted to
# NumPy directly, so bring the tensors back to the CPU first.
f = f1_score(
    y_test.detach().cpu().numpy(),
    y_pred.detach().cpu().numpy() if torch.is_tensor(y_pred) else y_pred,
    average='weighted',
)

In [None]:
# Report the weighted F1 score as a percentage with two decimals.
print("F1 Score: {:.2f}%".format(f*100))