In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
import gc
# Automatic garbage collection is already on by default; enable() just makes that explicit.
gc.enable()
# Force a full collection now; the value displayed by the cell is the number of unreachable objects found.
gc.collect()

0

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, PReLU

In [4]:
import librosa, glob, os
import librosa.display
import matplotlib.pyplot as plt

In [5]:
# Build the pattern with os.path.join: the original "wav\*.wav" literal relied on
# the invalid escape sequence "\*" and only matched files on Windows.
# sorted() pins the file order, so the positional train/test split and the labels
# derived from file_list are reproducible across runs and platforms.
file_list = sorted(glob.glob(os.path.join("wav", "*.wav")))
# Fraction of the files held out for testing (the split is taken from the end of file_list).
TEST_PERCENTAGE = 0.2

In [129]:
def load_audio_data(file_name):
    """Load one audio file and compute the magnitude of its STFT.

    Parameters
    ----------
    file_name : str
        Path of an audio file that ``librosa.load`` can read.

    Returns
    -------
    tuple
        ``(y, sr, S)``: the waveform and sampling rate returned by
        ``librosa.load`` and the magnitude part of ``librosa.stft(y)``.
    """
    # `mono` is a boolean flag. The original passed the string "mono", which
    # only behaved correctly because any non-empty string is truthy.
    y, sr = librosa.load(file_name, mono=True)
    # magphase splits the complex STFT into (magnitude, phase); only the
    # magnitude is handed on to the feature extractors.
    S, _phase = librosa.magphase(librosa.stft(y))
    return (y, sr, S)

def get_features(y, sr, S):
    """Compute nine librosa descriptors for one recording.

    Parameters
    ----------
    y : np.ndarray
        Audio waveform.
    sr : int
        Sampling rate of ``y``.
    S : np.ndarray
        Magnitude spectrogram of ``y`` (e.g. the third value returned by
        ``load_audio_data``).

    Returns
    -------
    np.ndarray
        1-D object array of length 9 holding, in this order: MFCC,
        spectral centroid, spectral bandwidth, spectral contrast,
        spectral roll-off, RMS energy, chroma CENS, zero-crossing rate
        and tempogram. Each entry is whatever matrix librosa returned.
    """
    hop_length = 512

    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec), n_mfcc=20)

    sp_cent = librosa.feature.spectral_centroid(y=y, sr=sr)

    # Pass sr explicitly: without it librosa assumes its default sampling rate,
    # which is only right when the audio happens to be loaded at that rate.
    sp_band = librosa.feature.spectral_bandwidth(S=S, sr=sr)

    # Spectral contrast is computed from a freshly derived magnitude STFT of y,
    # independent of the S argument (kept as in the original implementation).
    stft_magnitude = np.abs(librosa.stft(y))
    sp_contra = librosa.feature.spectral_contrast(S=stft_magnitude, sr=sr)

    sp_roll = librosa.feature.spectral_rolloff(S=S, sr=sr)

    # librosa renamed `rmse` to `rms`; prefer the new name, fall back to the old one.
    rms_func = getattr(librosa.feature, "rms", None) or librosa.feature.rmse
    rmse = rms_func(S=S)

    chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)

    zero_rate = librosa.feature.zero_crossing_rate(y)

    oenv = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    tempogram = librosa.feature.tempogram(
        onset_envelope=oenv, sr=sr, win_length=20, hop_length=hop_length
    )

    features = [mfcc, sp_cent, sp_band, sp_contra, sp_roll,
                rmse, chroma_cens, zero_rate, tempogram]

    # The matrices have different row counts. Building the ragged container
    # explicitly avoids the ValueError that np.array(list_of_arrays) raises on
    # recent NumPy releases when no dtype=object is given.
    packed = np.empty(len(features), dtype=object)
    for index, matrix in enumerate(features):
        packed[index] = matrix
    return packed

def get_3features(y, sr, S):
    """Summarise a recording as row-averaged MFCC, chroma CENS and tempogram.

    Each of the three librosa matrices is averaged along its second axis,
    giving one value per row. The three resulting vectors are then zipped
    together so that row ``i`` of the output is
    ``[mfcc_mean[i], chroma_mean[i], tempogram_mean[i]]``.

    Parameters
    ----------
    y : np.ndarray
        Audio waveform.
    sr : int
        Sampling rate of ``y``.
    S : np.ndarray
        Accepted for signature parity with ``get_features``; not used here.

    Returns
    -------
    np.ndarray
        2-D array with three columns (mfcc, chroma CENS, tempogram).
    """
    hop_length = 512

    def _row_means(matrix):
        # Transpose, then average down axis 0: one mean per row of `matrix`.
        return np.mean(matrix.T, axis=0)

    mfcc_mean = _row_means(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))

    chroma_mean = _row_means(librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=20))

    onset_envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
    tempogram_mean = _row_means(
        librosa.feature.tempogram(
            onset_envelope=onset_envelope,
            sr=sr,
            win_length=20,
            hop_length=hop_length,
        )
    )

    # zip pairs the i-th entry of each vector; the triples become the rows.
    return np.array(list(zip(mfcc_mean, chroma_mean, tempogram_mean)))

#	return np.array([mfcc,])

In [122]:
def flatted_resized(messed_matrix, n_fragments=10):
    """Flatten each feature matrix and trim it to a multiple of ``n_fragments``.

    Every matrix is flattened to one dimension, then just enough leading
    elements are dropped for the remaining length to be divisible by
    ``n_fragments``.

    Parameters
    ----------
    messed_matrix : sequence of np.ndarray
        Feature matrices, possibly of different shapes.
    n_fragments : int, optional
        Divisor the flattened lengths must be a multiple of (default 10,
        the value that was previously hard-coded).

    Returns
    -------
    list of np.ndarray
        One 1-D array per input matrix, in the same order.

    Raises
    ------
    ValueError
        If ``n_fragments`` is not positive.
    """
    if n_fragments <= 0:
        raise ValueError("n_fragments must be a positive integer")

    flatted_resized_features = []
    for matrix in messed_matrix:
        flat = np.asarray(matrix).reshape(-1)
        # The number of leading elements to drop is simply the remainder;
        # the original found it by counting up in a while loop.
        surplus = flat.shape[0] % n_fragments
        flatted_resized_features.append(flat[surplus:])

    return flatted_resized_features

In [123]:
def split2fragment(flatted_resized_features, n_fragments=10):
    """Cut every feature vector into fragments and regroup them by position.

    Each vector is split into ``n_fragments`` consecutive chunks of
    ``len(vector) // n_fragments`` elements (any trailing remainder is
    ignored). Row ``n`` of the result is the horizontal concatenation of the
    ``n``-th chunk of every vector, in input order.

    Parameters
    ----------
    flatted_resized_features : sequence of 1-D np.ndarray
        Flattened feature vectors, e.g. the output of ``flatted_resized``.
    n_fragments : int, optional
        Number of fragments / output rows (default 10, the value that was
        previously hard-coded).

    Returns
    -------
    np.ndarray
        Array with ``n_fragments`` rows.

    Raises
    ------
    ValueError
        If ``n_fragments`` is not positive or no feature vector is given.
    """
    if n_fragments <= 0:
        raise ValueError("n_fragments must be a positive integer")
    if len(flatted_resized_features) == 0:
        raise ValueError("split2fragment needs at least one feature vector")

    fragments_per_feature = []
    for whole_frame in flatted_resized_features:
        interval = len(whole_frame) // n_fragments
        # Explicit start/stop slicing also copes with vectors shorter than
        # n_fragments (interval == 0 gives empty chunks), where the original
        # range(..., step=0) raised. Slicing never raises, so the former bare
        # `except: pass` is gone.
        fragments_per_feature.append(
            [whole_frame[n * interval:(n + 1) * interval] for n in range(n_fragments)]
        )

    # Hand hstack a plain list: chunks of different features have different
    # lengths, and wrapping them in np.array first builds a ragged array,
    # which recent NumPy releases reject.
    one_file = [
        np.hstack([fragments[n] for fragments in fragments_per_feature])
        for n in range(n_fragments)
    ]

    return np.array(one_file)
    #one_file = np.array([one_file])
    
#[len(frame1[i]) for i in range(len(frame1))]

In [124]:
# Positional hold-out split: the leading (1 - TEST_PERCENTAGE) share of
# file_list is used for training, the remainder for testing.
split_index = int(len(file_list) * (1 - TEST_PERCENTAGE))
train_file_list, test_file_list = file_list[:split_index], file_list[split_index:]
len(train_file_list), len(test_file_list)

(428, 107)

In [131]:
def extract_feature_set(file_names, verbose=False):
    """Load every file and return its ``get_3features`` matrix.

    Replaces two copy-pasted loops (train and test) that differed only in
    their input list and in whether progress was printed.

    Parameters
    ----------
    file_names : iterable of str
        Audio files to process, in order.
    verbose : bool, optional
        When True, print "one file success" after each file.

    Returns
    -------
    list
        One feature matrix per file, in the order of ``file_names``.
    """
    feature_list = []
    for file_name in file_names:
        y, sr, S = load_audio_data(file_name)
        # Alternative pipeline tried earlier and kept for reference:
        #   get_features -> flatted_resized -> split2fragment
        feature_list.append(get_3features(y, sr, S))
        if verbose:
            print("one file success")
    return feature_list


# Progress is printed for the training files only, as before.
train_y_sr_list = extract_feature_set(train_file_list, verbose=True)
test_y_sr_list = extract_feature_set(test_file_list)

one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file success
one file succe

In [132]:
# Sanity check: number of training files, rows in the first file's matrix, and row width for two sample files.
len(train_y_sr_list), len(train_y_sr_list[0]), len(train_y_sr_list[0][1]), len(train_y_sr_list[3][1]) 

(428, 20, 3, 3)

In [133]:
# The emotion code is the character six positions from the end of each file name.
emotion_label = [file_name[-6] for file_name in file_list]

emotion = list(set(emotion_label)) #  7 kind of emotion

# Letters -> integer class ids -> one-hot rows.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(emotion_label)
onehot_label = np_utils.to_categorical(encoded_Y)

# Cut the labels at the same position as the train/test file lists.
label_split = int(len(file_list) * (1 - TEST_PERCENTAGE))
train_emotion_label = np.array(onehot_label[:label_split])
test_emotion_label = np.array(onehot_label[label_split:])

encoded_Y, onehot_label

(array([2, 4, 6, 2, 4, 5, 6, 6, 0, 2, 3, 4, 5, 6, 0, 2, 4, 5, 6, 6, 2, 2, 3,
        4, 6, 2, 3, 4, 5, 6, 6, 0, 3, 4, 5, 6, 4, 5, 6, 3, 4, 5, 6, 0, 1, 4,
        4, 6, 6, 0, 2, 3, 4, 6, 6, 0, 0, 2, 3, 4, 5, 6, 2, 3, 4, 5, 6, 2, 3,
        4, 5, 6, 2, 3, 4, 5, 5, 6, 0, 2, 2, 3, 4, 6, 2, 3, 4, 5, 6, 2, 3, 4,
        5, 6, 0, 2, 3, 4, 5, 6, 6, 0, 2, 3, 4, 5, 6, 1, 2, 4, 6, 1, 1, 3, 6,
        2, 3, 4, 6, 1, 3, 4, 5, 6, 6, 1, 4, 5, 6, 6, 1, 4, 6, 4, 5, 6, 6, 1,
        2, 2, 3, 4, 5, 6, 1, 4, 6, 0, 4, 6, 0, 4, 6, 0, 2, 3, 4, 6, 2, 4, 6,
        6, 0, 3, 5, 6, 0, 0, 3, 5, 6, 0, 1, 2, 3, 0, 3, 4, 6, 3, 5, 6, 0, 3,
        6, 2, 3, 6, 0, 0, 3, 4, 6, 1, 2, 3, 4, 5, 6, 0, 2, 4, 6, 0, 2, 2, 3,
        4, 5, 6, 0, 3, 5, 6, 0, 1, 2, 3, 4, 6, 0, 2, 4, 5, 6, 2, 3, 4, 5, 6,
        6, 0, 2, 3, 4, 5, 6, 0, 0, 3, 4, 5, 6, 2, 3, 4, 6, 0, 1, 4, 6, 6, 6,
        0, 3, 4, 5, 6, 0, 3, 6, 5, 6, 0, 1, 2, 4, 6, 6, 6, 3, 5, 0, 5, 6, 0,
        3, 6, 0, 1, 1, 2, 3, 4, 6, 0, 1, 2, 3, 4, 5, 6, 0, 2, 3, 5, 6, 0, 1,

In [134]:
# Features and labels must line up one-to-one; both lengths should be equal.
len(train_y_sr_list), len(train_emotion_label)

(428, 428)

In [30]:
#len(train_y_sr_list), len(train_y_sr_list[0]), len(train_y_sr_list[0][0]), np.array(train_emotion_label).shape

from keras.preprocessing import sequence

# pad_sequences defaults to dtype="int32", which silently truncates
# floating-point feature values to integers; request float32 explicitly.
PAD_LENGTH = 1500  # every row is padded/truncated to this many values
train_y_sr_list_pad = np.array([sequence.pad_sequences(file_, maxlen=PAD_LENGTH, dtype="float32") for file_ in train_y_sr_list])
test_y_sr_list_pad = np.array([sequence.pad_sequences(file_, maxlen=PAD_LENGTH, dtype="float32") for file_ in test_y_sr_list])

In [15]:
# Shapes of the padded training features and of the one-hot labels.
# NOTE(review): this cell's execution count (15) is lower than its neighbours', so the recorded output may be stale; re-run to confirm.
train_y_sr_list_pad.shape, train_emotion_label.shape

((428, 10, 1500), (428, 7))

In [157]:
from keras import optimizers
from keras.layers import Dense, Dropout, Activation, LSTM, PReLU, BatchNormalization, Flatten
from keras.models import Sequential

# Input geometry: each sample is a (timesteps, data_dim) matrix.
timesteps = 20
data_dim = 3 # <--523, fixed by padding
num_classes = 7

# Three stacked LSTMs. The first two return the full sequence so the next LSTM
# receives one vector per timestep; the last returns only its final state,
# which feeds the softmax classifier.
model = Sequential([
    BatchNormalization(input_shape=(timesteps, data_dim)),

    LSTM(units=256, input_shape=(timesteps, data_dim), return_sequences=True),
    BatchNormalization(),

    LSTM(256, return_sequences=True),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),

    LSTM(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),

    Dense(num_classes, activation='softmax'),
])

# Two candidate optimizers; only `rmsprop` is passed to compile() below.
# Keras default for reference: SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=False)
sgd = optimizers.SGD(lr=0.000001, decay=1e-7, momentum=0.7, nesterov=True)
rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08) #This optimizer is usually a good choice for recurrent neural networks.

model.compile(loss='categorical_crossentropy',optimizer=rmsprop, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_84 (Batc (None, 20, 3)             12        
_________________________________________________________________
lstm_96 (LSTM)               (None, 20, 256)           266240    
_________________________________________________________________
batch_normalization_85 (Batc (None, 20, 256)           1024      
_________________________________________________________________
lstm_97 (LSTM)               (None, 20, 256)           525312    
_________________________________________________________________
batch_normalization_86 (Batc (None, 20, 256)           1024      
_________________________________________________________________
activation_53 (Activation)   (None, 20, 256)           0         
_________________________________________________________________
dropout_53 (Dropout)         (None, 20, 256)           0         
__________

In [158]:
from keras.callbacks import ModelCheckpoint

# Callback that writes the model to disk during training. With
# save_best_only=True a new file is written only when the monitored
# quantity improves.
checkpointer = ModelCheckpoint(
    filepath='3features_rmsprop_rl0000001_relu.hdf5',
    save_best_only=True,
    verbose=1,
)

In [159]:
import time
time_start = time.time()

# Keep the History object returned by fit(): the final cell reads
# `history.params`, which raised NameError while the return value was discarded.
# 20% of the training samples are held out for validation; the checkpoint
# callback saves the model during training.
history = model.fit(np.array(train_y_sr_list), train_emotion_label, epochs=20, batch_size=64, validation_split=0.2, callbacks=[checkpointer])

"""
Keras: why does loss decrease while val_loss increase?

1. (this may be a duplicate) It looks like your model is over fitting,
that is just memorizing the training data. In general a model that over fits can be improved by adding more dropout,
or training and validating on a larger data set. 
Explain more about the data/features and the model for further ideas.

2. Your case is strange because your validation loss never got smaller. Your learning rate is suspiciously high,
typical learning rates are about 0.001. 


3. You're overfitting. By making your model too complex, your model is finding patterns in your data that are not really there
(these "patterns" are just errors/random noise). 
Your model is then using these false patterns to make predictions when really it should be ignoring them.




Audio files are sequences with different lengths.

There are multiple ways to deal with variable length inputs. 
You typically feed the input, which is of fixed dimension, 
to a neural net multiple times, once for each audio frame. 
Then, the network learns from the sequence using an architecture like RNN, LSTM, 
or seq2seq (which is in flux, but in contrib/seq2seq). You can also use a simple DNN (feed-forward) architecture.



"""
print("cost time : {} second".format(time.time() - time_start))



#currently, best result is, OptimizerRMSprop, lr= 0.01, batch_size = 64

Train on 342 samples, validate on 86 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
cost time : 42.96045708656311 second


In [160]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import keras.utils.np_utils

np.random.seed(0)

def categorical_probas_to_classes(p):
    """Return, for each row of ``p``, the index of its largest entry."""
    probabilities = np.asarray(p)
    return probabilities.argmax(axis=1)

def probas_to_classes(y_pred):
    """Turn predicted probabilities into class labels.

    Input with more than one column is treated as per-class probabilities
    and reduced with ``categorical_probas_to_classes``. Anything else is
    treated as a single probability per sample and thresholded at 0.5
    (strictly greater than 0.5 maps to 1, otherwise 0).
    """
    shape = y_pred.shape
    if len(shape) <= 1 or shape[1] <= 1:
        # Binary case: one probability per sample.
        labels = []
        for p in y_pred:
            labels.append(1 if p > 0.5 else 0)
        return np.array(labels)
    return categorical_probas_to_classes(y_pred)

def evaluate(model, inputs, outputs):
    """Print and return quality metrics of ``model`` on a labelled set.

    Parameters
    ----------
    model
        Compiled Keras model whose ``evaluate`` returns ``(loss, accuracy)``.
    inputs : np.ndarray
        Feature tensor accepted by ``model.predict``.
    outputs : np.ndarray
        One-hot encoded ground-truth labels.

    Returns
    -------
    tuple
        ``(roc, accuracy)``: the ROC AUC score of the predicted
        probabilities and the accuracy reported by ``model.evaluate``.
    """
    # predict_proba was a thin wrapper around predict on Sequential models and
    # has been removed from later Keras/TensorFlow releases; predict returns
    # the same probabilities and exists in every version.
    y_prob = model.predict(inputs, verbose=0)
    y_pred = probas_to_classes(y_prob)
    y_true = np.argmax(outputs, 1)

    roc = roc_auc_score(outputs, y_prob)
    print ("ROC:",  round(roc,3))

    # evaluate the model
    score, accuracy = model.evaluate(inputs, outputs, batch_size=32)
    print("\nAccuracy = {:.2f}".format(accuracy))

    # the micro-averaged F-score gives a similar value to the accuracy score, but is useful for cross-checking
    p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
    print ("F-Score:", round(f,2))

    return roc, accuracy


# Score the trained model on the held-out test files; evaluate() prints ROC, accuracy and F-score.
print("Evaluating model...")
roc, acc = evaluate(model, np.array(test_y_sr_list), test_emotion_label)

Evaluating model...
ROC: 0.75
Accuracy = 0.25
F-Score: 0.25


In [161]:
# Keras' own evaluation on the test set; the model was compiled with metrics=['accuracy'], so the displayed list is [loss, accuracy].
model.evaluate(np.array(test_y_sr_list), test_emotion_label)



[1.8409418647534379, 0.25233644915518361]

In [None]:
import h5py
# Save the trained model to an HDF5 file.
# NOTE(review): the file name mentions 128 units / 30% dropout, while the model defined above uses 256 units and 0.5 dropout - confirm the intended name.
model.save('Unit128_Drop30_1sig3relu__.h5')
# NOTE(review): `history` must be the History object returned by model.fit; confirm the training cell binds that name before running this line.
history.params