In [1]:
import os
import tensorflow as tf
from tensorflow.python.platform import gfile
import sklearn
import numpy as np
from scipy import stats
import pandas as pd
import librosa
import pickle
import math
import matplotlib.pyplot as plt
from librosa import display
from collections import Counter
%matplotlib inline

In [2]:
def compute_features(tids):
    """Extract one (40, 18) feature matrix per analysis window for each track.

    Every track is loaded as a 20 s mono clip at 44.1 kHz (looped if shorter),
    padded at both ends, and cut into 200 windows of 8820 samples taken every
    4410 samples. Each window yields a matrix whose rows are:

        0-19   MFCCs
        20     spectral centroid
        21-32  chroma (STFT based)
        33-39  spectral contrast

    Parameters
    ----------
    tids : iterable of str
        Track identifiers; resolved to files via ``get_audio_path``.

    Returns
    -------
    (successful_tids, successful_features) : (list, list of np.ndarray)
        Parallel lists with one entry per window, so a track id appears once
        for every window it contributed.

    Notes
    -----
    A track is committed to the result only if all of its windows were
    computed. Previously an exception in the middle of a track left a partial
    set of windows behind, so tracks no longer contributed a fixed number of
    rows. Failures are still reported with ``print`` and the track is skipped.
    An empty audio file would make the looping section below spin forever;
    that section is marked "do not change" and is left as is.
    """
    hop_length = 512
    # The row assignments below require every window to produce exactly 18 frames.
    timeseries_length = 18
    n_windows = 200
    window_hop = 4410
    window_length = 8820

    successful_tids = []
    successful_features = []
    for tid in tids:
        try:
            filepath = get_audio_path('dataset/audio', tid)
        except Exception as e:
            print('{}: {}'.format(tid, repr(e)))
            continue

        track_features = []
        try:
            ### do not change here !
            x, sr = librosa.load(filepath, sr=44100, mono=True, duration=20)
            x = x.tolist()

            new_x = []
            while len(new_x) < 44100 * 20:
                new_x.extend(x)
            new_x = new_x[:44100 * 20]
            x = np.array(new_x)
            ###
            # Pad so that the first and last windows have a full 8820 samples.
            # NOTE(review): the tail pad starts at sample 41895 of the clip and
            # runs to its end, rather than being the clip's last 2205 samples.
            # Only its first 2205 samples are ever read by the windows below.
            # Kept unchanged so features stay compatible with saved models.
            front_x = x[:2205]
            end_x = x[41895:]
            x = np.append(front_x, x)
            x = np.append(x, end_x)

            for i in range(n_windows):
                data = np.zeros((40, timeseries_length), dtype=np.float64)
                start = i * window_hop
                mini_x = x[start:start + window_length]
                stft = np.abs(librosa.stft(mini_x, n_fft=2048, hop_length=hop_length))
                mel = librosa.feature.melspectrogram(sr=sr, S=stft ** 2)
                del stft
                mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
                # The signal is passed as keyword ``y``: newer librosa releases
                # reject it as a positional argument, older ones accept both.
                spectral_center = librosa.feature.spectral_centroid(y=mini_x, sr=sr, hop_length=hop_length)
                chroma = librosa.feature.chroma_stft(y=mini_x, sr=sr, hop_length=hop_length)
                spectral_contrast = librosa.feature.spectral_contrast(y=mini_x, sr=sr, hop_length=hop_length)
                data[0:20, :] = mfcc
                data[20:21, :] = spectral_center
                data[21:33, :] = chroma
                data[33:40, :] = spectral_contrast
                track_features.append(data)
        except Exception as e:
            print(filepath, e)
            continue

        # Commit the whole track at once so a failure never leaves partial rows.
        successful_tids.extend([tid] * len(track_features))
        successful_features.extend(track_features)

    return successful_tids, successful_features

def feature_examples(tid):
    """Walk through the librosa feature extractors on one track (demo only).

    Each feature is computed into the throw-away variable ``f``; nothing is
    accumulated. See https://librosa.org/doc/latest/feature.html for the
    feature reference.

    Parameters
    ----------
    tid : str
        Track identifier, resolved via ``get_audio_path``.

    Returns
    -------
    None on success. On any exception the error is printed and ``(tid, 0)``
    is returned.
    """
    try:
        filepath = get_audio_path('dataset/audio', tid)
        ### do not change here !
        x, sr = librosa.load(filepath, sr=44100, mono=True, duration=20)
        x = x.tolist()
        origin_length = len(x)

        new_x = []
        while len(new_x) < 44100 * 20:
            new_x.extend(x)
        new_x = new_x[:44100 * 20]
        x = np.array(new_x)
        ###

        # zero_crossing_rate
        # returns (1,t)
        # ``y`` is passed by keyword: newer librosa makes it keyword-only.
        f = librosa.feature.zero_crossing_rate(y=x, frame_length=2048, hop_length=512)

        cqt = np.abs(librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12,
                                 n_bins=7 * 12, tuning=None))
        assert cqt.shape[0] == 7 * 12
        assert np.ceil(len(x) / 512) <= cqt.shape[1] <= np.ceil(len(x) / 512) + 1

        # chroma_cqt
        # returns (n_chroma, t)
        f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7)

        # chroma_cens
        # returns (n_chroma, t)
        f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7)

        del cqt
        stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
        assert stft.shape[0] == 1 + 2048 // 2
        assert np.ceil(len(x) / 512) <= stft.shape[1] <= np.ceil(len(x) / 512) + 1
        del x

        # chroma_stft
        # returns (n_chroma, t)
        f = librosa.feature.chroma_stft(S=stft ** 2, n_chroma=12)

        # rms energy
        # returns (1,t)
        # The extractor is called ``rms`` in newer librosa and ``rmse`` in
        # older releases; use whichever the installed version provides.
        rms_fn = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse
        f = rms_fn(S=stft)

        # spectral_centroid
        # returns (1,t)
        f = librosa.feature.spectral_centroid(S=stft)

        # spectral_bandwidth
        # returns (1,t)
        f = librosa.feature.spectral_bandwidth(S=stft)

        # spectral_contrast
        # returns (n_bands+1, t)
        f = librosa.feature.spectral_contrast(S=stft, n_bands=6)

        # spectral_rolloff
        # returns (1,t)
        f = librosa.feature.spectral_rolloff(S=stft)

        # mfcc
        # returns (n_mfcc, t)
        mel = librosa.feature.melspectrogram(sr=sr, S=stft ** 2)
        del stft
        f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)

    except Exception as e:
        print('{}: {}'.format(tid, repr(e)))
        return tid, 0


def get_audio_path(audio_dir, track_id):
    """Return the path of the ``.wav`` file for ``track_id`` inside ``audio_dir``."""
    wav_name = track_id + '.wav'
    return os.path.join(audio_dir, wav_name)

In [3]:
# fixed
# Dataset locations and the fold that is held out for validation.
meta_path = 'dataset/audio_list.csv'
label_path = 'dataset/labels.pkl'
val_set_number = 0

# The metadata CSV is read for its 'set' (fold id) and 'FileName' (track id) columns.
metadata_df = pd.read_csv(meta_path)
# Rows whose 'set' equals val_set_number form the validation split; all others are training.
train_meta_df = metadata_df[metadata_df['set'] != val_set_number]
val_meta_df = metadata_df[metadata_df['set'] == val_set_number]
# Disabled: shuffling of the metadata rows.
# metadata_df = metadata_df.sample(frac=1).reset_index(drop=True)
# Track ids in CSV row order; this order is what lines features up with labels later on.
train_track_ids = train_meta_df['FileName'].values
val_track_ids = val_meta_df['FileName'].values

In [4]:
# Expensive step: runs librosa feature extraction over every training and validation track.
# The recorded run of this cell ended in a KeyboardInterrupt; the following cell loads
# previously pickled feature arrays instead.
ids, features = compute_features(train_track_ids)
val_ids, val_features = compute_features(val_track_ids)
# Stack the per-window (40, 18) matrices into arrays of shape (n_windows_total, 40, 18).
Xd = np.array(features)
Xd_val = np.array(val_features)
Xd.shape, Xd_val.shape



KeyboardInterrupt: 

In [5]:
# Load the cached feature arrays written by the pickle-dump cell of this notebook.
# Context managers close the files deterministically (the bare open() calls leaked them).
# NOTE: pickle executes arbitrary code on load; only use cache files you created yourself.
with open('mix.pkl', 'rb') as mixf:
    Xd = pickle.load(mixf)
with open('mix_val.pkl', 'rb') as mix_valf:
    Xd_val = pickle.load(mix_valf)

In [6]:
# Label dictionary keyed by track id. Opened via a context manager so the file is closed.
# NOTE: pickle executes arbitrary code on load; only use label files from a trusted source.
with open(label_path, 'rb') as labelf:
    labels = pickle.load(labelf)

def get_labels(name_list):
    """Look up every name in the global ``labels`` dict and stack the results.

    Per the original notes, each entry holds three per-frame lists in the
    order [hihats, kicks, snares]. Returns ``np.array`` of the looked-up
    entries, in the order of ``name_list``.
    """
    return np.array([labels[name] for name in name_list])

def to_eight(onehot):
    """Collapse per-instrument flags into a one-hot encoding over 8 classes.

    ``onehot`` is indexed as [track, instrument, frame] with instruments
    0 = hihat, 1 = kick, 2 = snare. Frames of all tracks are flattened
    track by track. A flag counts as set only when it equals 1.

    Class ids: 0 none, 1 hihat, 2 kick, 3 snare, 4 hihat+kick,
    5 hihat+snare, 6 kick+snare, 7 all three.

    Returns a float array of shape (tracks * frames, 8).
    """
    hihat_on = onehot[:, 0, :].flatten() == 1
    kick_on = onehot[:, 1, :].flatten() == 1
    snare_on = onehot[:, 2, :].flatten() == 1
    n = onehot.shape[0] * onehot.shape[2]

    # Pack the three flags into a 3-bit code (hihat=4, kick=2, snare=1) and
    # translate that code into the class id listed in the docstring.
    class_of_code = np.array([0, 3, 2, 6, 1, 5, 4, 7])
    code = hihat_on * 4 + kick_on * 2 + snare_on

    yd = np.zeros((n, 8))
    yd[np.arange(n), class_of_code[code]] = 1
    return yd

# Build the 8-class one-hot targets for both splits: one row per (track, frame).
# get_labels already returns an ndarray, so the np.array(...) wrapper only re-wraps it.
yd = to_eight(np.array(get_labels(train_track_ids)))
yd_val = to_eight(np.array(get_labels(val_track_ids)))
yd.shape, yd_val.shape

((11600, 8), (5800, 8))

In [6]:
# Persist features and targets so later sessions can skip feature extraction.
# Files are written in the order: mix.pkl, y.pkl, mix_val.pkl, y_val.pkl.
for cache_name, payload in (('mix.pkl', Xd),
                            ('y.pkl', yd),
                            ('mix_val.pkl', Xd_val),
                            ('y_val.pkl', yd_val)):
    with open(cache_name, 'wb') as cache_file:
        pickle.dump(payload, cache_file, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Model for training and validation

# Start from an empty default graph so re-running this cell does not pile up duplicate ops.
tf.reset_default_graph()

# Each example is one 40 x 18 feature matrix (feature rows x frames), 720 values when flattened.
n_input = 40 * 18
n_classs = 8
image_height = 40
image_width = 18
# fully-connected layer property
hidden1 = 1024
hidden2 = 1024
# Passed as the second positional argument of tf.nn.dropout inside model().
dropout_rate = 0.5
# convolutional layer property
conv_size = 3
n_filter1 = 64
n_filter2 = 128
# NOTE(review): n_filter3 is not referenced by model() in this notebook.
n_filter3 = 256
# pooling layer property
pool_size = 2
# Placeholder and variables
# TODO : declare placeholder and variables

# X: batch of feature matrices; y: batch of 8-class one-hot targets.
X = tf.placeholder(tf.float32, [None, image_height, image_width])
y = tf.placeholder(tf.int64, [None, n_classs])
# NOTE(review): is_training is handed to model() but never fed in any sess.run call of this notebook.
is_training = tf.placeholder(tf.bool)

# Build model
# TODO : build your model here
# Model
def from_eight(y_preds):
    """Expand 8-class one-hot rows back into per-instrument 0/1 lists.

    For each row of ``y_preds`` (indexed as ``y_preds[row, col]``), the first
    column among 1..7 that equals 1 selects the instrument combination.
    Rows with no such column (including class 0) decode to all zeros.

    Returns ``[hihat, kick, snare]``, three lists with one int per row.
    """
    # class column -> (hihat, kick, snare)
    combo_of_class = {
        1: (1, 0, 0),
        2: (0, 1, 0),
        3: (0, 0, 1),
        4: (1, 1, 0),
        5: (1, 0, 1),
        6: (0, 1, 1),
        7: (1, 1, 1),
    }
    n_rows = len(y_preds)
    hihat = [0] * n_rows
    kick = [0] * n_rows
    snare = [0] * n_rows
    for row in range(n_rows):
        for col in range(1, 8):
            if y_preds[row, col] == 1:
                hihat[row], kick[row], snare[row] = combo_of_class[col]
                break
    return [hihat, kick, snare]

def onehot(y):
    """Return an array shaped like ``y`` with a single 1 per row, at the row's argmax."""
    yd = np.zeros_like(y)
    winners = np.argmax(y, 1)
    yd[np.arange(len(winners)), winners] = 1
    return yd

def calculate_average_F1_score(pred_lists, label_lists):
    """Average the per-instrument F1 scores (hihat, kick, snare).

    ``pred_lists`` and ``label_lists`` each hold one 0/1 sequence per
    instrument. Precision, recall and F1 fall back to 0 whenever their
    denominator is zero. The sum of the F1 scores is always divided by 3.
    """
    f1_total = 0
    for preds, truths in zip(pred_lists, label_lists):
        pair_counts = Counter(zip(preds, truths))
        true_pos = pair_counts[1, 1]
        false_pos = pair_counts[1, 0]
        false_neg = pair_counts[0, 1]

        if true_pos + false_pos != 0:
            precision = true_pos / (true_pos + false_pos)
        else:
            precision = 0

        if false_neg + true_pos != 0:
            recall = true_pos / (false_neg + true_pos)
        else:
            recall = 0

        if precision + recall != 0:
            f1 = 2 * (precision * recall / (precision + recall))
        else:
            f1 = 0
        f1_total += f1

    f1_total /= 3
    return f1_total

def model(X,y,is_training):
    """Build the classifier graph and return the unnormalised class scores.

    A small convolutional branch turns the 40 x 18 input into 50 values,
    which are concatenated with the flattened raw input (720 values) and fed
    through two batch-normalised ELU layers of width hidden1 / hidden2 to an
    n_classs-wide output layer.

    X           : float tensor reshaped here to [-1, image_height, image_width, 1].
    y           : not used inside this function.
    is_training : only compared against None (see notes below).

    Returns the logits tensor of shape [batch, n_classs].

    NOTE(review): layers and variables are created in a fixed order; saved
    checkpoints presumably rely on the resulting variable names, so reorder
    with care.
    """
    regularizer = tf.contrib.layers.l2_regularizer(1e-6)
    activation = tf.nn.elu
    init = tf.contrib.layers.xavier_initializer()

    x_reshaped = tf.reshape(X, [-1, image_height, image_width, 1])
    c11 = tf.layers.conv2d(x_reshaped, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    # NOTE(review): c12 is built but never consumed; the pooling below reads c11.
    c12 = tf.layers.conv2d(c11, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp1 = tf.layers.max_pooling2d(c11, pool_size, 2,  "same")
    c21 = tf.layers.conv2d(mp1, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    # NOTE(review): c22 is likewise unused; the pooling below reads c21.
    c22 = tf.layers.conv2d(c21, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp2 = tf.layers.max_pooling2d(c21, pool_size, 2,  "same")
    # 1x1 convolution down to a single channel.
    c3 = tf.layers.conv2d(mp2, 1, 1, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    # This is a Python-level check made once at graph-build time, not a graph conditional.
    # The notebook passes a placeholder, so the dropout op is always part of the graph.
    # NOTE(review): the dropout output is multiplied by dropout_rate, which looks like it
    # cancels dropout's own rescaling of the kept units - confirm intent.
    if is_training is not None:
        c3 = tf.nn.dropout(c3, dropout_rate) * dropout_rate
    # Two stride-2 "same" poolings take 40 x 18 to 20 x 9 and then 10 x 5; with one channel that is 50 values.
    flat = tf.reshape(c3, [-1, 50])

    # Dense-layer parameters. Biases also use the Xavier initializer and the L2 regularizer.
    w_fc1 = tf.get_variable("W_fc1", shape=[n_input + 50, hidden1], initializer=init, regularizer=regularizer)
    b_fc1 = tf.get_variable("b_fc1", shape=[hidden1], initializer=init, regularizer=regularizer)
    w_fc2 = tf.get_variable("W_fc2", shape=[hidden1, hidden2], initializer=init, regularizer=regularizer)
    b_fc2 = tf.get_variable("b_fc2", shape=[hidden2], initializer=init, regularizer=regularizer)
    w_out = tf.get_variable("W_out", shape=[hidden2, n_classs], initializer=init, regularizer=regularizer)
    b_out = tf.get_variable("b_out", shape=[n_classs], initializer=init, regularizer=regularizer)

    # Skip connection: the raw flattened input is concatenated with the conv branch output.
    x_reshaped2 = tf.reshape(X, [-1, n_input])
    x_concat = tf.concat([x_reshaped2, flat], 1)
    z1 = tf.matmul(x_concat, w_fc1) + b_fc1
    # NOTE(review): batch_norm is called without an is_training argument, so it presumably
    # runs with the library default in both training and evaluation - confirm.
    a1 = tf.contrib.layers.batch_norm(z1)
    fc1 = tf.nn.elu(a1)
    if is_training is not None:
        fc1 = tf.nn.dropout(fc1, dropout_rate) * dropout_rate
    z2 = tf.matmul(fc1, w_fc2) + b_fc2
    a2 = tf.contrib.layers.batch_norm(z2)
    fc2 = tf.nn.elu(a2)
    if is_training is not None:
        fc2 = tf.nn.dropout(fc2, dropout_rate) * dropout_rate
    y_out = tf.matmul(fc2, w_out) + b_out
    return y_out

# Instantiate the graph once; y_out holds the logits for the batch fed through X.
y_out = model(X,y,is_training)

# Loss and optimizer
# TODO : declare loss and optimizer operation

# Softmax cross-entropy between the one-hot targets and the logits.
# NOTE(review): the L2 regularizers attached inside model() are not added to this loss
# anywhere in the visible code - confirm whether that is intended.
total_loss = tf.losses.softmax_cross_entropy(y,logits=y_out)
mean_loss = tf.reduce_mean(total_loss)
optimizer = tf.train.AdamOptimizer(1e-5)
# Ops registered under UPDATE_OPS (presumably the batch-norm moving-average updates)
# are forced to run together with every training step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_step = optimizer.minimize(mean_loss)
# Per-example boolean: predicted class equals target class.
correct_prediction = tf.equal(tf.argmax(y_out,1), tf.argmax(y,1))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [8]:
# Loss history kept outside the training cell so it survives re-runs of that cell.
train_losses = []
val_losses = []
# Lowest validation loss seen so far; a checkpoint is written only when this improves.
# 100 acts as an "anything is better" starting value.
best_loss = 100

In [11]:
# properties
# General
# TODO : declare additional properties
# not fixed (change or add property as you like)
batch_size = 5800
epoch_num = 10000
print_every = 100

# fixed
# True if you want to train, False if you already trained your model
# TODO : IMPORTANT !!! Please change it to False when you submit your code
is_train_mode = False
train_validation = True
validation = True
# TODO : IMPORTANT !!! Please specify the path where your best model is saved
# example : checkpoint/run-0925-0348
checkpoint_path = 'checkpoint/concat_1024_mix_18_elu'


def _evaluate_validation(sess):
    """Run the whole validation split through the graph in batches.

    Uses the notebook-level ``Xd_val`` / ``yd_val`` arrays and ``batch_size``.
    Per-batch loss and F1 are weighted by the batch's size before averaging,
    so a smaller final batch does not skew the result.

    Returns ``(loss, accuracy, f1)`` averaged over all validation examples.
    """
    n_val = Xd_val.shape[0]
    val_indicies = np.arange(n_val)
    correct = 0
    weighted_losses = []
    weighted_f1s = []
    for j in range(int(math.ceil(n_val / batch_size))):
        start_idx = (j * batch_size) % n_val
        idx = val_indicies[start_idx:start_idx + batch_size]
        feed_dict_val = {X: Xd_val[idx, :],
                         y: yd_val[idx]}
        actual_batch_size = yd_val[idx].shape[0]
        batch_loss, batch_corr, y_out_val = sess.run([mean_loss, correct_prediction, y_out],
                                                     feed_dict=feed_dict_val)
        weighted_losses.append(batch_loss * actual_batch_size)
        correct += np.sum(batch_corr)
        batch_f1 = calculate_average_F1_score(from_eight(onehot(y_out_val)), from_eight(yd_val[idx]))
        weighted_f1s.append(batch_f1 * actual_batch_size)
    return (np.sum(weighted_losses) / n_val,
            correct / n_val,
            np.sum(weighted_f1s) / n_val)


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # NOTE(review): the restore is unconditional, so training also needs an existing
    # checkpoint at checkpoint_path (i.e. training always resumes from it).
    saver.restore(sess, checkpoint_path)
    if is_train_mode:
        variables = [mean_loss, correct_prediction, y_out, train_step]
        iter_cnt = 0
        n_train = Xd.shape[0]
        for e in range(epoch_num):
            # Fresh random visiting order every epoch.
            train_indicies = np.arange(n_train)
            np.random.shuffle(train_indicies)
            correct = 0
            losses = []
            f1_scores = []
            for i in range(int(math.ceil(n_train / batch_size))):
                start_idx = (i * batch_size) % n_train
                idx = train_indicies[start_idx:start_idx + batch_size]
                feed_dict = {X: Xd[idx, :],
                             y: yd[idx]}
                actual_batch_size = yd[idx].shape[0]
                loss, corr, y_outs, _ = sess.run(variables, feed_dict=feed_dict)
                losses.append(loss * actual_batch_size)
                correct += np.sum(corr)
                f1_score = calculate_average_F1_score(from_eight(onehot(y_outs)), from_eight(yd[idx]))
                f1_scores.append(f1_score * actual_batch_size)
                if (iter_cnt % print_every) == 0:
                    print("Iteration {0}: with minibatch training loss = {1:.3g}, accuracy of {2:.2g}, and f1 score of {3:.2g}"\
                          .format(iter_cnt, loss, np.sum(corr) / actual_batch_size, f1_score))
                iter_cnt += 1
            total_correct = correct / n_train
            # Named epoch_loss (not total_loss) so the graph tensor `total_loss`
            # defined with the loss ops is not overwritten by a plain number.
            epoch_loss = np.sum(losses) / n_train
            total_f1 = np.sum(f1_scores) / n_train
            print("Epoch {2}, Overall loss = {0:.3g}, accuracy of {1:.3g} and f1 score of {3:.2g}"\
              .format(epoch_loss, total_correct, e + 1, total_f1))
            # Every 10th epoch: record the curves, validate, and checkpoint on improvement.
            if (e % 10 == 0) and train_validation:
                train_losses.append(epoch_loss)
                total_val_loss, total_val_correct, val_f1 = _evaluate_validation(sess)
                val_losses.append(total_val_loss)
                print("Validation loss, Overall loss = {0:.3g}, accuracy of {1:.3g} and f1 score of {2:.3g}".format(total_val_loss, total_val_correct, val_f1))
                # The plotted values are the batch-size-weighted losses collected above.
                plt.plot(losses)
                plt.grid(True)
                plt.title('Epoch {} Loss'.format(e + 1))
                plt.xlabel('minibatch number')
                plt.ylabel('minibatch loss')
                plt.show()
                if total_val_loss < best_loss:
                    best_loss = total_val_loss
                    output_dir = checkpoint_path
                    # checkpoint_path is a file-name prefix, so it is the parent
                    # directory that has to exist before saving (previously a
                    # directory named after the prefix itself was created).
                    parent_dir = os.path.dirname(output_dir)
                    if parent_dir and not gfile.Exists(parent_dir):
                        gfile.MakeDirs(parent_dir)
                    saver.save(sess, output_dir)
                    print('Model saved in file : %s' % output_dir)
        plt.plot(train_losses)
        plt.plot(val_losses)
        plt.grid(True)
        plt.title('Epoch {} Loss'.format(epoch_num))
        plt.xlabel('epoch number')
        plt.ylabel('epoch loss')
        plt.show()
        print('Training finished !')

    if validation:
        total_val_loss, total_val_correct, val_f1 = _evaluate_validation(sess)
        print("Validation loss, Overall loss = {0:.3g}, accuracy of {1:.3g} and f1 score of {2:.3g}"\
          .format(total_val_loss, total_val_correct, val_f1))


INFO:tensorflow:Restoring parameters from checkpoint/concat_1024_mix_18_elu
Validation loss, Overall loss = 0.302, accuracy of 0.932 and f1 score of 0.933


In [None]:
# Share of validation frames that fall into each of the 8 classes.
yd_val.sum(axis=0) / yd_val.sum()

In [None]:
# Training and validation loss as recorded in train_losses / val_losses.
fig, ax = plt.subplots()
ax.plot(train_losses)
ax.plot(val_losses)
ax.grid(True)
ax.set_title('Epoch {} Loss'.format(epoch_num))
ax.set_xlabel('epoch number')
ax.set_ylabel('epoch loss')
plt.show()