In [None]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import math
import librosa
import matplotlib.pyplot as plt
from tensorflow.python.platform import gfile
from time import strftime, localtime, time

In [2]:
# Feature Extraction function

def get_features(tids):
    mfcc_list = []
    try:
        for n, tid in enumerate(tids):
            tid, mfcc= compute_features(tid)
            mfcc_list.append(mfcc)
            print("Extracted features audio track", n)
    except Exception as e:
        print('{}: {}'.format(tid, repr(e)))

    return np.array(mfcc_list)

def compute_features(tid):
    threshold = 1278900
    try:
        filepath = get_audio_path('music/music_training', tid)
        x, sr = librosa.load(filepath, sr=None, mono=True, duration=29.0)  # kaiser_fast
        x = x.tolist()
        if(len(x) < threshold):
            raise ValueError('song length is shorter than threshold')
        else:
            x = x[:int(1278900)]#131000
        x = np.array(x)
        stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
        mel = librosa.feature.melspectrogram(sr=sr, S=stft ** 2)
        mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)

    except Exception as e:
        print('{}: {}'.format(tid, repr(e)))
        return tid, 0

    return tid, mfcc.tolist()

def get_audio_path(audio_dir, track_id):
    tid_str = '{:06d}'.format(track_id)
    return os.path.join(audio_dir, tid_str[:3], tid_str + '.mp3')



In [114]:
#Extract Training Sets

metadata_path = 'dataset/track_metadata.csv'
label_column_name = 'track_genre_top'
is_train_mode = True
label_dict = {'Electronic': 6,
 'Experimental': 4,
 'Folk': 7,
 'Hip-Hop': 2,
 'Instrumental': 3,
 'International': 1,
 'Pop': 5,
 'Rock': 0}

metadata_df = pd.read_csv(metadata_path)
if is_train_mode:
    metadata_df = metadata_df[metadata_df['set_split'] == 'training']
else:
    metadata_df = metadata_df[metadata_df['set_split'] == 'validation']
track_ids = np.array(metadata_df['track_id'])
tn = len(track_ids)
mfcc = get_features(track_ids)

########IMPORTANT############ Modify Training Sets

split = 10
n = mfcc.shape[0]
h = mfcc.shape[1]
w = mfcc.shape[2]//split
mfcce = np.zeros((n*split, h, w))
for i in range(split):
    mfcce[i*n:(i+1)*n, :, :w] = mfcc[:,:,i*w:(i+1)*w]
label_array = np.zeros((metadata_df.shape[0]*split, len(label_dict)))
labels = metadata_df[label_column_name].values
for j in range(split):
    for i, label in enumerate(labels):
        label_pos = label_dict.get(label)
        label_array[j*tn + i, label_pos] = 1
ys = label_array.copy()

Extracted features audio track 0
Extracted features audio track 1
Extracted features audio track 2
Extracted features audio track 3
Extracted features audio track 4
Extracted features audio track 5
Extracted features audio track 6
Extracted features audio track 7
Extracted features audio track 8
Extracted features audio track 9
Extracted features audio track 10
Extracted features audio track 11
Extracted features audio track 12
Extracted features audio track 13
Extracted features audio track 14
Extracted features audio track 15
Extracted features audio track 16
Extracted features audio track 17
Extracted features audio track 18
Extracted features audio track 19
Extracted features audio track 20
Extracted features audio track 21
Extracted features audio track 22
Extracted features audio track 23
Extracted features audio track 24
Extracted features audio track 25
Extracted features audio track 26
Extracted features audio track 27
Extracted features audio track 28
Extracted features audio

In [115]:
#Save as pkl
with open('mfcce.pkl', 'wb') as mfccf:
    pickle.dump(mfcce, mfccf, protocol=pickle.HIGHEST_PROTOCOL)
with open('ye.pkl', 'wb') as yf:
    pickle.dump(ys, yf, protocol=pickle.HIGHEST_PROTOCOL)

In [116]:
# Load Training Sets

class MacOSFile(object):
    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        if n >= (1 << 31):
            buffer = bytearray(n)
            pos = 0
            while pos < n:
                size = min(n - pos, 1 << 31 - 1)
                chunk = self.f.read(size)
                buffer[pos:pos + size] = chunk
                pos += size
            return buffer
        return self.f.read(n)

with open("mfcce.pkl", 'rb') as mfccf:
	Xd = pickle.load(MacOSFile(mfccf))
with open("ye.pkl", 'rb') as yf:
	yd = pickle.load(yf)

In [123]:
###############Extract Validation/Test Sets###############

is_train_mode = False

metadata_df = pd.read_csv(metadata_path)
if is_train_mode:
    metadata_df = metadata_df[metadata_df['set_split'] == 'training']
else:
    metadata_df = metadata_df[metadata_df['set_split'] == 'validation'] ######################## Modify here to insert test index
track_ids_val = np.array(metadata_df['track_id'])
vn = len(track_ids_val)
mfcc_val = get_features(track_ids_val)

########IMPORTANT############ Modify Training Sets

split = 10
n = mfcc_val.shape[0]
h = mfcc_val.shape[1]
w = mfcc_val.shape[2]//split
mfcce_val = np.zeros((n*split, h, w))
for i in range(split):
    mfcce_val[i*n:(i+1)*n, :, :w] = mfcc_val[:,:,i*w:(i+1)*w]
label_array_val = np.zeros((metadata_df.shape[0]*split, len(label_dict)))
labels = metadata_df[label_column_name].values
for j in range(split):
    for i, label in enumerate(labels):
        label_pos = label_dict.get(label)
        label_array_val[j*vn + i, label_pos] = 1
ys_val = label_array_val.copy()


Extracted features audio track 0
Extracted features audio track 1
Extracted features audio track 2
Extracted features audio track 3
Extracted features audio track 4
Extracted features audio track 5
Extracted features audio track 6
Extracted features audio track 7
Extracted features audio track 8
Extracted features audio track 9
Extracted features audio track 10
Extracted features audio track 11
Extracted features audio track 12
Extracted features audio track 13
Extracted features audio track 14
Extracted features audio track 15
Extracted features audio track 16
Extracted features audio track 17
Extracted features audio track 18
Extracted features audio track 19
Extracted features audio track 20
Extracted features audio track 21
Extracted features audio track 22
Extracted features audio track 23
Extracted features audio track 24
Extracted features audio track 25
Extracted features audio track 26
Extracted features audio track 27
Extracted features audio track 28
Extracted features audio

Extracted features audio track 238
Extracted features audio track 239
Extracted features audio track 240
Extracted features audio track 241
Extracted features audio track 242
Extracted features audio track 243
Extracted features audio track 244
Extracted features audio track 245
Extracted features audio track 246
Extracted features audio track 247
Extracted features audio track 248
Extracted features audio track 249
Extracted features audio track 250
Extracted features audio track 251
Extracted features audio track 252
Extracted features audio track 253
Extracted features audio track 254
Extracted features audio track 255
Extracted features audio track 256
Extracted features audio track 257
Extracted features audio track 258
Extracted features audio track 259
Extracted features audio track 260
Extracted features audio track 261
Extracted features audio track 262
Extracted features audio track 263
Extracted features audio track 264
Extracted features audio track 265
Extracted features a

Extracted features audio track 473
Extracted features audio track 474
Extracted features audio track 475
Extracted features audio track 476
Extracted features audio track 477
Extracted features audio track 478
Extracted features audio track 479
Extracted features audio track 480
Extracted features audio track 481
Extracted features audio track 482
Extracted features audio track 483
Extracted features audio track 484
Extracted features audio track 485
Extracted features audio track 486
Extracted features audio track 487
Extracted features audio track 488
Extracted features audio track 489
Extracted features audio track 490
Extracted features audio track 491
Extracted features audio track 492
Extracted features audio track 493
Extracted features audio track 494
Extracted features audio track 495
Extracted features audio track 496
Extracted features audio track 497
Extracted features audio track 498
Extracted features audio track 499
Extracted features audio track 500
Extracted features a

Extracted features audio track 708
Extracted features audio track 709
Extracted features audio track 710
Extracted features audio track 711
Extracted features audio track 712
Extracted features audio track 713
Extracted features audio track 714
Extracted features audio track 715
Extracted features audio track 716
Extracted features audio track 717
Extracted features audio track 718
Extracted features audio track 719
Extracted features audio track 720
Extracted features audio track 721
Extracted features audio track 722
Extracted features audio track 723
Extracted features audio track 724
Extracted features audio track 725
Extracted features audio track 726
Extracted features audio track 727
Extracted features audio track 728
Extracted features audio track 729
Extracted features audio track 730
Extracted features audio track 731
Extracted features audio track 732
Extracted features audio track 733
Extracted features audio track 734
Extracted features audio track 735
Extracted features a

In [124]:
#Save as pkl
with open('mfcce_val.pkl', 'wb') as mfccf_val:
    pickle.dump(mfcce_val, mfccf_val, protocol=pickle.HIGHEST_PROTOCOL)
with open('ye_val.pkl', 'wb') as yf_val:
    pickle.dump(ys_val, yf_val, protocol=pickle.HIGHEST_PROTOCOL)

In [125]:
#Load Test Sets

with open("mfcce_val.pkl", 'rb') as mfcc_val:
	Xd_val = pickle.load(mfcc_val)
with open("ye_val.pkl", 'rb') as ys_val:
	yd_val = pickle.load(ys_val)

In [None]:
## UNCOMMENT if you already have mfcc20.pkl for test sets

# ### Load Test Sets
# with open("nameofyourmfcc20.pkl", 'rb') as mfcc_valf:
# 	mfcc_val = pickle.load(mfcc_valf)

# ### MODIFY Test Sets
# split = 10
# n = mfcc_val.shape[0]
# h = mfcc_val.shape[1]
# w = mfcc_val.shape[2]//split
# mfcce_val = np.zeros((n*split, h, w))
# for i in range(split):
#     mfcce_val[i*n:(i+1)*n, :, :w] = mfcc_val[:,:,i*w:(i+1)*w]
# Xd_val = mfcce_val

In [98]:
# Model for training and validation

tf.reset_default_graph()

n_input = 20 * 249 
n_classs = 8 
image_height = 20
image_width = 249
# convolutional layer property
conv_size = 3
n_filter1 = 32
n_filter2 = 64
n_filter3 = 128
# pooling layer property
pool_size = 2
# fully-connected layer property
fc_dim = 512

# Placeholder and variables
# TODO : declare placeholder and variables

X = tf.placeholder(tf.float32, [None, image_height, image_width])
y = tf.placeholder(tf.int64, [None, n_classs])
is_training = tf.placeholder(tf.bool)

# Build model
# TODO : build your model here
# Model

def model(X,y,is_training):
    regularizer = tf.contrib.layers.l2_regularizer(1e-4)
    activation = tf.nn.relu
    init = tf.contrib.layers.xavier_initializer()
    
    w_fc = tf.get_variable("W_fc", shape=[8*n_filter3, fc_dim])
    b_fc = tf.get_variable("b_fc", shape=[fc_dim])
    w_out = tf.get_variable("W_out", shape=[fc_dim, n_classs])
    b_out = tf.get_variable("b_out", shape=[n_classs])
    
    x_reshaped = tf.reshape(X, [-1, image_height, image_width, 1])
    c11 = tf.layers.conv2d(x_reshaped, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    c12 = tf.layers.conv2d(c11, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp1 = tf.layers.max_pooling2d(c12, pool_size, 2,  "same")
    c21 = tf.layers.conv2d(mp1, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    c22 = tf.layers.conv2d(c21, n_filter1, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp2 = tf.layers.max_pooling2d(c22, pool_size, 2,  "same")
    c31 = tf.layers.conv2d(mp2, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    c32 = tf.layers.conv2d(c31, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp3 = tf.layers.max_pooling2d(c32, pool_size, 2,  "same")
    c41 = tf.layers.conv2d(mp3, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    c42 = tf.layers.conv2d(c41, n_filter2, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp4 = tf.layers.max_pooling2d(c42, pool_size, 2, "same")
    c51 = tf.layers.conv2d(mp4, n_filter3, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    c52 = tf.layers.conv2d(c51, n_filter3, conv_size, 1, "same", activation = activation, kernel_initializer = init, kernel_regularizer = regularizer)
    mp5 = tf.layers.max_pooling2d(c52, pool_size, 2, "same")
    if is_training is not None:
        mp5 = tf.nn.dropout(mp5, 0.5)    
    flat = tf.reshape(mp5, [-1, 8*n_filter3])
    fc = tf.nn.relu(tf.matmul(flat, w_fc) + b_fc)
    y_out = tf.matmul(fc, w_out) + b_out
    return y_out

y_out = model(X,y,is_training)

# Loss and optimizer
# TODO : declare loss and optimizer operation

total_loss = tf.losses.softmax_cross_entropy(y,logits=y_out) 
mean_loss = tf.reduce_mean(total_loss)
optimizer = tf.train.AdamOptimizer(1e-4) 
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train_step = optimizer.minimize(mean_loss)    
correct_prediction = tf.equal(tf.argmax(y_out,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))



In [126]:
# properties
# General
# TODO : declare additional properties
# not fixed (change or add property as you like)
batch_size = 5
epoch_num = 5
print_every = 10

# fixed
metadata_path = 'dataset/track_metadata.csv'
# True if you want to train, False if you already trained your model
# TODO : IMPORTANT !!! Please change it to False when you submit your code
is_train_mode = False
validation = True
# TODO : IMPORTANT !!! Please specify the path where your best model is saved
# example : checkpoint/run-0925-0348
checkpoint_path = 'checkpoint/cnn'
# 'track_genre_top' for project 1, 'listens' for project 2
label_column_name = 'track_genre_top'

# X, y, mean_loss,correct_prediction,train_step, accuracy = my_model(lr = 2e-4, rl = 1e-4,  is_training= is_train_mode)
#load data

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint_path)			
    if is_train_mode:
        variables = [mean_loss,correct_prediction,train_step]
        train_indicies = np.arange(Xd.shape[0])
        np.random.shuffle(train_indicies)
        iter_cnt = 0
        for e in range(epoch_num):
            correct = 0
            losses = []
            for i in range(int(math.ceil(Xd.shape[0]/batch_size))):
                start_idx = (i*batch_size)%Xd.shape[0]
                idx = train_indicies[start_idx:start_idx+batch_size]
                feed_dict = {X: Xd[idx,:],
                             y: yd[idx] }
                actual_batch_size = yd[idx].shape[0]
                loss, corr, _ = sess.run(variables,feed_dict=feed_dict)
                losses.append(loss*actual_batch_size)
                correct += np.sum(corr)
                if is_train_mode and (iter_cnt % print_every) == 0:
                    print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"\
                          .format(iter_cnt,loss,np.sum(corr)/actual_batch_size))
                iter_cnt += 1
            total_correct = correct/Xd.shape[0]
            total_loss = np.sum(losses)/Xd.shape[0]
            print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}"\
              .format(total_loss,total_correct,e+1))
            plt.plot(losses)
            plt.grid(True)
            plt.title('Epoch {} Loss'.format(e+1))
            plt.xlabel('minibatch number')
            plt.ylabel('minibatch loss')
            plt.show()
        print('Training finished !')
#         output_dir = checkpoint_path + '/run-%02d%02d-%02d%02d' % tuple(localtime(time()))[1:5]
        output_dir = checkpoint_path
        if not gfile.Exists(output_dir):
            gfile.MakeDirs(output_dir)
        saver.save(sess, output_dir)
        print('Model saved in file : %s' % output_dir)

    if validation:
        correct = 0
        losses = []
        preds = np.zeros([1,8])
        train_indicies = np.arange(Xd_val.shape[0])
        variables = [mean_loss,correct_prediction,y_out]
        for j in range(int(math.ceil(Xd_val.shape[0]/batch_size))):
            start_idx = (j*batch_size)%Xd_val.shape[0]
            idx = train_indicies[start_idx:start_idx+batch_size]
            feed_dict_val = {X: Xd_val[idx, :],
                         y: yd_val[idx] }
            actual_batch_size = yd_val[idx].shape[0]
            loss, corr, predict = sess.run(variables,feed_dict=feed_dict_val)
            preds = np.concatenate((preds, predict), axis = 0)
            losses.append(loss*actual_batch_size)
            correct += np.sum(corr)
        votes = np.zeros([Xd_val.shape[0]//10, 8])
        for n, i in enumerate(preds[1:]):
            votes[n%(Xd_val.shape[0]//10),np.argmax(i)] += 1
        total_val_correct = correct/Xd_val.shape[0]
        total_val_loss = np.sum(losses)/Xd_val.shape[0]
        sum_total_val_correct = sum(((np.argmax(votes, axis = 1) == np.argmax(yd_val[:Xd_val.shape[0]//10], axis = 1))))/(Xd_val.shape[0]//10)
        print("Validation loss, Overall loss = {0:.3g}, accuracy of {1:.3g}"\
          .format(total_val_loss,sum_total_val_correct))


INFO:tensorflow:Restoring parameters from checkpoint/cnn
Validation loss, Overall loss = 3.84, accuracy of 0.513
