In [176]:
import numpy as np
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import BatchNormalization
from keras.utils import np_utils
from keras import regularizers
from keras.engine.topology import Layer
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import matplotlib
matplotlib.use("Agg")
from matplotlib import pyplot as plt
import itertools

In [177]:
# Genre names indexed by class id: the notebook looks predictions up as
# song_labels[predicted_index] and uses this list as the confusion-matrix
# class labels, so the order must not be changed.
song_labels = ["Blues","Classical","Country","Disco","Hip hop","Jazz","Metal","Pop","Reggae","Rock"]

In [178]:
def metric(y_true, y_pred):
    """Accuracy metric: mean agreement between the arg-max of each input.

    Both tensors are reduced with an arg-max over axis 1; the result is the
    mean of the element-wise equality of those two index tensors.
    """
    true_idx = K.argmax(y_true, axis=1)
    pred_idx = K.argmax(y_pred, axis=1)
    hits = K.equal(true_idx, pred_idx)
    return K.mean(hits)

def cnn(num_genres=10, input_shape=(64,173,1)):
    """Build and compile the convolutional genre classifier.

    num_genres:  width of the final softmax layer.
    input_shape: shape of one input sample, handed to the first Conv2D layer.

    Returns the compiled Sequential model (categorical cross-entropy loss,
    Adam optimizer, `metric` as the reported metric).
    """
    layer_stack = [
        # Conv block 1: no kernel regularizer on this layer.
        Conv2D(64, kernel_size=(4, 4), activation='relu', input_shape=input_shape),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 4)),

        # Conv block 2: L2-regularized kernel, no batch normalization.
        Conv2D(64, (3, 5), activation='relu',
               kernel_regularizer=regularizers.l2(0.04)),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),

        # Conv block 3: no kernel regularizer on this layer.
        Conv2D(64, (2, 2), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),

        # Classifier head.
        Flatten(),
        Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.04)),
        Dropout(0.5),
        Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.04)),
        Dense(num_genres, activation='softmax'),
    ]
    network = Sequential(layer_stack)

    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                 epsilon=1e-08, decay=0.0)
    network.compile(loss=keras.losses.categorical_crossentropy,
                    optimizer=adam,
                    metrics=[metric])
    return network

In [179]:
class model(object):
    """Small training harness around a compiled Keras-style network.

    The wrapped network lives on ``self.model``. It must provide
    ``train_on_batch``, ``evaluate``, ``save_weights``, ``save`` and
    ``predict_classes``, which are the only methods this class calls.
    """

    def __init__(self, ann_model):
        """Create the wrapped network.

        ann_model: zero-argument callable returning the network (e.g. ``cnn``).
        """
        self.model = ann_model()

    def train_model(self, train_x, train_y,
                val_x=None, val_y=None,
                small_batch_size=200, max_iteration=300, print_interval=1,
                test_x=None, test_y=None):
        """Train with manually shuffled mini-batches and periodic evaluation.

        train_x, train_y:  training inputs and targets; indexed with a
                           permutation array, so they must support NumPy-style
                           fancy indexing.
        val_x, val_y:      validation data, evaluated every ``print_interval``
                           epochs. Despite the ``None`` defaults they are passed
                           straight to ``evaluate``, so supply them.
        small_batch_size:  number of samples per ``train_on_batch`` call.
        max_iteration:     number of epochs.
        print_interval:    evaluate/checkpoint on epochs where
                           ``it % print_interval == 0``.
        test_x, test_y:    test data, evaluated alongside the validation data
                           (same caveat as ``val_x``/``val_y``).

        On every evaluated epoch the weights are written to
        ``./saved_models/iter=<epoch>``. If the validation metric exceeds 0.81
        on that epoch, the full model and a confusion-matrix figure are saved.
        Returns None.
        """
        m = len(train_x)
        # Whole batches only: a trailing partial batch is dropped every epoch.
        num_batches = int(m / small_batch_size)

        for it in range(max_iteration):

            # Reshuffle inputs and targets with the same permutation.
            batch_idx = np.random.permutation(m)
            train_x = train_x[batch_idx]
            train_y = train_y[batch_idx]

            for batch in range(num_batches):

                x_batch = train_x[ batch*small_batch_size : (batch+1)*small_batch_size]
                y_batch = train_y[ batch*small_batch_size : (batch+1)*small_batch_size]
                print("starting batch\t", batch, "\t Epoch:\t", it)
                self.model.train_on_batch(x_batch, y_batch)

            if it % print_interval != 0:
                continue

            # Fix: checkpoint the wrapped network, not a global named `ann`.
            self.model.save_weights('./saved_models/iter='+ str(it))
            # evaluate() results are indexed as [0] = loss, [1] = metric below.
            validation_accuracy = self.model.evaluate(val_x, val_y)
            training_accuracy = self.model.evaluate(train_x, train_y)
            testing_accuracy = self.model.evaluate(test_x, test_y)
            # print of test error used only after development of the model
            print("\nTraining accuracy: %f\t Validation accuracy: %f\t Testing Accuracy: %f" %
                  (training_accuracy[1], validation_accuracy[1], testing_accuracy[1]))
            print("\nTraining loss: %f    \t Validation loss: %f    \t Testing Loss: %f \n" %
                  (training_accuracy[0], validation_accuracy[0], testing_accuracy[0]))
            print( )

            # Fix: the threshold check now only runs on epochs that were just
            # evaluated, so stale metrics from an earlier epoch can no longer
            # trigger repeated saves.
            if validation_accuracy[1] > .81:
                self._save_confusion_report(
                    test_x, test_y,
                    validation_accuracy[1], testing_accuracy[1],
                    # Same name the last batch index produced before.
                    figure_name=str(num_batches - 1))

    def _save_confusion_report(self, test_x, test_y, val_acc, test_acc, figure_name):
        """Save the full model plus a normalized confusion-matrix figure for the test set."""
        print("Saving confusion data...")
        model_name = "model" + str(100*val_acc) + str(100*test_acc) + ".h5"
        self.model.save(model_name)
        pred = self.model.predict_classes(test_x, verbose=1)
        true_classes = np.argmax(test_y, axis=1)
        cnf_matrix = confusion_matrix(true_classes, pred)
        np.set_printoptions(precision=2)
        fig = plt.figure()
        # plot_confusion_matrix is not defined in this part of the notebook;
        # it must already exist in the kernel namespace.
        plot_confusion_matrix(cnf_matrix, classes=song_labels, normalize=True, title='Normalized confusion matrix')
        print(precision_recall_fscore_support(true_classes, pred, average='macro'))
        plt.savefig(figure_name)
        # Release the figure so repeated reports do not accumulate open figures.
        plt.close(fig)

In [180]:
# Build a fresh network with cnn() through the wrapper class, then load
# previously saved weights into it from a file in the working directory.
ann_latest = model(cnn)
ann_latest.model.load_weights('model82.812569.53125.h5')
# Optional sanity check of the loaded weights. Not needed when the model is
# only used for prediction; x_tr/y_tr, x_cv/y_cv and x_te/y_te are not
# defined in this part of the notebook and must be loaded first.
#validation_accuracy = ann_latest.model.evaluate(x_cv, y_cv)
#training_accuracy = ann_latest.model.evaluate(x_tr, y_tr)
#testing_accuracy = ann_latest.model.evaluate(x_te, y_te)
#print(training_accuracy, validation_accuracy, testing_accuracy)

In [173]:
import librosa as lb 
import matplotlib.pyplot as plt

In [174]:
# Audio / spectrogram settings shared by loading and feature extraction.
SR = 22050  # sample rate used when loading audio and when slicing it into clips
N_FFT = 512  # FFT window length passed to the mel spectrogram
HOP_LENGTH = N_FFT // 2  # hop between frames: half a window
N_MELS = 64  # number of mel bands; equals the first dimension of cnn's default input_shape

def log_melspectrogram(data, log=True, plot=False, num='', genre=""):
	"""Compute a (log-scaled) mel spectrogram for one audio clip.

	data:  audio samples passed to librosa as ``y``.
	log:   when True, convert the result to decibels with ``power_to_db``.
	plot:  when True, also save an image of the spectrogram to
	       ``melspec<num>_<genre>.png`` in the working directory.
	num, genre: only used to build the image file name.

	Returns the spectrogram array as produced by librosa; the shape is the
	same whether or not ``plot`` is set.
	"""
	# sr is passed explicitly so the mel filter bank follows the notebook's SR.
	melspec = lb.feature.melspectrogram(y=data, sr=SR, hop_length = HOP_LENGTH, n_fft = N_FFT, n_mels = N_MELS)

	if log:
		# NOTE(review): the spectrogram is squared before the dB conversion.
		# This is kept as-is because the weights loaded elsewhere in the
		# notebook were presumably trained on features computed this way.
		melspec = lb.power_to_db(melspec**2)

	if plot:
		# Fix: draw from the array without rebinding `melspec`. Previously a
		# leading axis was added for plotting and leaked into the return value.
		fig, ax = plt.subplots()
		ax.imshow(melspec)
		fig.savefig('melspec'+str(num)+'_'+str(genre)+'.png')
		# Close the figure so batch plotting does not accumulate open figures.
		plt.close(fig)

	return melspec

In [175]:
def batch_log_melspectrogram(data_list, log=True, plot=False):
	"""Run log_melspectrogram over every clip and stack the results.

	data_list: indexable collection of audio clips (positions 0..len-1).
	log, plot: forwarded unchanged to log_melspectrogram for each clip.

	Returns a NumPy array built from the per-clip spectrograms, in order.
	"""
	collected = []
	for position in range(len(data_list)):
		clip = data_list[position]
		collected.append(log_melspectrogram(clip, log=log, plot=plot))
	return np.asarray(collected)

# How to get classifications for your files:

1. Put all the audio files in one folder.
2. Set the variable `folder_path` to that directory.
3. Run the following code. The variable `preds` will then hold the classification for each audio file.
4. `preds` contains label indices, which correspond to positions in the list `song_labels`. For example, a prediction of 5 corresponds to Jazz, which is `song_labels[5]`.

In [162]:
import os
import random
from scipy import stats

N_samples = 5  # number of random clips drawn from each file
sample_seconds = 2  # length of each clip, in seconds
folder_path = 'genres/genres/rock/'


# For each audio file: draw random clips, classify every clip, and keep the
# most frequent predicted class index as the file's prediction.
preds = []
# Sorted so that preds follows a deterministic, alphabetical file order.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        # Skip sub-directories; only regular files are handed to librosa.
        continue
    x,sr = lb.load(file_path, sr=SR)
    # Candidate clip start times, in whole seconds from the start of the file.
    sample_opts = range(round(x.shape[0]/SR)-sample_seconds-1)
    # Draw N_samples starts (previously a hard-coded 5), or fewer when the
    # file is too short to offer that many distinct start times.
    n_draw = min(N_samples, len(sample_opts))
    if n_draw == 0:
        print("Skipping", filename, "- too short to sample from")
        continue
    sample_i = random.sample(sample_opts, n_draw)
    samples = [x[i*SR: i*SR + sample_seconds*SR] for i in sample_i]
    res = batch_log_melspectrogram(samples)
    # Add the trailing channel axis expected by the network input.
    res_dim1 = np.expand_dims(res, axis=3)
    y_pred = ann_latest.model.predict_classes(res_dim1, verbose=1)
    # atleast_1d handles both array-valued and scalar `.mode` results, which
    # differ between SciPy versions.
    mode = int(np.atleast_1d(stats.mode(y_pred).mode)[0])
    preds.append(mode)
    print(song_labels[mode])

Rock
Rock
Rock
Rock
Rock
Rock
Country
Rock
Rock
Rock
Rock
Blues
Blues
Blues
Rock
Rock
Metal
Rock
Metal
Rock
Rock
Country
Country
Country
Rock
Rock
Rock
Metal
Rock
Rock
Country
Disco
Rock
Country
Rock
Rock
Country
Rock
Disco
Rock
Blues
Rock
Rock
Rock
Country
Rock
Rock
Metal
Rock
Jazz
Rock
Rock
Blues
Rock
Rock
Rock
Metal
Rock
Rock
Jazz
Rock
Blues
Jazz
Jazz
Country
Rock
Classical
Rock
Rock
Jazz
Jazz
Blues
Rock
Rock
Rock
Rock
Country
Rock
Rock
Rock
Rock
Jazz
Rock
Rock
Rock
Rock
Rock
Rock
Metal
Country
Blues
Rock
Rock
Rock
Metal
Rock
Rock
Rock
Country
Classical


In [170]:
# Show the predicted class index of every processed file, in processing order;
# each index is a position in song_labels.
print("Class for each file:", preds)

Class for each file: [9, 9, 9, 9, 9, 9, 2, 9, 9, 9, 9, 0, 0, 0, 9, 9, 6, 9, 6, 9, 9, 2, 2, 2, 9, 9, 9, 6, 9, 9, 2, 3, 9, 2, 9, 9, 2, 9, 3, 9, 0, 9, 9, 9, 2, 9, 9, 6, 9, 5, 9, 9, 0, 9, 9, 9, 6, 9, 9, 5, 9, 0, 5, 5, 2, 9, 1, 9, 9, 5, 5, 0, 9, 9, 9, 9, 2, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 6, 2, 0, 9, 9, 9, 6, 9, 9, 9, 2, 1]
