# CNN for IoSL

This notebook includes code for training a CNN to classify songs according to their intelligibility to one of three classes (High, Moderate, Low).

The CNN is trained on the melspectrogram of the whole song. Songs are padded to a length of (x) where x is the longest file in the dataset.

Requirements: 
- Keras
- Librosa 

In [27]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers import Input, Dense, TimeDistributed, LSTM, Dropout, Activation
from keras.layers import Conv2D, MaxPooling2D, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ELU
from keras.callbacks import ModelCheckpoint
from keras import backend
from keras.utils import np_utils
import numpy as np
import librosa
import librosa.display
import fnmatch
import os
from os.path import isfile
import csv
from shutil import copyfile

Initializing parameters. 

__Note:__ Change directories accordingly. 

In [2]:
batch_size = 32
num_classes = 3
epochs = 10
data_augmentation = False
num_predictions = 3
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_IoSL_trained_model.h5'
# Directory of the dataset
datasetDir = "/Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/"
# Directory of the saved melspectrogram
outpath = "/Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Mels/"

The following code organizes the dataset in three seperate folders depending on their class. It also computes the melspectrogram for each file and save it similarily into seperate folders. 

In [21]:
# Preprocessing and saving mels
# Copying files to the three classes folders
def load_labels(labelsDir = '/Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/labels_genres.csv'):
    name,label,genre = [],[],[]
    with open(labelsDir) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        for row in readCSV:
            name.append(row[0])
            label.append(row[1])
            genre.append(row[2])
    return name,label,genre

def copy_files_to_directoriesClasses(directory = "/Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/FullDataset/"
):
    name,label,genre = load_labels()
    files = fnmatch.filter(os.listdir(directory), '*.wav')
    for filename in files:
        idx = name.index(filename)
        score = float(label[idx])
        if score < 0.33:
            copyfile(directory + filename, datasetDir + "Low/" + '[' + str("{0:.2f}".format(score)) + ']' + filename)
        elif score < 0.66:
            copyfile(directory + filename, datasetDir + "Moderate/" + '[' + str("{0:.2f}".format(score)) + ']' + filename)
        else:
            copyfile(directory + filename, datasetDir + "High/" + '[' + str("{0:.2f}".format(score)) + ']' + filename)

def get_class_names(path=datasetDir):  # class names are subdirectory names in Samples/ directory
    class_names = os.listdir(path)
    return class_names

def preprocess_dataset(inpath=datasetDir, outpath=outpath):
    class_names = get_class_names(path=inpath)   # get the names of the subdirectories
    nb_classes = len(class_names)
    print("class_names = ",class_names)
    for idx, classname in enumerate(class_names):   # go through the subdirs

        if not os.path.exists(outpath+classname):
            os.mkdir( outpath+classname );   # make a new subdirectory for preproc class

        class_files = files = fnmatch.filter(os.listdir(inpath+classname), '*.wav')
        n_files = len(class_files)
        n_load = n_files
        print(' class name = {:14s} - {:3d}'.format(classname,idx),
            ", ",n_files," files in this class",sep="")

        printevery = 20
        for idx2, infilename in enumerate(class_files):
            audio_path = inpath + classname + '/' + infilename
            if (0 == idx2 % printevery):
                print('\r Loading class: {:14s} ({:2d} of {:2d} classes)'.format(classname,idx+1,nb_classes),
                       ", file ",idx2+1," of ",n_load,": ",audio_path,sep="")
            #start = timer()
            aud, sr = librosa.load(audio_path, sr=None)
            """
            Padding Audio to max size [705601] (Calculated seperatly on the longest file in this dataset, around 16 Sec)
            """
            paddedAud = np.zeros(705601)
            paddedAud[0:len(aud),] = aud
            melgram = librosa.amplitude_to_db(librosa.feature.melspectrogram(paddedAud, sr=sr, n_mels=96))[np.newaxis,np.newaxis,:,:]
            outfile = outpath + classname + '/' + infilename+'.npy'
            np.save(outfile,melgram)

Load the melspectrogram files and parition it to train/test. 

In [4]:
def get_class_names(path="Preproc/"):  # class names are subdirectory names in Preproc/ directory
    class_names = os.listdir(path)
    return class_names

def get_total_files(path="Preproc/",train_percentage=0.8): 
    sum_total = 0
    sum_train = 0
    sum_test = 0
    subdirs = os.listdir(path)
    for subdir in subdirs:
        files = os.listdir(path+subdir)
        n_files = len(files)
        sum_total += n_files
        n_train = int(train_percentage*n_files)
        n_test = n_files - n_train
        sum_train += n_train
        sum_test += n_test
    return sum_total, sum_train, sum_test

def get_sample_dimensions(path='Preproc/'):
    classname = os.listdir(path)[0]
    files = os.listdir(path+classname)
    infilename = files[0]
    audio_path = path + classname + '/' + infilename
    melgram = np.load(audio_path)
    print("   get_sample_dimensions: melgram.shape = ",melgram.shape)
    return melgram.shape

def encode_class(class_name, class_names):  # makes a "one-hot" vector for each class name called
    try:
        idx = class_names.index(class_name)
        vec = np.zeros(len(class_names))
        vec[idx] = 1
        return vec
    except ValueError:
        return None

def shuffle_XY_paths(X,Y,paths):   # generates a randomized order, keeping X&Y(&paths) together
    assert (X.shape[0] == Y.shape[0] )
    idx = np.array(range(Y.shape[0]))
    np.random.shuffle(idx)
    newX = np.copy(X)
    newY = np.copy(Y)
    newpaths = paths
    for i in range(len(idx)):
        newX[i] = X[idx[i],:,:]
        newY[i] = Y[idx[i],:]
        newpaths[i] = paths[idx[i]]
    return newX, newY, newpaths
    
def get_total_files(path=outpath,train_percentage=0.8): 
    sum_total = 0
    sum_train = 0
    sum_test = 0
    subdirs = os.listdir(path)
    for subdir in subdirs:
        files = os.listdir(path+subdir)
        n_files = len(files)
        sum_total += n_files
        n_train = int(train_percentage*n_files)
        n_test = n_files - n_train
        sum_train += n_train
        sum_test += n_test
    return sum_total, sum_train, sum_test

total_files, total_train, total_test = get_total_files(path=outpath, train_percentage=0.8)
print("total files = ",total_files)

total files =  215


Use all previous functions to load dataset from hardrive to memory

In [5]:
def build_datasets(train_percentage=0.8, preproc=False):
    if (preproc):
        path = outpath
    else:
        path = datasetDir
        
    class_names = get_class_names(path=path)
    print("class_names = ",class_names)

    total_files, total_train, total_test = get_total_files(path=path, train_percentage=train_percentage)
    print("total files = ",total_files)

    nb_classes = len(class_names)

    # pre-allocate memory for speed (old method used np.concatenate, slow)
    mel_dims = get_sample_dimensions(path=path)  # Find out the 'shape' of each data file
    X_train = np.zeros((total_train, mel_dims[1], mel_dims[2], mel_dims[3]))   
    Y_train = np.zeros((total_train, nb_classes))  
    X_test = np.zeros((total_test, mel_dims[1], mel_dims[2], mel_dims[3]))  
    Y_test = np.zeros((total_test, nb_classes))  
    paths_train = []
    paths_test = []

    train_count = 0
    test_count = 0
    for idx, classname in enumerate(class_names):
        this_Y = np.array(encode_class(classname,class_names) )
        this_Y = this_Y[np.newaxis,:]
        class_files = os.listdir(path+classname)
        n_files = len(class_files)
        n_load =  n_files
        n_train = int(train_percentage * n_load)
        printevery = 100
        print("")
        for idx2, infilename in enumerate(class_files[0:n_load]):          
            audio_path = path + classname + '/' + infilename
            if (0 == idx2 % printevery):
                print('\r Loading class: {:14s} ({:2d} of {:2d} classes)'.format(classname,idx+1,nb_classes),
                       ", file ",idx2+1," of ",n_load,": ",audio_path,sep="")
            #start = timer()
            if (preproc):
              melgram = np.load(audio_path)
              sr = 44100
            else:
              aud, sr = librosa.load(audio_path, mono=mono,sr=None)
              melgram = librosa.logamplitude(librosa.feature.melspectrogram(aud, sr=sr, n_mels=96),ref_power=1.0)[np.newaxis,np.newaxis,:,:]

            melgram = melgram[:,:,:,0:mel_dims[3]]   # just in case files are differnt sizes: clip to first file size
       
            #end = timer()
            #print("time = ",end - start) 
            if (idx2 < n_train):
                # concatenate is SLOW for big datasets; use pre-allocated instead
                #X_train = np.concatenate((X_train, melgram), axis=0)  
                #Y_train = np.concatenate((Y_train, this_Y), axis=0)
                X_train[train_count,:,:] = melgram
                Y_train[train_count,:] = this_Y
                paths_train.append(audio_path)     # list-appending is still fast. (??)
                train_count += 1
            else:
                X_test[test_count,:,:] = melgram
                Y_test[test_count,:] = this_Y
                #X_test = np.concatenate((X_test, melgram), axis=0)
                #Y_test = np.concatenate((Y_test, this_Y), axis=0)
                paths_test.append(audio_path)
                test_count += 1
        print("")

    print("Shuffling order of data...")
    X_train, Y_train, paths_train = shuffle_XY_paths(X_train, Y_train, paths_train)
    X_test, Y_test, paths_test = shuffle_XY_paths(X_test, Y_test, paths_test)

    return X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr


Define model parameters 

In [6]:
def build_model(X,Y,nb_classes):
    nb_filters = 32  # number of convolutional filters to use
    pool_size = (2, 2)  # size of pooling area for max pooling
    kernel_size = (3, 3)  # convolution kernel size
    nb_layers = 2
    input_shape = (1, X.shape[2], X.shape[3])

    model = Sequential()
    model.add(Conv2D(nb_filters, kernel_size, strides=(1, 1),
                        padding='valid', input_shape=input_shape,data_format="channels_first"))
    model.add(BatchNormalization(axis=1))
    model.add(Activation('relu'))

    for layer in range(nb_layers-1):
        model.add(Conv2D(nb_filters, kernel_size, strides=(1, 1),data_format="channels_first"))
        model.add(BatchNormalization(axis=1))
        model.add(ELU(alpha=1.0))  
        model.add(MaxPooling2D(pool_size=pool_size))
        model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(24))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation("softmax"))
    return model
    

Call previous functions

In [25]:
# get the data

preprocess_dataset() #==> run to get mels
X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr = build_datasets(preproc=True)

# make the model
model = build_model(X_train,Y_train, nb_classes=len(class_names))
model.compile(loss='categorical_crossentropy',
          optimizer='adadelta',
          metrics=['accuracy'])
model.summary()

class_names =  ['High', 'Low', 'Moderate']
 class name = High           -   0, 92 files in this class
 Loading class: High           ( 1 of  3 classes), file 1 of 92: /Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/High/[0.67]excerpt1_Blessed_By_A_Broken_Heart_Show_Me_What_You_Got.wav
 Loading class: High           ( 1 of  3 classes), file 21 of 92: /Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/High/[0.74]excerpt1_Bobby Darin_Splish Splash.wav
 Loading class: High           ( 1 of  3 classes), file 41 of 92: /Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/High/[0.79]excerpt1_Ian Sylvia_Tomorrow Is A Long Time.wav
 Loading class: High           ( 1 of  3 classes), file 61 of 92: /Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/High/[0.87]excerpt1_Kelly Osbourne_Right Here.wav
 Loading class: High           ( 1 of  3 classes), file 81 of 92: /Users/KarimM/GoogleDrive/PhD/Research/IoSLDataset/Dataset_Labels/High/[0.9

__Train Model__

In [26]:
# Initialize weights using checkpoint if it exists. (Checkpointing requires h5py)
load_checkpoint = False
checkpoint_filepath = 'weights.hdf5'
if (load_checkpoint):
    print("Looking for previous weights...")
    if ( isfile(checkpoint_filepath) ):
        print ('Checkpoint file detected. Loading weights.')
        model.load_weights(checkpoint_filepath)
    else:
        print ('No checkpoint file detected.  Starting from scratch.')
else:
    print('Starting from scratch (no checkpoint)')
checkpointer = ModelCheckpoint(filepath=checkpoint_filepath, verbose=1, save_best_only=True)

# train and score the model
batch_size = 128
nb_epoch = 5
model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch,
      verbose=1, validation_data=(X_test, Y_test), callbacks=[checkpointer])
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Starting from scratch (no checkpoint)
Train on 170 samples, validate on 44 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 8.72919, saving model to weights.hdf5
Epoch 2/5

Epoch 00002: val_loss improved from 8.72919 to 8.39795, saving model to weights.hdf5
Epoch 3/5

Epoch 00003: val_loss did not improve from 8.39795
Epoch 4/5

Epoch 00004: val_loss did not improve from 8.39795
Epoch 5/5

Epoch 00005: val_loss did not improve from 8.39795
Test score: 9.89064953543923
Test accuracy: 0.38636363365433435
