In [1]:
import numpy as np 
import os
import matplotlib.pyplot as plt
import librosa, librosa.display
import math
import json
from sklearn.model_selection import train_test_split
import keras
from keras.layers import Input, Flatten, Dense, Conv2D, BatchNormalization, LeakyReLU, Dropout, Activation
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## PREPROCESSING AUDIO BASICS

In [None]:
file = 'BR.wav'

sig, sr = librosa.load(file, sr=22050)
librosa.display.waveplot(sig, sr = sr)
plt.xlabel("Time")
plt.ylabel("Amp")
plt.show()

In [None]:
fft = np.fft.fft(sig)
mag = np.abs(fft)
freq = np.linspace(0, sr, len(mag))
hfreq = freq[:int(len(freq)/2)]
hmag = mag[:int(len(mag)/2)]
plt.plot(hfreq,hmag)
plt.xlabel("Hz")
plt.ylabel("Mag")

In [None]:
n_fft = 2048
hop_size = 512

stft = librosa.core.stft(sig, hop_length = hop_size, n_fft = n_fft)
spec = np.abs(stft)
log_spec = librosa.amplitude_to_db(spec)
librosa.display.specshow(log_spec,sr = sr, hop_length = hop_size)
plt.xlabel("Time")
plt.ylabel("Hz")
plt.colorbar()
plt.show()

In [None]:
MFCCs = librosa.feature.mfcc(sig, n_fft = 2048, hop_length = hop_size, n_mfcc = 13)
librosa.display.specshow(MFCCs,sr = sr, hop_length = hop_size)
plt.xlabel("Time")
plt.ylabel("MFCC")
plt.colorbar()
plt.show()



## PREPROCESSING MUSIC GENRE DATASET

In [2]:
DATASET_P = "genres" # using the genre dataset from http://marsyas.info/downloads/datasets.html
JSON_P = "data.json"
SAMPLE_RATE = 22050
DURATION = 30 # seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, n_segments=5):
    data = {
        "mapping": [],
        "mfcc"   : [],
        "label"  : []
    }
    
    num_samples_per_segment = int (SAMPLES_PER_TRACK / n_segments)
    expected_num_mfcc_vecs_per_segment = math.ceil(num_samples_per_segment / hop_length)
    
    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(dataset_path)):
        if dirpath is not dataset_path:
            dirpath_components = dirpath.split("/")
            semantic_label = dirpath_components[-1]
            data["mapping"].append(semantic_label)
            print("\n Processing {}".format(semantic_label))
            
            for f in filenames:
                file_path = os.path.join(dirpath, f) 
                sig, sr = librosa.load(file_path, sr=SAMPLE_RATE)
                # process segment, extract mfcc, store data
                
                for s in range(n_segments):
                    start_s = num_samples_per_segment * s
                    end_s   = start_s + num_samples_per_segment
                    mfcc = librosa.feature.mfcc(sig[start_s:end_s], sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
                    mfcc = mfcc.T
                    if len(mfcc) == expected_num_mfcc_vecs_per_segment:
                        data["mfcc"].append(mfcc.tolist())
                        data["label"].append(i-1) # first iteration is the dataset path, so we ignore it
                        print("{}, segment:{}".format(file_path,s+1))
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)
        

In [None]:
save_mfcc(DATASET_P, JSON_P, n_segments=10)

In [3]:
def load_data(dataset_p):
    with open(dataset_p, "r") as fp:
        data = json.load(fp)
    inputs = np.array(data["mfcc"])
    targets = np.array(data["label"])
    
    return inputs, targets

inputs, targets = load_data(JSON_P)

In [4]:

in_tr, in_test, t_tr, t_test = train_test_split(inputs, targets, test_size =0.3)


## BUILD AND TRAIN THE NETWORK

In [12]:
input_layer = Input((130,13))
x = Flatten()(input_layer)
x = Dense(512)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(256)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(64)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(10)(x)
x = BatchNormalization()(x)
x = Activation('softmax')(x)
output_layer = Dropout(rate=0.3)(x)
model1 = Model(input_layer, output_layer)
opt = Adam(lr=0.0001)
model1.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 130, 13)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 1690)              0         
_________________________________________________________________
dense_29 (Dense)             (None, 512)               865792    
_________________________________________________________________
batch_normalization_9 (Batch (None, 512)               2048      
_________________________________________________________________
activation_21 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 256)               131328    
_________________________________________________________________
batch_normalization_10 (Batc (None, 256)               1024      
__________

In [14]:
model1.fit(in_tr, t_tr, validation_data=(in_test, t_test),epochs=50,batch_size=32,shuffle=True)

Train on 6997 samples, validate on 2999 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x148149c50>

In [15]:
model1.evaluate(in_test, t_test, batch_size=32)



[1.446850855416479, 0.6028676225010973]

In [61]:
GENRES = np.array(['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock'])

preds = model1.predict(in_test)
preds_single = GENRES[np.argmax(preds, axis = -1)]
actual_single = GENRES[(t_test)]

In [62]:
indices = np.random.choice(range(len(in_test)), 6)
for i, idx in enumerate(indices):
    print("\npredicted: {}".format(preds_single[idx]))
    print("\n correct: {}".format(actual_single[idx]))


predicted: country

 correct: country

predicted: reggae

 correct: disco

predicted: reggae

 correct: reggae

predicted: pop

 correct: country

predicted: metal

 correct: country

predicted: jazz

 correct: jazz
