In [None]:
import glob
import math
import os

import librosa as lbr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from IPython.display import Audio
from reservoirpy.nodes import ESN
from reservoirpy.nodes import Reservoir, Ridge
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.multiclass import unique_labels
from tqdm import tqdm


In [1]:
# Wrap the example recording in an IPython audio-player object.
# NOTE(review): the saved output of this cell is a FileNotFoundError, i.e.
# ./static/song.wav was missing the last time the notebook was executed.
audio = Audio(filename="./static/song.wav")

FileNotFoundError: [Errno 2] No such file or directory: './static/song.wav'

In [None]:
# Render the audio player for the `audio` object in the cell output.
display(audio)

In [None]:
# Read the pre-rendered PNG from disk and show it on a large square figure
# with the axis decorations switched off.
im = plt.imread("./static/canary_outputs.png")

fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(im)
ax.axis('off')
plt.show()

In [None]:
# Feature-extraction settings. load_data() reads these module-level names and
# forwards each one as the same-named keyword of librosa.feature.mfcc.
win_length = 1024  # analysis window length passed to the MFCC call
n_fft = 2048  # FFT size passed to the MFCC call
hop_length = 512  # hop between frames; also used to map annotation times to frame indices
fmin = 500  # lower frequency bound passed to the MFCC call (presumably Hz -- confirm)
fmax = 8000  # upper frequency bound passed to the MFCC call (presumably Hz -- confirm)
lifter = 40  # liftering coefficient passed to the MFCC call
n_mfcc = 13  # number of MFCCs; deltas and delta-deltas are stacked on top of these


def load_data(directory, max_songs=450):
    """Load songs and their frame-level syllable labels from ``directory``.

    All ``*.wav`` and ``*.csv`` files found recursively under ``directory``
    are sorted and paired by position. For every pair, MFCCs and their first-
    and second-order deltas are computed from the audio, and each MFCC frame
    is labelled with the syllable whose annotated interval covers it. Frames
    not covered by any annotation keep the default label ``"SIL"``.

    The MFCC settings (``win_length``, ``hop_length``, ``n_fft``, ``fmin``,
    ``fmax``, ``lifter``, ``n_mfcc``) are read from module-level variables.

    Parameters
    ----------
    directory : str
        Root folder that is searched recursively for audio and annotations.
    max_songs : int, default 450
        Upper bound on the number of (audio, annotation) pairs to load.
        Values <= 0 load nothing.

    Returns
    -------
    X : list of numpy.ndarray
        One array per song, frames along the first axis and the stacked
        MFCC / delta / delta-delta features along the second.
    Y : list of list
        One list per song with one single-element ``[label]`` entry per
        frame (the layout expected by a single-column encoder).
    vocab : list
        Every label that occurs in the annotations or in ``Y``, in a
        deterministic (sorted) order.

    Notes
    -----
    Each annotation CSV must provide ``start``, ``end`` and ``syll`` columns.
    ``start``/``end`` are multiplied by the sample rate and divided by
    ``hop_length`` to obtain frame indices, so they are expected in seconds.
    """
    import warnings  # local import: only needed for the mismatch warning

    audios = sorted(glob.glob(os.path.join(directory, "**", "*.wav"), recursive=True))
    annotations = sorted(glob.glob(os.path.join(directory, "**", "*.csv"), recursive=True))

    # Files are paired purely by sorted position, so differing counts most
    # likely mean that some audio is matched with the wrong annotation.
    if len(audios) != len(annotations):
        warnings.warn(
            f"Found {len(audios)} .wav files but {len(annotations)} .csv files in "
            f"{directory!r}; files are paired by sorted position, so some pairs "
            "may be mismatched."
        )

    # Truncate up front so the progress bar total matches the real workload.
    pairs = list(zip(audios, annotations))[:max(0, max_songs)]

    X = []
    Y = []
    vocab = set()

    for audio, annotation in tqdm(pairs, total=len(pairs)):
        df = pd.read_csv(annotation)
        wav, rate = lbr.load(audio, sr=None)  # sr=None keeps the native sample rate
        x = lbr.feature.mfcc(y=wav, sr=rate,
                             win_length=win_length, hop_length=hop_length,
                             n_fft=n_fft, fmin=fmin, fmax=fmax, lifter=lifter,
                             n_mfcc=n_mfcc)
        delta = lbr.feature.delta(x, mode="wrap")
        delta2 = lbr.feature.delta(x, order=2, mode="wrap")

        # Stack the three feature groups, then transpose to (frames, features).
        X.append(np.vstack([x, delta, delta2]).T)

        n_frames = x.shape[1]
        # Independent inner lists: no aliasing between frames.
        y = [["SIL"] for _ in range(n_frames)]

        for annot in df.itertuples():
            vocab.add(annot.syll)
            # Convert the annotated interval to frame indices, clipped to the song.
            start = max(0, round(annot.start * rate / hop_length))
            end = min(n_frames, round(annot.end * rate / hop_length))
            if end > start:
                y[start:end] = [[annot.syll] for _ in range(end - start)]

        # The default label must be part of the vocabulary whenever it is
        # actually present, otherwise encoding Y against `vocab` fails.
        if any(label[0] == "SIL" for label in y):
            vocab.add("SIL")

        Y.append(y)

    # Sets of strings iterate in a session-dependent order; sorting keeps the
    # class index of every label stable from one run to the next.
    return X, Y, sorted(vocab, key=str)

# Extract features (X), per-frame labels (Y) and the label vocabulary from
# every song found under ./canary-data (at most load_data's default max_songs).
X, Y, vocab = load_data("./canary-data")

In [None]:
# One-hot encode the per-frame labels, with columns in the order of `vocab`.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was later removed, so try the new
# name first and fall back for older installations.
try:
    one_hot = OneHotEncoder(categories=[vocab], sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(categories=[vocab], sparse=False)

# The categories are fixed, so the encoder is fitted once (on the vocabulary
# itself) instead of being refitted for every song.
one_hot.fit(np.array(vocab).reshape(-1, 1))

# NOTE: this rebinds `Y` from label lists to one-hot arrays, so the cell
# cannot be re-run without reloading the data first.
Y = [one_hot.transform(np.array(y)) for y in Y]

In [None]:
# Hold out the last `n_test` songs for evaluation and train on the rest.
n_test = 10

# Fail early with a clear message: with too few songs the slices below would
# silently produce an empty training set.
assert len(X) == len(Y), "X and Y must contain the same number of songs"
assert len(X) > n_test, f"need more than {n_test} songs, got {len(X)}"

X_train, y_train = X[:-n_test], Y[:-n_test]
X_test, y_test = X[-n_test:], Y[-n_test:]

In [None]:
# Echo State Network hyperparameters.
units = 1000               # number of reservoir units
leak_rate = 0.05           # forwarded as Reservoir(lr=...)
spectral_radius = 0.5      # forwarded as Reservoir(sr=...)
inputs_scaling = 0.001     # forwarded as Reservoir(input_scaling=...)
connectivity = 0.1         # forwarded as Reservoir(rc_connectivity=...)
input_connectivity = 0.1   # forwarded as Reservoir(input_connectivity=...)
regularization = 1e-5      # forwarded as Ridge(ridge=...)
seed = 1234                # seed for the reservoir's random initialisation


# `inputs_scaling` was declared above but never handed to the reservoir, so
# the features were fed in unscaled; it is now passed as `input_scaling`.
reservoir = Reservoir(units, sr=spectral_radius,
                      lr=leak_rate, input_scaling=inputs_scaling,
                      rc_connectivity=connectivity,
                      input_connectivity=input_connectivity, seed=seed)

readout = Ridge(ridge=regularization)


# workers=-1 is presumably "use all available cores" -- confirm against the
# reservoirpy ESN documentation.
esn = ESN(reservoir=reservoir, readout=readout, workers=-1)

In [None]:
# Train the network on the training songs; the value returned by fit() is
# bound back to `esn`.
esn = esn.fit(X_train, y_train)

In [None]:
# Run the trained network on the held-out songs; `outputs` is consumed
# song by song, in step with `y_test`, by the scoring cell.
outputs = esn.run(X_test)

In [None]:
# Frame-level accuracy of every test song: decode the one-hot targets back to
# labels, take the highest-scoring class of each predicted frame, and compare.
scores = []
for song_targets, song_outputs in zip(y_test, outputs):
    true_labels = np.ravel(one_hot.inverse_transform(song_targets))

    best_columns = song_outputs.argmax(axis=1)
    predicted_labels = np.array([vocab[column] for column in best_columns])

    scores.append(accuracy_score(true_labels, predicted_labels))

In [None]:
# Show the per-song accuracies collected by the scoring loop.
scores

In [None]:
# Summarise the per-song accuracies as mean ± standard deviation.
mean_accuracy = np.mean(scores)
std_accuracy = np.std(scores)
print("Average accuracy :", f"{mean_accuracy:.4f}", "±", f"{std_accuracy:.5f}")