In [1]:
import sys,os,signal

import numpy as np
import musicnet

import torch
from torch.autograd import Variable
from torch.nn.functional import conv1d, mse_loss

from time import time

import matplotlib.pyplot as plt
from IPython.display import Audio

from sklearn.metrics import average_precision_score

root = './'

%matplotlib inline

In [2]:
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'   # see issue #152
os.environ['CUDA_VISIBLE_DEVICES']='2'

def worker_init(args):
    signal.signal(signal.SIGINT, signal.SIG_IGN) # ignore signals so parent can handle them
    np.random.seed(os.getpid() ^ int(time())) # approximately random seed for workers

batch_size = 100
kwargs = {'num_workers': 4, 'pin_memory': True, 'worker_init_fn': worker_init}

m = 128
k = 500 # number of frequency bins?
d = 4096 # size of windows?
window = 16384 # total number of audio samples?
stride = 512
regions = 1 + (window - d)//stride # number of output after sliding the window through x

In [3]:
train_set = musicnet.MusicNet(root=root, train=True, download=True, window=window)#, pitch_shift=5, jitter=.1)
test_set = musicnet.MusicNet(root=root, train=False, window=window, epoch_size=50000)

In [4]:
train_loader = torch.utils.data.DataLoader(dataset=train_set,batch_size=batch_size,**kwargs)
test_loader = torch.utils.data.DataLoader(dataset=test_set,batch_size=batch_size,**kwargs)

In [5]:
def create_filters(d,k,low=50,high=6000):
    x = np.linspace(0, 2*np.pi, d, endpoint=False)
    wsin = np.empty((k,1,d), dtype=np.float32)
    wcos = np.empty((k,1,d), dtype=np.float32)
    start_freq = low
    end_freq = high
    num_cycles = start_freq*d/44000.
    scaling_ind = np.log(end_freq/start_freq)/k
    window_mask = 1.0-1.0*np.cos(x)
    for ind in range(k):
        wsin[ind,0,:] = window_mask*np.sin(np.exp(ind*scaling_ind)*num_cycles*x)
        wcos[ind,0,:] = window_mask*np.cos(np.exp(ind*scaling_ind)*num_cycles*x)
    
    return wsin,wcos

In [6]:
wsin,wcos = create_filters(d,k)

with torch.cuda.device(0):
    wsin_var = Variable(torch.from_numpy(wsin).cuda(), requires_grad=False)
    wcos_var = Variable(torch.from_numpy(wcos).cuda(), requires_grad=False)
    
def init_weights(scale=0.):
    wscale = 0.
    with torch.cuda.device(0):
        beta = Variable(wscale*torch.rand([regions*k,m]).cuda(), requires_grad=True)
        betaavg = Variable(beta.data.clone(), requires_grad=False)
    return (beta,), (betaavg,) # pytorch optimizer requires iterable 

def forward(x, beta):
    zx = conv1d(x[:,None,:], wsin_var, stride=stride).pow(2) \
       + conv1d(x[:,None,:], wcos_var, stride=stride).pow(2)
    return torch.mm(torch.log(zx + musicnet.epsilon).view(x.data.size()[0],regions*k), beta)

def L(y_hat, y, beta):
    # adjust for per-frame loss
    return mse_loss(y_hat, y)*128/2.# + .01*torch.mean(beta.pow(2.))

# Training the model

In [8]:
weights, averages = init_weights()
loss_history = []
avgp_history = []

In [9]:
avg = .9998
optimizer = torch.optim.SGD(weights, lr=0.000001, momentum=.95)

In [None]:
try:
    with train_set, test_set:
        print("squre loss\tabg prec\ttime\t\tutime")
        for epoch in range(50):
            t = time()
            for i, (x,y) in enumerate(train_loader):
                optimizer.zero_grad()
                
                # making x and y into pytorch dealable format
                x = Variable(x.cuda(), requires_grad=False) 
                y = Variable(y.cuda(), requires_grad=False) 
                
                loss = L(forward(x, *weights),y , *weights)
                loss.backward()
                optimizer.step()
                
                for w, wavg in zip(weights, averages):
                    wavg.data.copy_(avg*wavg.data + (1.-avg)*w.data) # what happens if I change to equal
                    
            t1 = time()
            avgp, loss = 0.,0.
            
            # For testing
            yground = torch.FloatTensor(batch_size*len(test_loader), m) # what not do this together with loss
            yhat = torch.FloatTensor(batch_size*len(test_loader), m)
            for i, (x_test,y_test) in enumerate(test_loader):
                x_test = Variable(x.cuda(), requires_grad=False)
                y_test = Variable(y.cuda(), requires_grad=False)
                yhatvar = forward(x, *averages)
                loss += L(yhatvar, y, *averages).item()
                yground[i*batch_size:(i+1)*batch_size,:] = y.data
                yhat[i*batch_size:(i+1)*batch_size,:] = yhatvar.data
            avgp = average_precision_score(yground.numpy().flatten(),yhat.numpy().flatten())
#             avgp = average_precision_score(yground.numpy().flatten(),yhat.numpy().flatten())
            loss_history.append(loss/len(test_loader))
            avgp_history.append(avgp)
            print('{:2f}\t{:2f}\t{:2f}\t{:2f}'.format(loss_history[-1],avgp_history[-1],time()-t, time()-t1))
            
            
except KeyboardInterrupt:
    print('Graceful Exit')
else:
    print("Finsihed")

squre loss	abg prec	time		utime
0.881687	0.641822	18.219045	7.240098
0.947431	0.607850	18.517441	7.089839
0.910845	0.657853	18.128100	7.147997
0.878232	0.602347	18.180477	7.028833
0.868673	0.670943	18.066533	6.969954
0.932125	0.594659	18.324343	7.207020
0.846076	0.665725	18.407685	7.229949
0.814040	0.734852	18.334870	7.024935
0.804640	0.669667	18.265310	7.299815
0.843653	0.655125	18.106795	7.149118
0.874697	0.653268	18.679201	7.381368
0.817805	0.635781	18.035022	7.158387
0.806221	0.724821	18.162704	7.126950
0.839534	0.674344	18.264164	7.081925
0.896651	0.679866	17.713976	6.918504
0.854479	0.620459	17.747329	7.001321
0.775531	0.673540	19.449620	7.832415
0.838393	0.675592	19.785830	7.556761
0.850690	0.682092	20.263006	7.915927
0.829349	0.633185	19.937139	7.794385
0.803107	0.669999	19.892531	7.866385
0.809511	0.702401	20.054228	7.988694
0.822052	0.697231	19.946571	7.833941
0.849095	0.648538	19.844320	7.636491
0.892880	0.658949	20.018417	7.810431
0.950502	0.646905	20.282392	7.900992
0.8897