In [None]:
!wget https://storage.googleapis.com/download.magenta.tensorflow.org/datasets/nsynth/nsynth-train.jsonwav.tar.gz
!tar -xvf nsynth-train.jsonwav.tar.gz

In [2]:
import torch
import json
import os
import numpy as np
from torch.utils.data import Dataset
from torch import Tensor
from torchaudio import load
import librosa
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Use the first CUDA GPU when one is visible to PyTorch; otherwise stay on CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
# Notebook-wide settings.
# batch_size: number of samples per batch handed to the DataLoader.
batch_size = 32
# json_path: JSON file that is parsed into the `targets` lookup.
json_path = "./nsynth-train/examples.json"
# data_path: directory whose entries the dataset lists and loads as audio.
data_path = "./nsynth-train/audio"

In [5]:
# Parse the metadata JSON into `targets` (looked up by file stem in the dataset).
# The context manager closes the file handle even if parsing raises; the
# explicit encoding avoids depending on the platform's default locale.
with open(json_path, "r", encoding="utf-8") as f:
    targets = json.load(f)

In [6]:
def get_mel_spectrogram(waveform, sample_rate):
    """Turn a waveform tensor into a dB-scaled mel spectrogram tensor.

    Args:
        waveform: tensor holding the audio samples; it is converted with
            ``.numpy()`` before being passed to librosa.
        sample_rate: sampling rate forwarded to librosa as ``sr``.

    Returns:
        A tensor containing the mel spectrogram (librosa default settings),
        converted from power to decibels relative to its maximum value, with
        one extra leading dimension added. The result is a detached copy that
        does not share memory with the intermediate NumPy array.
    """
    power_spec = librosa.feature.melspectrogram(y=waveform.numpy(), sr=sample_rate)
    # ref=np.max makes the loudest bin the 0 dB reference.
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    as_tensor = torch.from_numpy(db_spec)
    return as_tensor.unsqueeze(0).clone().detach()

In [7]:
class NSynthDataset(Dataset):
    """Map-style dataset yielding ``(mel_spectrogram, pitch)`` pairs.

    Args:
        root: directory containing the audio files. When omitted, the
            notebook-level ``data_path`` is used.
        labels: mapping from file stem (file name without extension) to a
            metadata dict that has a ``'pitch'`` entry. When omitted, the
            notebook-level ``targets`` is used.

    Attributes:
        data_path: directory the audio files are read from.
        targets: the label mapping used for pitch lookup.
        file_list: sorted names of the ``.wav`` files found in ``data_path``.
    """

    def __init__(self, root=None, labels=None):
        self.data_path = data_path if root is None else root
        self.targets = targets if labels is None else labels
        # os.listdir returns entries in arbitrary order and may include stray
        # non-audio entries (e.g. hidden files) that have no label. Keep only
        # .wav files and sort them so indexing is reproducible across runs.
        self.file_list = sorted(
            name
            for name in os.listdir(self.data_path)
            if name.lower().endswith('.wav')
        )

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one file and return ``(mel_spec, pitch)``.

        Raises:
            KeyError: if the file stem has no entry in the label mapping.
        """
        file = self.file_list[idx]
        # splitext strips only the final extension, so stems that themselves
        # contain dots still map to the right label key.
        file_name = os.path.splitext(file)[0]
        pitch = self.targets[file_name]['pitch']
        waveform, sample_rate = load(os.path.join(self.data_path, file))
        # Take the first entry along the two leading axes of the spectrogram
        # tensor (the axis added by get_mel_spectrogram and the next one).
        mel_spec = get_mel_spectrogram(waveform, sample_rate)[0][0]
        return mel_spec, pitch

In [8]:
# Instantiate the dataset; with no arguments it lists the directory named by
# the notebook-level `data_path`.
nsynth_dataset = NSynthDataset()

In [9]:
# Training loader. shuffle=True draws a fresh sample order every epoch, so the
# SGD loop does not see the same directory-ordered batches on each pass.
# num_workers=2 loads and featurizes samples in two background worker processes.
train_loader = torch.utils.data.DataLoader(
    nsynth_dataset, batch_size=batch_size, shuffle=True, num_workers=2
)

In [10]:
x_batch, y_batch = next(iter(train_loader))
x_batch.reshape(x_batch.shape[0], -1).shape, y_batch.shape

(torch.Size([32, 16128]), torch.Size([32]))

In [16]:
# Layer widths of the network:
#   features - length of one flattened input sample (matches the 16128 columns
#              reported by the batch-shape check above)
#   hidden   - width of both intermediate layers
#   classes  - width of the output (logit) layer
features, hidden, classes = 16128, 200, 128

In [None]:
def _init_weight(fan_in, fan_out):
    """Create a trainable (fan_in, fan_out) weight matrix on `device`.

    Values are drawn from U(-1, 1) and divided by sqrt(fan_in), so each layer
    is scaled by its own number of inputs. Gradient tracking is switched on
    only after the move to `device`, which keeps the returned tensor a leaf
    whose `.grad` is populated by `backward()`.
    """
    weight = torch.empty(fan_in, fan_out).uniform_(-1, 1) / fan_in**0.5
    return weight.to(device).requires_grad_(True)


# Previously w2 and w3 were also divided by sqrt(features); they take `hidden`
# inputs, so they are now scaled by sqrt(hidden) instead.
w1 = _init_weight(features, hidden)
w2 = _init_weight(hidden, hidden)
w3 = _init_weight(hidden, classes)

In [18]:
# Optimisation settings: passes over the training loader and the SGD step size.
epochs, lr = 20, 0.01
# Per-batch loss values; the training loop appends to this list.
history = []

In [19]:
from torch.nn.functional import cross_entropy

In [None]:
# Manual mini-batch SGD over the three weight matrices.
# The loss uses torch's cross_entropy (imported above) instead of a hand-rolled
# softmax: it is computed in a numerically stable way (no overflow from
# exp(logits)) and works for any batch length, whereas indexing with
# range(batch_size) fails on a final batch shorter than batch_size.
weights = (w1, w2, w3)
for i in range(epochs):
  for x_batch, y_batch in train_loader:
    # Flatten every sample to one row before the matrix products.
    x_batch = x_batch.reshape(x_batch.shape[0], -1).to(device)
    y_batch = y_batch.to(device)
    # NOTE: there is no non-linearity between the layers, so the model is a
    # composition of linear maps.
    hidden1 = x_batch @ w1
    hidden2 = hidden1 @ w2
    logits = hidden2 @ w3
    # Mean negative log-likelihood of the true class over the batch.
    loss = cross_entropy(logits, y_batch)
    history.append(loss.item())
    loss.backward()
    # Update in place without recording the step in the autograd graph, then
    # clear the gradients so they do not accumulate into the next batch.
    with torch.no_grad():
      for w in weights:
        w -= lr * w.grad
        w.grad.zero_()
  print(f'{i+1}: loss {history[-1]}')

In [None]:
# Loss curve: one point per recorded training batch, in recording order.
fig, ax = plt.subplots(figsize=(30, 7))
ax.plot(history)
ax.set_title('Loss by batch iterations')
ax.set_ylabel('Entropy Loss')
ax.set_xlabel('batches')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
# NOTE(review): `test_loader` is not defined in the visible part of this
# notebook; it has to be created before this cell can run - TODO confirm.
correct = 0
total = 0

# Inference only: skip building autograd graphs for the trainable weights.
with torch.no_grad():
  for x_batch, y_batch in test_loader:
    # load a batch of data (each sample flattened into a single row)
    x_batch = x_batch.reshape(x_batch.shape[0], -1).to(device)
    y_batch = y_batch.to(device)

    # Predicted class = index of the largest logit in each row.
    preds = torch.argmax(((x_batch @ w1) @ w2) @ w3, dim=1)
    # Count per-sample hits instead of averaging per-batch means, so a shorter
    # final batch is weighted by its real size.
    correct += (preds == y_batch).sum().item()
    total += y_batch.shape[0]

# An empty loader yields NaN rather than a ZeroDivisionError.
acc = correct / total if total else float('nan')
print(f'Test accuracy {acc:.3}')