In [None]:
import numpy as np
import IPython.display as ipd
from librosa import load
from librosa.util import normalize
import os
import gc

import torch
from torch import nn
import torch.nn.functional as F

import math
import matplotlib.pyplot as plt

In [None]:
#@title Dataset
# Dataset configuration. The class defined further down in this cell serves
# examples of real compositions in the format the models consume.

# Rate that every real-music file is resampled to when it is loaded.
SAMPLE_RATE = 16000
# Clip duration; presumably seconds, given AUDIO_SHAPE = SAMPLE_RATE * DURATION.
DURATION = 60
# Number of samples in one clip (16000 * 60 = 960000).
AUDIO_SHAPE = SAMPLE_RATE*DURATION

# Loop-grid dimensions: how many loop slots are mixed together within one bar ...
loops_simult = 5
# ... and how many bars are played one after another to fill a clip.
loops_seq = 8

import random
import torch.nn.functional as F

class AudioDataset(torch.utils.data.Dataset):
    """Fixed-length mono audio segments read from the files of one directory.

    Every item is a float tensor of shape ``(1, segment_length)``: a random
    crop of the file when the file is long enough, otherwise the whole file
    zero-padded on the right.

    Args:
        dataset_path: Directory that contains the audio files. A trailing
            path separator is optional.
        dataset_size: Maximum number of files to use from the directory.
        segment_length: Number of samples in every returned segment.
        sampling_rate: Rate the files are resampled to when loaded.
    """

    def __init__(self, dataset_path, dataset_size, segment_length, sampling_rate):
        self.sampling_rate = sampling_rate
        self.segment_length = segment_length
        self.dataset_path = dataset_path
        # The original (misspelled) attribute name is kept as an alias so that
        # any code still reading it keeps working.
        self.datase_path = dataset_path
        # os.listdir returns entries in arbitrary order; sort first so that the
        # selected subset and the seeded shuffle below are reproducible.
        self.audio_files = sorted(os.listdir(dataset_path))[:dataset_size]
        self.dataset_size = dataset_size
        # Seeds the module-level RNG, which also drives the random crops taken
        # in __getitem__.
        random.seed(1234)
        random.shuffle(self.audio_files)

    def __getitem__(self, index):
        """Return one ``(1, segment_length)`` segment for file number ``index``."""
        filename = self.audio_files[index]
        # os.path.join is correct with or without a trailing separator on the
        # directory; plain string concatenation is not.
        full_path = os.path.join(self.dataset_path, filename)
        audio, _ = self.load_wav_to_torch(full_path)

        n_samples = audio.size(0)
        if n_samples >= self.segment_length:
            # Random crop of exactly segment_length samples.
            audio_start = random.randint(0, n_samples - self.segment_length)
            audio = audio[audio_start : audio_start + self.segment_length]
        else:
            # Too short: right-pad with zeros up to segment_length.
            audio = F.pad(audio, (0, self.segment_length - n_samples), "constant")

        return audio.unsqueeze(0)

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.audio_files)

    def load_wav_to_torch(self, full_path):
        """Load one audio file as a float tensor.

        The file is resampled to ``self.sampling_rate``, normalised with
        ``librosa.util.normalize`` and scaled by 0.95.

        Returns:
            ``(audio, sampling_rate)``: a 1-D float tensor and the rate
            reported by the loader.
        """
        data, sampling_rate = load(full_path, sr=self.sampling_rate)
        data = 0.95 * normalize(data)

        return torch.from_numpy(data).float(), sampling_rate

# A trailing path separator is safe to include here, and is required if the
# dataset class builds file paths as `path + file_name`.
DATASET_PATH = '...' # Set to the folder that contains the real-music examples

# NOTE(review): both datasets take the first N entries of the same directory
# listing, so the 32 validation files are presumably also among the 128
# training files - confirm, and use disjoint files if validation is meant to
# be held out.
train_set = AudioDataset(DATASET_PATH, 128, AUDIO_SHAPE, SAMPLE_RATE)
val_set = AudioDataset(DATASET_PATH, 32, AUDIO_SHAPE, SAMPLE_RATE)

# No shuffle=True is passed: batches follow the file order fixed when the
# dataset object was built.
batch_size = 4
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)

val_loader = torch.utils.data.DataLoader(val_set, batch_size=1)

In [None]:
# Load every loop into memory.

LOOPS_PATH = '...' # Set to the folder that contains the loops

# List the directory once, in sorted order, so that loops_dict and loops are
# guaranteed to describe the same files in the same order (os.listdir makes no
# ordering promise between calls or machines).
loop_files = sorted(os.listdir(LOOPS_PATH))

# 1-based loop id -> file name. Id 0 is not assigned to a file: the renderer
# skips it, i.e. it stands for "no loop".
loops_dict = {idx + 1: fname for idx, fname in enumerate(loop_files)}

num_loops = len(loops_dict)

# loops[k] holds the waveform of loop id k + 1.
loops = []
for fname in loop_files:
    # Load at SAMPLE_RATE explicitly: without `sr=` librosa resamples to its
    # own default of 22050 Hz, which does not match the rate used for the real
    # audio and for playback in this notebook.
    audio, sr = load(os.path.join(LOOPS_PATH, fname), sr=SAMPLE_RATE)
    loops.append(audio)

In [None]:
#@title Loops2Audio function

# Function that converts a matrix of loop ids into an audio signal

def _fit_loop_to_bar(loop_audio, bar_len):
    """Repeat ``loop_audio`` as often as needed and cut it to ``bar_len`` samples."""
    reps = -(-bar_len // loop_audio.shape[0])  # ceiling division
    return np.hstack([loop_audio] * reps)[:bar_len]


def get_audio_from_loops(input, loop_bank=None, audio_len=None, n_seq=None, n_simult=None):
    """Render a batch of loop-id grids to waveforms.

    Each batch item is reshaped to ``(n_seq, n_simult)``. Every row is one bar:
    the loops named in the row are summed, and the bars are laid out one after
    another. Id 0 means "no loop"; id ``k >= 1`` selects ``loop_bank[k - 1]``.
    A loop longer than a bar is truncated; a shorter one is repeated (the last
    repetition is cut if the lengths do not divide evenly).

    Args:
        input: Tensor whose first dimension is the batch and whose remaining
            dimensions hold ``n_seq * n_simult`` ids per item. Ids are rounded
            to the nearest integer before use.
        loop_bank: Sequence of 1-D waveforms. Defaults to the global ``loops``.
        audio_len: Samples per rendered clip. Defaults to ``AUDIO_SHAPE``.
        n_seq: Number of bars per clip. Defaults to ``loops_seq``.
        n_simult: Number of loop slots per bar. Defaults to ``loops_simult``.

    Returns:
        float32 tensor of shape ``(batch, 1, audio_len)``.

    Raises:
        IndexError: if an id is negative or larger than ``len(loop_bank)``.
    """
    if loop_bank is None:
        loop_bank = loops
    if audio_len is None:
        audio_len = AUDIO_SHAPE
    if n_seq is None:
        n_seq = loops_seq
    if n_simult is None:
        n_simult = loops_simult

    # .cpu() lets callers pass a CUDA tensor directly.
    id_grids = input.detach().cpu().numpy()

    n_items = id_grids.shape[0]
    bar_len = audio_len // n_seq

    # Zero-initialised so that any tail left over when audio_len is not a
    # multiple of n_seq is silent rather than uninitialised memory.
    songs = np.zeros([n_items, 1, audio_len])

    for item_idx, grid in enumerate(id_grids):
        # NOTE(review): this is a row-major reshape, not a transpose. If the
        # incoming layout is (n_simult, n_seq), a row here does not correspond
        # to one time step of the input - confirm this is intended.
        bars = np.reshape(grid, (n_seq, n_simult))

        for bar_idx, bar_ids in enumerate(bars):
            bar_audio = np.zeros(bar_len)

            for raw_id in bar_ids:
                # Round instead of truncating: ids arrive as floats and may be
                # off by a rounding error (e.g. 2.9999998 must mean loop 3, and
                # 1e-8 must mean "no loop", not loop_bank[-1]).
                loop_id = int(round(float(raw_id)))
                if loop_id == 0:
                    continue
                if not 1 <= loop_id <= len(loop_bank):
                    raise IndexError(
                        "loop id {} is outside 0..{}".format(loop_id, len(loop_bank))
                    )

                loop_audio = np.asarray(loop_bank[loop_id - 1])
                if loop_audio.shape[0] == 0:
                    # An empty loop contributes nothing (and cannot be tiled).
                    continue

                bar_audio += _fit_loop_to_bar(loop_audio, bar_len)

            start = bar_idx * bar_len
            songs[item_idx, 0, start:start + bar_len] = bar_audio

    return torch.from_numpy(songs).to(torch.float32)

In [None]:
#@title Utilites

from torch.nn.utils import weight_norm

# Weight initialisation for the neural network models
def weights_init(m):
    """Initialise one module in place; meant to be used with ``Module.apply``.

    * Modules whose class name contains ``"Conv"`` get weights drawn from
      N(0, 0.02).
    * Modules whose class name contains ``"BatchNorm"`` (1d, 2d, 3d) get
      weights drawn from N(1, 0.02) and zero bias. Matching only
      ``"BatchNorm2d"`` would skip the ``BatchNorm1d`` layers used by the
      models in this notebook.
    * Every other module is left untouched.

    Args:
        m: The module to initialise.
    """
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        # NOTE(review): for layers wrapped with torch.nn.utils.weight_norm the
        # effective weight is presumably rebuilt from weight_g / weight_v, so
        # this write may have no lasting effect on them - confirm.
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm") != -1:
        # weight / bias are None when the layer was built with affine=False.
        if m.weight is not None:
            m.weight.data.normal_(1.0, 0.02)
        if m.bias is not None:
            m.bias.data.fill_(0)

def WNConv1d(*args, **kwargs):
    """Build an ``nn.Conv1d`` from the given arguments and wrap it in weight normalisation."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)


def WNConvTranspose1d(*args, **kwargs):
    """Build an ``nn.ConvTranspose1d`` from the given arguments and wrap it in weight normalisation."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)

# Architecture of the residual (ResNet) block
class ResnetBlock(nn.Module):
    """Residual block: a learned 1x1 shortcut added to a two-convolution branch.

    The branch is LeakyReLU -> reflection padding -> dilated kernel-3
    convolution -> LeakyReLU -> kernel-1 convolution. Padding by ``dilation``
    on both sides keeps the sequence length unchanged.

    Args:
        dim: Number of input and output channels.
        dilation: Dilation of the kernel-3 convolution (and the padding size).
    """

    def __init__(self, dim, dilation=1):
        super().__init__()
        branch_layers = [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(dilation),
            WNConv1d(dim, dim, kernel_size=3, dilation=dilation),
            nn.LeakyReLU(0.2),
            WNConv1d(dim, dim, kernel_size=1),
        ]
        self.block = nn.Sequential(*branch_layers)
        self.shortcut = WNConv1d(dim, dim, kernel_size=1)

    def forward(self, x):
        """Return ``shortcut(x) + block(x)``."""
        skip = self.shortcut(x)
        branch = self.block(x)
        return skip + branch

In [None]:
#@title Generator
# Generator architecture

# Number of channels of the latent noise tensor fed to the generator.
nz = 100
# Base channel width of the generator's transposed convolutions.
ngf = 64
# Generator output channels; tied to the number of simultaneously played loops.
nc = loops_simult

class Generator(nn.Module):
    """Maps latent noise to a grid of (rounded) loop ids.

    Three kernel-2, stride-2 transposed convolutions each double the sequence
    length, so an input of shape ``(batch, nz, L)`` yields ``(batch, nc, 8 * L)``.
    The values are integers in ``[0, num_loops]`` in the forward pass, while
    gradients flow as if no rounding had happened.
    """

    def __init__(self):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.ConvTranspose1d(nz, ngf * 2, 2, 2),
            nn.BatchNorm1d(ngf * 2),
            nn.ReLU(True),

            nn.ConvTranspose1d(ngf * 2, ngf, 2, 2),
            nn.BatchNorm1d(ngf),
            nn.ReLU(True),

            nn.ConvTranspose1d(ngf, nc, 2, 2),
            nn.Tanh()
        )
        self.apply(weights_init)

    def forward(self, input):
        """Return rounded loop ids for the latent tensor ``input``."""
        # self.main ends in Tanh, so its output already lies in [-1, 1].
        output = self.main(input)
        # Map [-1, 1] linearly onto [0, num_loops]. Feeding the tanh output
        # through a sigmoid instead would confine the ids to roughly
        # [0.27, 0.73] * num_loops, making id 0 (silence) and the lowest and
        # highest loops unreachable.
        output = (output + 1.0) * 0.5 * num_loops

        rounded = torch.round(output)

        # Straight-through estimator: the forward value is the rounded id, the
        # backward pass sees the identity because the correction is detached.
        rounded = output + (rounded - output).detach()

        return rounded

In [None]:
#@title Loops2Audio predictor
# Architecture of the approximator
ngf2 = 4

class Loops2Audio(nn.Module):
    """Convolutional network that maps a loop-id grid to a waveform.

    A kernel-7 convolution lifts the input to ``2 ** len(ratios) * ngf2``
    channels. Each upsampling stage then halves the channel count with a
    transposed convolution whose stride is the stage's ratio, followed by
    ``n_residual_layers`` residual blocks with dilations 1, 3, 9, ... A final
    kernel-7 convolution and Tanh produce a single output channel.

    Args:
        input_size: Number of input channels.
        n_residual_layers: Residual blocks after every upsampling stage.
    """

    def __init__(self, input_size=nc, n_residual_layers=3):
        super().__init__()
        ratios = [8, 8, 5, 5, 5, 5, 3]
        # Product of all strides (8 * 8 * 5 * 5 * 5 * 5 * 3 = 120000).
        self.hop_length = np.prod(ratios)
        mult = int(2 ** len(ratios))

        layers = [
            nn.ReflectionPad1d(3),
            WNConv1d(input_size, mult * ngf2, kernel_size=7, padding=0),
        ]

        # Upsample to raw audio scale
        for stride in ratios:
            in_channels = mult * ngf2
            out_channels = in_channels // 2

            layers.append(nn.LeakyReLU(0.2))
            layers.append(
                WNConvTranspose1d(
                    in_channels,
                    out_channels,
                    kernel_size=stride * 2,
                    stride=stride,
                    padding=stride // 2 + stride % 2,
                    output_padding=stride % 2,
                )
            )

            for depth in range(n_residual_layers):
                layers.append(ResnetBlock(out_channels, dilation=3 ** depth))

            mult //= 2

        layers.extend(
            [
                nn.LeakyReLU(0.2),
                nn.ReflectionPad1d(3),
                WNConv1d(ngf2, 1, kernel_size=7, padding=0),
                nn.Tanh(),
            ]
        )

        self.model = nn.Sequential(*layers)
        self.apply(weights_init)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the result."""
        return self.model(x)

In [None]:
#@title CNN Discriminator
# Discriminator architecture

# Channel count produced by the discriminator's first convolution.
ndf = 16
# Number of strided (downsampling) convolution layers.
n_layers = 4
# Stride of every downsampling layer; the channel count is multiplied by the
# same factor per layer (capped at 1024).
downsampling_factor = 4

from torch.nn.utils import weight_norm

def WNConv1d(*args, **kwargs):
    """Build an ``nn.Conv1d`` from the given arguments and wrap it in weight normalisation.

    Same helper as the one defined in the utilities cell; repeated here so the
    discriminator cell can be run on its own.
    """
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)

class Discriminator(nn.Module):
    """Single-scale convolutional discriminator for raw audio.

    Layers, in order:

    * ``layer_0``: reflection padding + kernel-15 convolution to ``ndf`` channels.
    * ``layer_1`` .. ``layer_{n_layers}``: grouped, strided convolutions; each
      multiplies the channel count by ``downsampling_factor`` (capped at 1024)
      and shortens the sequence by the same factor.
    * ``layer_{n_layers + 1}``: kernel-5 convolution that doubles the channel
      count (capped at 1024).
    * ``layer_{n_layers + 2}``: kernel-3 convolution down to one channel.

    ``forward`` returns the output of every layer, so the last list element is
    the score map and the earlier elements are intermediate feature maps.
    """

    def __init__(self):
        super().__init__()
        model = nn.ModuleDict()

        model["layer_0"] = nn.Sequential(
            nn.ReflectionPad1d(7),
            WNConv1d(1, ndf, kernel_size=15),
            nn.LeakyReLU(0.2, True),
        )

        nf = ndf
        stride = downsampling_factor
        for n in range(1, n_layers + 1):
            nf_prev = nf
            nf = min(nf * stride, 1024)

            model["layer_%d" % n] = nn.Sequential(
                WNConv1d(
                    nf_prev,
                    nf,
                    kernel_size=stride * 10 + 1,
                    stride=stride,
                    padding=stride * 5,
                    groups=nf_prev // 4,
                ),
                nn.LeakyReLU(0.2, True),
            )

        # The next layer consumes the OUTPUT of the last strided layer, so its
        # input width is the current nf. Reusing the loop's nf_prev (that
        # layer's input width) is only correct once the width has hit the 1024
        # cap, and is undefined when n_layers == 0.
        nf_prev = nf
        nf = min(nf * 2, 1024)
        model["layer_%d" % (n_layers + 1)] = nn.Sequential(
            WNConv1d(nf_prev, nf, kernel_size=5, stride=1, padding=2),
            nn.LeakyReLU(0.2, True),
        )

        model["layer_%d" % (n_layers + 2)] = WNConv1d(
            nf, 1, kernel_size=3, stride=1, padding=1
        )

        self.model = model
        self.apply(weights_init)

    def forward(self, x):
        """Return the list of outputs of all layers, first layer first."""
        results = []
        for key, layer in self.model.items():
            x = layer(x)
            results.append(x)
        return results

In [None]:
# Instantiate the three networks on the GPU. The construction order matters for
# reproducibility: each constructor draws its initial weights from the RNG.
netG = Generator().cuda()
netL2A = Loops2Audio().cuda()
netD = Discriminator().cuda()

# One Adam optimiser per network, all with the same hyper-parameters.
optG = torch.optim.Adam(netG.parameters(), lr=0.0002, betas=(0.5, 0.999))
optL2A = torch.optim.Adam(netL2A.parameters(), lr=0.0002, betas=(0.5, 0.999))
optD = torch.optim.Adam(netD.parameters(), lr=0.0002, betas=(0.5, 0.999))

# NOTE(review): gan_loss does not appear to be used by the training cells
# visible in this notebook (they use hinge losses) - confirm before removing.
gan_loss = nn.BCELoss()
# Reconstruction loss used to fit the Loops2Audio approximator.
loss_fn = F.mse_loss



In [None]:
#@title Training Loops2Audio model
# Train the approximator: fit netL2A to reproduce get_audio_from_loops on
# random loop-id grids.

l2a_errs = []
l2a_epochs = 1000

for epoch in range(1, l2a_epochs + 1):

    # One random id grid shaped like a single generator output,
    # (1, loops_simult, loops_seq). Ids span 0..num_loops inclusive (0 = no
    # loop); randint's upper bound is exclusive, hence num_loops + 1. Deriving
    # the bounds from the loaded loops avoids indexing past the loop list when
    # the folder holds fewer loops than a hard-coded constant assumes.
    rand_ids = torch.randint(
        low=0, high=num_loops + 1, size=(1, loops_simult, loops_seq)
    ).cuda()

    audio_pred = netL2A(rand_ids.to(torch.float32))

    # Ground truth rendered on the CPU from the same ids.
    audio_g = get_audio_from_loops(rand_ids.cpu()).cuda()

    l2a_error = loss_fn(audio_pred, audio_g, reduction='sum')

    netL2A.zero_grad()
    l2a_error.backward()
    optL2A.step()

    # Plain float for logging, so no tensor outlives the iteration.
    l2a_error_value = l2a_error.item()

    # Release the large audio tensors before the next iteration.
    del audio_pred
    del audio_g
    gc.collect()
    torch.cuda.empty_cache()

    l2a_errs.append(l2a_error_value)

    if epoch % 10 == 0:
        print("epoch num {} loss_L2A = {}".format(epoch, l2a_error_value))

In [None]:
#@title Training
# Train the GAN

epochs = 5

train_losses_D = []
train_losses_G = []
val_losses_D = []
val_losses_G = []

torch.backends.cudnn.benchmark = True

for epoch in range(1, epochs + 1):

    for iterno, audio_r in enumerate(train_loader):

        audio_r = audio_r.cuda().to(torch.float32)

        # Match the size of the real batch (the last batch of an epoch can be
        # smaller than batch_size), so real and fake features line up.
        noise = torch.randn(audio_r.size(0), nz, 1).cuda()
        ids = netG(noise)
        # The fake audio has to exist before the discriminator step uses it.
        audio_pred = netL2A(ids)

        #######################
        # Train Discriminator #
        #######################
        # netD returns the outputs of all its layers; the last entry is the
        # score map, the earlier ones are intermediate features.
        D_fake_det = netD(audio_pred.detach())
        D_real = netD(audio_r)

        # Hinge loss on the score maps only.
        loss_D = F.relu(1 + D_fake_det[-1]).mean() + F.relu(1 - D_real[-1]).mean()

        netD.zero_grad()
        loss_D.backward()
        optD.step()

        ###################
        # Train Generator #
        ###################
        D_fake = netD(audio_pred)

        loss_G = -D_fake[-1].mean()

        # Feature matching: L1 distance between real and fake activations of
        # every discriminator layer except the final score map.
        feat_weights = 4.0 / (n_layers + 1)
        D_weights = 1.0
        wt = D_weights * feat_weights
        loss_feat = 0
        for feat_fake, feat_real in zip(D_fake[:-1], D_real[:-1]):
            loss_feat += wt * F.l1_loss(feat_fake, feat_real.detach())

        # Only optG steps here; the gradients this backward pass leaves on
        # netD are cleared by netD.zero_grad() in the next iteration.
        netG.zero_grad()
        (loss_G + 10 * loss_feat).backward()
        optG.step()

        # Keep plain floats so the loss history does not hold on to tensors
        # (and their autograd graphs) across iterations.
        loss_G_value = loss_G.item()
        loss_D_value = loss_D.item()

        del audio_r
        del audio_pred
        del noise
        del D_fake_det
        del D_real
        del D_fake
        gc.collect()
        torch.cuda.empty_cache()

        print("epoch num {} loss_G = {}, loss_D = {}".format(epoch, loss_G_value, loss_D_value))

        train_losses_G.append(loss_G_value)
        train_losses_D.append(loss_D_value)

In [None]:
# Save the parameters of the trained generator

torch.save(netG.state_dict(), '...') # Set to the path of the .pth file the generator parameters should be written to

In [None]:
# Load the parameters of a pre-trained generator
# NOTE(review): torch.load unpickles the file - only load checkpoints that come
# from a trusted source.

netG = Generator().cuda()
netG.load_state_dict(torch.load('...')) # Set to the path of the .pth file that holds the pre-trained generator parameters

In [None]:
# Generate music

# Sample in evaluation mode so that the generator's BatchNorm layers use their
# running statistics instead of the statistics of this single-item batch. The
# previous mode is restored afterwards so the training cell can be re-run.
netG_was_training = netG.training
netG.eval()

# No gradients are needed for sampling.
with torch.no_grad():
    noise = torch.randn(1, nz, 1).cuda()
    ids = netG(noise)
    # Exact rendering of the chosen loops.
    audio_g = get_audio_from_loops(ids.cpu()).cuda()
    # The approximator's rendering of the same ids, kept for comparison.
    audio_pred = netL2A(ids.detach()).detach()

netG.train(netG_was_training)

ipd.Audio(audio_g.squeeze().cpu(), rate=SAMPLE_RATE)