In [47]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [48]:
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

# Apply seaborn's "white" style, then expose the colours of matplotlib's active
# property cycle both as a list and as an endless iterator.
sns.set_theme(palette=None, style="white")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

In [49]:
import sys
sys.path.append('../src')
from data.audio_dataset import AudioSpectrogramDataset
from models.Generator3 import Generator3
from models.Discriminator3 import Discriminator3
from models.PGAN_model.PGenerator2 import PGenerator2
from models.PGAN_model.PGenerator import PGenerator

import torch
import pytorch_lightning as pl
import librosa
import soundfile as sf
from torch.utils.data import DataLoader

In [50]:
# --- Hyperparameters ---------------------------------------------------------
# Currently disabled: z_dim = 100, learning_rate = 0.0002, beta1 = 0.5
latent_dim = 32
num_epochs = 50
batch_size = 32

# --- Data --------------------------------------------------------------------
# Disabled alternative source: "../data/raw/NSynth/audio"
base_directory = "../data/raw/Bass"

# Dataset built with spectro_type='mel', served in shuffled mini-batches.
dataset = AudioSpectrogramDataset(spectro_type='mel', base_directory=base_directory)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [51]:
# Pull a single batch from the loader and show its size.
first_batch = next(iter(dataloader))
print(first_batch.shape)

torch.Size([32, 1, 256, 160])


In [55]:
# Smoke test for PGenerator2: add three blocks of decreasing depth, then push
# one random latent vector through and report the output size.
p_latent_dim = 100
dummy_latent_vector = torch.randn(1, p_latent_dim)
generator = PGenerator2()
for block_depth in (256, 128, 64):
    generator.add_next_block(new_depth=block_depth)
dummy_output = generator(dummy_latent_vector)
print(dummy_output.size())

torch.Size([1, 10240])
torch.Size([1, 256, 8, 5])
block number: 0 with shape input: torch.Size([1, 256, 8, 5])
block number: 1 with shape input: torch.Size([1, 256, 16, 10])
block number: 2 with shape input: torch.Size([1, 128, 32, 20])
first block output before RGB shape: torch.Size([1, 64, 64, 40])
first block output after RGB shape: torch.Size([1, 1, 64, 40])
torch.Size([1, 1, 64, 40])


In [36]:
# Print the generator's repr to inspect its registered submodules.
print(generator)

PGenerator2(
  (blocks): ModuleList(
    (0): ModuleList(
      (0): Sequential(
        (0): Upsample(scale_factor=2.0, mode=nearest)
        (1): ConvTranspose2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (2): LeakyReLU(negative_slope=0.2)
        (3): ConvTranspose2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (4): LeakyReLU(negative_slope=0.2)
      )
      (1): Sequential(
        (0): Conv2d(256, 1, kernel_size=(1, 1), stride=(1, 1))
        (1): Upsample(scale_factor=2.0, mode=nearest)
      )
    )
  )
  (normalizationLayer): NormalizationLayer()
  (l1): Sequential(
    (0): Linear(in_features=100, out_features=10240, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
  )
  (base_block): ModuleList(
    (0): Sequential(
      (0): ConvTranspose2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): LeakyReLU(negative_slope=0.2)
    )
    (1): Sequential(
      (0): Conv2d(256, 1, kernel_size=(1, 1), stride=(1,

In [7]:
# Shape check for Generator3 on a single random latent vector.
dummy_latent_vector = torch.randn(1, latent_dim)
generator = Generator3(
    latent_dim=latent_dim,
    transformation_type='mel',
    height=256,
    width=172,
)
dummy_output = generator(dummy_latent_vector)
print(dummy_output.size())

torch.Size([1, 256, 16, 10])
torch.Size([1, 128, 33, 21])
torch.Size([1, 64, 67, 43])
torch.Size([1, 32, 134, 86])
torch.Size([1, 16, 268, 172])
torch.Size([1, 1, 256, 173])
torch.Size([1, 1, 256, 173])


In [16]:
# Run the existing `dummy_output` through Discriminator3 and report the result's size.
discriminator = Discriminator3(
    transformation_type='mel',
    height=256,
    width=173,
)
print(f'dummy shape: {dummy_output.shape}')
discriminator_output = discriminator(dummy_output)
print(discriminator_output.size())

dummy shape: torch.Size([1, 1, 256, 173])
torch.Size([1, 16, 268, 172])
torch.Size([1, 32, 134, 86])
torch.Size([1, 64, 67, 43])
torch.Size([1, 128, 33, 21])
torch.Size([1, 256, 16, 10])
torch.Size([1, 40960])
torch.Size([1, 1])
torch.Size([1, 1])


In [28]:
import torch.nn as nn

class PGenerator(nn.Module):
    """Minimal generator: linear projection, one deconv block, 1x1 output conv.

    The latent vector is projected to a ``256 x h_init x w_init`` feature map,
    passed through one stride-1 transposed-convolution block (kernel 3,
    padding 1, so the spatial size is unchanged) and reduced to a single
    ``tanh``-activated channel.

    Args:
        h_init: Height of the initial feature map.
        w_init: Width of the initial feature map.
        latent_dim: Length of the latent input vector.
    """

    def __init__(self, h_init=8, w_init=5, latent_dim=100):
        super().__init__()

        # Geometry of the starting feature map and size of the latent input.
        self.h_init, self.w_init = h_init, w_init
        self.latent_dim = latent_dim

        # Convolution settings shared by the deconv block.
        self.kernel_size = 3
        self.stride = 1
        self.padding = 1

        # Registered for later use; forward() does not touch these.
        self.LeakyRelu = nn.LeakyReLU(0.2)
        self.scaleLayers = nn.ModuleList()
        self.toRGBLayers = nn.ModuleList()

        self.initFirstLinearLayer()

        # Latent vector -> flattened (256, h_init, w_init) feature map.
        flat_features = 256 * self.h_init * self.w_init
        self.l1 = nn.Sequential(nn.Linear(self.latent_dim, flat_features))

        # Stages applied in order by forward(): one deconv block followed by
        # the head that produces the single-channel image.
        self.deconv_blocks = nn.ModuleList([
            self._block(
                256,
                256,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
            ),
            nn.Sequential(
                nn.Conv2d(256, 1, kernel_size=(1, 1)),
                nn.Tanh(),
            ),
        ])

    def initFirstLinearLayer(self):
        """Placeholder hook; currently a no-op."""

    def _block(self, in_channels, out_channels, **kwargs):
        """Build a ConvTranspose2d -> BatchNorm2d -> LeakyReLU(0.2) stage.

        Extra keyword arguments are forwarded to ``nn.ConvTranspose2d``.
        """
        deconv = nn.ConvTranspose2d(in_channels, out_channels, **kwargs)
        # NOTE(review): the second positional argument of BatchNorm2d is `eps`,
        # so 0.8 is an epsilon here, not a momentum - confirm this is intended.
        norm = nn.BatchNorm2d(out_channels, eps=0.8)
        activation = nn.LeakyReLU(0.2, inplace=True)
        return nn.Sequential(deconv, norm, activation)

    def forward(self, z):
        """Map latent vectors ``z`` to images, printing the size after each stage."""
        projected = self.l1(z)
        feature_map = projected.view(projected.size(0), 256, self.h_init, self.w_init)

        print(feature_map.size())

        for stage in self.deconv_blocks:
            feature_map = stage(feature_map)
            print(feature_map.size())

        return feature_map


In [29]:
# Shape check: one random 100-d latent vector through PGenerator(8, 5).
dummy_latent_vector = torch.randn(1, 100)
generator = PGenerator(8, 5, latent_dim=100)
dummy_output = generator(dummy_latent_vector)
print(dummy_output.size())

torch.Size([1, 256, 8, 5])
torch.Size([1, 256, 8, 5])
torch.Size([1, 1, 8, 5])
torch.Size([1, 1, 8, 5])


In [30]:
import torch.nn as nn
import torch

class NormalizationLayer(nn.Module):
    """Scale the input so its root-mean-square over dimension 1 is one.

    Each element is divided by ``sqrt(mean(x ** 2, dim=1) + epsilon)``, where
    the mean is taken over dimension 1 and broadcast back to the input shape.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, epsilon=1e-8):
        """Return ``x`` divided by its RMS along dimension 1.

        Args:
            x: Input tensor with at least two dimensions.
            epsilon: Added under the square root to avoid division by zero.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Use the caller-supplied epsilon; it was previously ignored in favour
        # of a hard-coded 1e-8.
        mean_square = torch.mean(x ** 2, dim=1, keepdim=True)
        return x / torch.sqrt(mean_square + epsilon)


class PGenerator(nn.Module):
    """Progressive-growing style generator; only the base scale is implemented.

    A latent vector is projected by a linear layer to an
    ``init_depth x H x W`` feature map (``H, W`` taken from ``init_size``),
    passed through the scale-0 transposed convolution(s) and converted to
    ``output_depth`` channels by the last "toRGB" 1x1 convolution.

    Args:
        init_depth: Number of channels at the lowest resolution.
        init_size: Base feature-map size, either ``(height, width)`` or a
            single int for a square map.
        latent_dim: Length of the latent input vector.
        output_depth: Number of output channels.
        init_scale: Stored on the instance; not used by this class.
        LReLU_negative_slope: Negative slope of the LeakyReLU activation.
        toRGBActivation: Optional callable applied to the toRGB output;
            ``None`` means identity.
        normalization: If true, a ``NormalizationLayer`` (which must be
            defined in the enclosing scope) is applied after the scale-0
            activation(s).
    """

    def __init__(self,
                 init_depth=256,
                 init_size=(8, 5),
                 latent_dim=100,
                 output_depth=1,
                 init_scale=40,
                 LReLU_negative_slope=0.2,
                 toRGBActivation=None,
                 normalization=True):
        super().__init__()

        self.init_depth = init_depth  # number of channels at the lowest resolution
        self.output_depth = output_depth  # number of output channels
        self.init_scale = init_scale  # original note: "8x5, spectrogram size at the start"
        self.init_size = init_size
        self.latent_dim = latent_dim
        self.LReLU_negative_slope = LReLU_negative_slope  # LeakyReLU parameter
        self.kernel_size = 3
        self.padding = 1
        self.normalization = normalization
        # Activation for the toRGB layer; identity is used when this is None.
        self.toRGBActivation = toRGBActivation

        self.depths = [init_depth]

        self.LeakyRelu = nn.LeakyReLU(negative_slope=self.LReLU_negative_slope)
        # Never populated by this class; forward() covers the base scale only.
        self.scaleLayers = nn.ModuleList()
        self.toRGBLayers = nn.ModuleList()

        self.normalizationLayer = None
        if normalization:
            self.normalizationLayer = NormalizationLayer()

        self.initFirstLinearLayer()
        self.initScale0Layer()

        # Stored blending factor; forward() does not use it.
        self.alpha = 0

    def _initial_hw(self):
        """Return the base feature-map size as ``(height, width)``.

        ``init_size`` may be a two-element tuple/list or a single int, in
        which case the map is square.
        """
        if isinstance(self.init_size, (tuple, list)):
            height, width = self.init_size
            return height, width
        return self.init_size, self.init_size

    def initFirstLinearLayer(self):
        """Create ``l1``, projecting the latent vector to the flattened base map."""
        height, width = self._initial_hw()
        self.l1 = nn.Linear(self.latent_dim, self.init_depth * height * width)

    def initScale0Layer(self):
        """Create the scale-0 transposed convolution and its toRGB 1x1 convolution."""
        self.groupScale0 = nn.ModuleList()

        # Kernel 3 / padding 1 at the default stride of 1 keeps the spatial size.
        self.groupScale0.append(
            nn.ConvTranspose2d(
                in_channels=self.depths[0],
                out_channels=self.depths[0],
                kernel_size=self.kernel_size,
                padding=self.padding
            )
        )

        self.toRGBLayers.append(
            nn.Conv2d(
                in_channels=self.depths[0],
                out_channels=self.output_depth,
                kernel_size=1,
            )
        )

    def getOutputSize(self):
        """Return ``(height, width)`` as ``init_size * 2 ** len(toRGBLayers)``.

        NOTE(review): with the single toRGB layer created by this class the
        result is twice ``init_size``, while forward() at the base scale
        returns maps of size ``init_size`` - confirm which exponent is intended.
        """
        height, width = self._initial_hw()
        factor = 2 ** len(self.toRGBLayers)
        return (int(height * factor), int(width * factor))

    def _block(self, in_channels, out_channels, **kwargs):
        """Build a ConvTranspose2d -> BatchNorm2d -> LeakyReLU(0.2) stage.

        Extra keyword arguments are forwarded to ``nn.ConvTranspose2d``.
        Not used by the layers this class currently builds.
        """
        return nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, **kwargs),
            # NOTE(review): BatchNorm2d's second positional argument is `eps`,
            # so 0.8 is an epsilon here, not a momentum - confirm the intent.
            nn.BatchNorm2d(out_channels, eps=0.8),
            nn.LeakyReLU(0.2, inplace=True)
        )

    def forward(self, z):
        """Generate base-scale images from latent vectors.

        Args:
            z: Latent tensor whose last dimension is ``latent_dim``.

        Returns:
            Tensor of shape ``(batch, output_depth, H, W)`` with ``H, W``
            given by ``init_size``.
        """
        height, width = self._initial_hw()

        out = self.l1(z)
        out = out.view(out.shape[0], self.init_depth, height, width)

        # Scale 0: the previous loop iterated over `self.deconv_blocks`, which
        # this class never defines; use the layers that are actually built.
        for conv in self.groupScale0:
            out = self.LeakyRelu(conv(out))
            if self.normalizationLayer is not None:
                out = self.normalizationLayer(out)

        # Convert features to output channels with the most recent toRGB layer.
        out = self.toRGBLayers[-1](out)
        if self.toRGBActivation is not None:
            out = self.toRGBActivation(out)

        return out


In [31]:
# Keep the instance under its own name: assigning it to `PGenerator` replaces the
# class, so re-running this cell or constructing another generator would fail.
p_generator = PGenerator()
output_size = p_generator.getOutputSize()
print(output_size)

(16, 10)
