#Install libraries

In [1]:
!pip install -q torchinfo
!pip install einops
!pip install datasets

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m633.5 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.3

In [2]:
from datasets import load_dataset

In [3]:
import librosa

In [4]:
import soundfile as sf

In [5]:
from torch.utils.data import Dataset, DataLoader

In [6]:
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchinfo import summary
from einops import rearrange, repeat
import os
import pickle
import random

In [7]:
import numpy as np
import matplotlib.pyplot as plt

In [8]:
import pickle
import librosa

In [65]:
from IPython.display import Audio

#Audio files pre-processing

In [None]:
!unzip /content/drive/MyDrive/Data_AudioGen/recordings.zip -d /content/Audios

Archive:  /content/drive/MyDrive/Data_AudioGen/recordings.zip
   creating: /content/Audios/recordings/
  inflating: /content/Audios/recordings/0_george_10.wav  
  inflating: /content/Audios/recordings/0_george_14.wav  
  inflating: /content/Audios/recordings/0_george_20.wav  
  inflating: /content/Audios/recordings/0_george_22.wav  
  inflating: /content/Audios/recordings/0_george_28.wav  
  inflating: /content/Audios/recordings/0_george_30.wav  
  inflating: /content/Audios/recordings/0_george_31.wav  
  inflating: /content/Audios/recordings/0_george_32.wav  
  inflating: /content/Audios/recordings/0_george_34.wav  
  inflating: /content/Audios/recordings/0_george_38.wav  
  inflating: /content/Audios/recordings/0_george_39.wav  
  inflating: /content/Audios/recordings/0_george_42.wav  
  inflating: /content/Audios/recordings/0_george_45.wav  
  inflating: /content/Audios/recordings/0_george_49.wav  
  inflating: /content/Audios/recordings/0_george_9.wav  
  inflating: /content/Audios

In [None]:
"""
1- load a file
2- pad the signal (if necessary)
3- extracting log spectrogram from signal
4- normalise spectrogram
5- save the normalised spectrogram

PreprocessingPipeline
"""


class Loader:
    """Reads an audio file from disk into a waveform array via librosa."""

    def __init__(self, sample_rate, duration, mono):
        # Kept as plain attributes: PreprocessingPipeline reads `sample_rate`
        # and `duration` to derive the expected number of samples.
        self.sample_rate, self.duration, self.mono = sample_rate, duration, mono

    def load(self, file_path):
        """Return the waveform stored at `file_path`.

        `librosa.load` is called with the configured `sr`, `duration` and
        `mono` options; it returns a `(signal, sample_rate)` pair, of which
        only the signal is kept.
        """
        signal, _ = librosa.load(file_path,
                                 sr=self.sample_rate,
                                 duration=self.duration,
                                 mono=self.mono)
        return signal


class Padder:
    """Pads an array on one side using a configurable `np.pad` mode."""

    def __init__(self, mode="constant"):
        self.mode = mode

    def left_pad(self, array, num_missing_items):
        """Return `array` with `num_missing_items` entries added in front."""
        pad_width = (num_missing_items, 0)
        return np.pad(array, pad_width, mode=self.mode)

    def right_pad(self, array, num_missing_items):
        """Return `array` with `num_missing_items` entries added at the end."""
        pad_width = (0, num_missing_items)
        return np.pad(array, pad_width, mode=self.mode)


class LogSpectrogramExtractor:
    """Turns a time-series signal into a log-amplitude (dB) spectrogram."""

    def __init__(self, frame_size, hop_length):
        self.frame_size, self.hop_length = frame_size, hop_length

    def extract(self, signal):
        """Return the dB-scaled magnitude STFT of `signal`.

        The last row along the first axis of the STFT is dropped (`[:-1]`),
        so the result has one row fewer than `librosa.stft` returns.
        """
        full_stft = librosa.stft(signal,
                                 n_fft=self.frame_size,
                                 hop_length=self.hop_length)
        magnitudes = np.abs(full_stft[:-1])
        return librosa.amplitude_to_db(magnitudes)


class MinMaxNormaliser:
    """Linearly rescales arrays into the target range ``[min_val, max_val]``."""

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        """Map `array` so its minimum lands on `self.min` and its maximum on `self.max`.

        A constant array (for example the spectrogram of pure silence) has no
        spread to rescale; instead of dividing 0 by 0 and returning NaNs,
        every element is mapped to `self.min`.
        """
        array_min = array.min()
        value_range = array.max() - array_min
        if value_range == 0:
            # `* 0.0` keeps the input's shape and container type.
            return (array - array_min) * 0.0 + self.min
        norm_array = (array - array_min) / value_range
        norm_array = norm_array * (self.max - self.min) + self.min
        return norm_array

    def denormalise(self, norm_array, original_min, original_max):
        """Invert `normalise`, given the array's original minimum and maximum.

        If the normaliser was built with `min_val == max_val` the target range
        carries no information, so every element is mapped to `original_min`
        rather than dividing by zero.
        """
        target_range = self.max - self.min
        if target_range == 0:
            return norm_array * 0.0 + original_min
        array = (norm_array - self.min) / target_range
        array = array * (original_max - original_min) + original_min
        return array


class Saver:
    """Saves extracted features (as ``.npy``) and the min/max lookup table (as pickle)."""

    def __init__(self, min_max_values_save_dir, feature_save_dir=None):
        """Configure the output directories.

        Args:
            min_max_values_save_dir: directory that receives ``min_max_values.pkl``.
            feature_save_dir: directory that receives the feature files. When
                omitted it defaults to ``<min_max_values_save_dir>/spectrograms``.
                Without this attribute `save_feature` failed with
                AttributeError.
        """
        self.min_max_values_save_dir = min_max_values_save_dir
        if feature_save_dir is None:
            feature_save_dir = os.path.join(min_max_values_save_dir, "spectrograms")
        self.feature_save_dir = feature_save_dir

    def save_feature(self, feature, file_path):
        """Store `feature` under the feature directory and return the path written."""
        save_path = self._generate_save_path(file_path)
        os.makedirs(self.feature_save_dir, exist_ok=True)
        np.save(save_path, feature)
        return save_path

    def save_min_max_values(self, min_max_values):
        """Pickle `min_max_values` to ``min_max_values.pkl`` in the configured directory."""
        os.makedirs(self.min_max_values_save_dir, exist_ok=True)
        save_path = os.path.join(self.min_max_values_save_dir, "min_max_values.pkl")
        self._save(min_max_values, save_path)

    @staticmethod
    def _save(data, save_path):
        with open(save_path, "wb") as f:
            pickle.dump(data, f)

    def _generate_save_path(self, file_path):
        # Only the base name of the source file is kept; ".npy" is appended to
        # it (so "x.wav" becomes "x.wav.npy").
        file_name = os.path.split(file_path)[1]
        save_path = os.path.join(self.feature_save_dir, file_name + ".npy")
        return save_path


class PreprocessingPipeline:
    """Walks a directory of audio files and records each file's log-spectrogram range.

    For every file found, the pipeline:
        1- loads the signal with `loader`
        2- right-pads it when it is shorter than the expected sample count
        3- extracts a log spectrogram with `extractor`
        4- stores the spectrogram's min and max in `min_max_values`

    After the walk, `min_max_values` is handed to `saver`. Normalising and
    saving the spectrograms themselves is currently disabled: `normaliser`
    is held by the pipeline but never called.
    """

    def __init__(self):
        self._loader = None
        self._num_expected_samples = None
        self.padder = None
        self.extractor = None
        self.normaliser = None
        self.saver = None
        self.min_max_values = {}

    @property
    def loader(self):
        return self._loader

    @loader.setter
    def loader(self, loader):
        # Assigning a loader also fixes the length every signal is padded to.
        self._num_expected_samples = int(loader.sample_rate * loader.duration)
        self._loader = loader

    def process(self, audio_files_dir):
        """Process every file below `audio_files_dir`, then persist the min/max table."""
        for root, _, files in os.walk(audio_files_dir):
            for filep in files:
                self._process_file(os.path.join(root, filep), filep)
                print(f"Processed file {filep}")
        self.saver.save_min_max_values(self.min_max_values)

    def _process_file(self, file_path, filep):
        # load -> (optional) pad -> log spectrogram -> remember its value range
        signal = self.loader.load(file_path)
        if self._is_padding_necessary(signal):
            signal = self._apply_padding(signal)
        feature = self.extractor.extract(signal)
        self._store_min_max_value(filep, feature.min(), feature.max())

    def _is_padding_necessary(self, signal):
        return len(signal) < self._num_expected_samples

    def _apply_padding(self, signal):
        shortfall = self._num_expected_samples - len(signal)
        return self.padder.right_pad(signal, shortfall)

    def _store_min_max_value(self, filep, min_val, max_val):
        # The key is the (hard-coded) path at which the spectrogram for this
        # file is expected to live, not the path of the audio file itself.
        save_path = '/content/drive/MyDrive/Data_AudioGen/fsdd/spectrograms/' + filep + '.npy'
        self.min_max_values[save_path] = dict(min=min_val, max=max_val)

if __name__ == "__main__":
    # STFT and loading parameters shared by the objects built below.
    FRAME_SIZE = 512
    HOP_LENGTH = 256
    DURATION = 0.74  # in seconds
    SAMPLE_RATE = 22050
    MONO = True

    # Directory that receives min_max_values.pkl; FILES_DIR is walked for audio files.
    MIN_MAX_VALUES_SAVE_DIR = "/content/drive/MyDrive/Data_AudioGen/fsdd"
    #SPECTROGRAMS_SAVE_DIR = "/content/drive/MyDrive/Data_AudioGen/fsdd/spectrograms/"
    FILES_DIR = "/content/Audios"

    # instantiate all objects
    loader = Loader(SAMPLE_RATE, DURATION, MONO)
    padder = Padder()
    log_spectrogram_extractor = LogSpectrogramExtractor(FRAME_SIZE, HOP_LENGTH)
    min_max_normaliser = MinMaxNormaliser(0, 1)
    # Only the min/max directory is passed; the spectrogram save directory is disabled.
    saver = Saver(MIN_MAX_VALUES_SAVE_DIR)  # previously: SPECTROGRAMS_SAVE_DIR, MIN_MAX_VALUES_SAVE_DIR

    # Wire the components into the pipeline. Assigning `loader` also sets the
    # expected sample count (sample_rate * duration) used for padding.
    preprocessing_pipeline = PreprocessingPipeline()
    preprocessing_pipeline.loader = loader
    preprocessing_pipeline.padder = padder
    preprocessing_pipeline.extractor = log_spectrogram_extractor
    preprocessing_pipeline.normaliser = min_max_normaliser
    preprocessing_pipeline.saver = saver

    # Walk FILES_DIR and write the per-file min/max table.
    preprocessing_pipeline.process(FILES_DIR)


Processed file 3_jackson_30.wav
Processed file 4_theo_37.wav
Processed file 1_lucas_14.wav
Processed file 5_nicolas_8.wav
Processed file 5_theo_32.wav
Processed file 5_george_28.wav
Processed file 6_theo_31.wav
Processed file 8_george_9.wav
Processed file 6_yweweler_12.wav
Processed file 4_theo_33.wav
Processed file 1_george_48.wav
Processed file 2_theo_19.wav
Processed file 1_nicolas_1.wav
Processed file 9_jackson_14.wav
Processed file 4_nicolas_26.wav
Processed file 1_theo_1.wav
Processed file 7_george_4.wav
Processed file 6_nicolas_14.wav
Processed file 1_lucas_30.wav
Processed file 0_jackson_44.wav
Processed file 4_lucas_0.wav
Processed file 9_jackson_6.wav
Processed file 5_nicolas_44.wav
Processed file 5_lucas_2.wav
Processed file 6_jackson_47.wav
Processed file 7_george_9.wav
Processed file 0_theo_13.wav
Processed file 2_yweweler_0.wav
Processed file 8_lucas_27.wav
Processed file 0_yweweler_7.wav
Processed file 9_theo_18.wav
Processed file 7_jackson_39.wav
Processed file 1_ywewel

#Code

##Data

In [None]:
def load_fsdd(spectrograms_path):
    """Load every array file under `spectrograms_path` into one stacked array.

    Each file is read with `np.load`; the arrays are stacked along a new
    leading axis and a singleton axis is inserted at position 1, so N files
    of shape (256, 64) produce an array of shape (N, 1, 256, 64).
    Files are visited in `os.walk` order.
    """
    spectrograms = [
        np.load(os.path.join(root, file_name))
        for root, _, file_names in os.walk(spectrograms_path)
        for file_name in file_names
    ]
    return np.array(spectrograms)[:, np.newaxis, :, :]

In [None]:
x_train = load_fsdd("/content/drive/MyDrive/Data_AudioGen/fsdd/spectrograms")

In [None]:
x_train = np.load("/content/drive/MyDrive/Data_AudioGen/fsdd/x_train_spectrograms.npy")

In [None]:
x_train.shape # (3000, 1, 256, 64)

(3000, 1, 256, 64)

In [None]:
np.save("/content/drive/MyDrive/Data_AudioGen/fsdd/x_train_spectrograms.npy",x_train)

In [None]:
class ImageDataset(Dataset):
    """Dataset wrapper over records that expose 'image' and 'label' keys."""

    def __init__(self, data, transform=None):
        self.data, self.transform = data, transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return `(image, label)` for record `idx`, applying `transform` to the image if one is set."""
        image, label = self.data[idx]['image'], self.data[idx]['label']
        if not self.transform:
            return image, label
        return self.transform(image), label

# Define a transform that converts PIL images to tensors and then normalises them.
# NOTE(review): 0.1307 / 0.3081 look like the usual MNIST mean/std — confirm they suit this data.
stats = ((0.1307),(0.3081))
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert PIL Image to PyTorch Tensor
    transforms.Normalize(*stats,inplace=True)  # Normalise with the mean/std from `stats`
])

# Update the datasets to use the transform
# NOTE(review): `ds` is not defined in any cell visible in this notebook (only
# `load_dataset` is imported), so this cell raises NameError on a fresh kernel.
train_dataset = ImageDataset(ds['train'], transform=transform)
valid_dataset = ImageDataset(ds['test'], transform=transform)

In [None]:
Batch_size = 64

In [None]:
# Batch the spectrogram array directly; only the training loader is shuffled.
# NOTE(review): both loaders iterate over the same `x_train`, so the reported
# "validation" loss is measured on the training data.
train_dl = DataLoader(x_train, batch_size=Batch_size, shuffle=True, num_workers=3, pin_memory=True)
valid_dl = DataLoader(x_train, batch_size=Batch_size, shuffle=False, num_workers=3, pin_memory=True)



##Check device

In [9]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_type)

def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, onto `device`.

    Lists and tuples are traversed recursively and always come back as lists;
    anything else is moved with a non-blocking `.to(device)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return list(map(lambda item: to_device(item, device), data))

class DeviceDataLoader():
    """Wraps a dataloader so that each batch is moved to `device` as it is yielded."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        # Batches are moved lazily, one at a time, as the consumer iterates.
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        # Number of batches in the wrapped loader.
        return len(self.dl)

In [10]:
device = get_default_device()
device

device(type='cpu')

In [None]:
device = get_default_device()
torch.cuda.empty_cache()
# Rebind the loaders to wrappers that move each batch onto `device`.
# NOTE: the names are reassigned in place, so re-running this cell wraps the
# already-wrapped loaders a second time.
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

##Base

In [11]:
def funcion_perdida(x, xrec, mu, logvar, alpha = 1000000):
  """VAE loss: (summed squared reconstruction error + KL term) scaled by `alpha`.

  Args:
      x: input batch.
      xrec: reconstruction of `x`.
      mu, logvar: mean and log-variance produced by the encoder.
      alpha: factor applied to the sum of both terms.

  NOTE(review): `alpha` multiplies the KL term as well as the reconstruction
  term, so it only rescales the total loss — confirm it was not meant to
  weight the reconstruction term alone.
  """
  reconstruction_error = F.mse_loss(x, xrec, reduction='sum')
  kl_divergence = -0.5 * (1 + logvar - mu.pow(2) - torch.exp(logvar)).sum()
  return (reconstruction_error + kl_divergence) * alpha

In [12]:
class Base(nn.Module):
    """Training/validation helpers for models whose call returns
    `(reconstruction, mu, log_variance)`.

    NOTE(review): the VAE defined in this notebook returns only two values
    from `forward`, so `training_step` / `validation_step` fail to unpack
    unless a subclass overrides them.
    """

    def training_step(self, images):
        """Return the loss for one training batch."""
        reconstruction, mu, log_variance = self(images)
        return funcion_perdida(images, reconstruction, mu, log_variance)

    def validation_step(self, images):
        """Return the detached loss for one validation batch."""
        reconstruction, mu, log_variance = self(images)
        loss = funcion_perdida(images, reconstruction, mu, log_variance)
        return dict(val_loss=loss.detach())

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses into a single float."""
        stacked_losses = torch.stack([output['val_loss'] for output in outputs])
        return {'val_loss': stacked_losses.mean().item()}

    def epoch_end(self, epoch, result):
        """Print the train/validation losses recorded for `epoch`."""
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}".format(epoch, result['train_loss'], result['val_loss']))

##Model

In [13]:
class Encoder(nn.Module):
  """Convolutional encoder that emits the mean and log-variance of the latent code.

  Each stage is Conv2d (padding 1) -> ReLU -> BatchNorm2d. The output of the
  last stage is flattened and fed to two linear heads of width
  `latent_space_dim`. `input_shape` is used as the number of input channels
  of the first convolution, and `d_model` must equal the flattened feature
  size.
  """

  def __init__(self,
               input_shape,
               conv_filters,
               conv_kernels,
               conv_strides,
               latent_space_dim,
               d_model):
    super().__init__()

    # Input channels of stage i are the output channels of stage i-1; zip
    # stops at the shortest sequence, so the surplus last entry is ignored.
    stage_inputs = [input_shape, *conv_filters]
    stages = [
        nn.Sequential(nn.Conv2d(n_in, n_out, kernel, stride=stride, padding=1),
                      nn.ReLU(),
                      nn.BatchNorm2d(n_out))
        for n_in, n_out, kernel, stride in zip(stage_inputs, conv_filters, conv_kernels, conv_strides)
    ]
    self.layers = nn.ModuleList(stages)

    self.bottleneck = nn.Flatten()
    self.mu = nn.Linear(d_model, latent_space_dim)
    self.log_variance = nn.Linear(d_model, latent_space_dim)

  def forward(self, x):
    """Return `(mu, log_variance)` for the batch `x`."""
    features = x
    for stage in self.layers:
        features = stage(features)
    flat = self.bottleneck(features)
    return self.mu(flat), self.log_variance(flat)


class Decoder(nn.Module):
  """Transposed-convolution decoder that maps a latent vector back to a 1-channel map.

  The latent vector is projected to `d_model` features, reshaped to
  `bottleneck_shape`, and passed through the encoder stages in reverse order
  (ConvTranspose2d -> ReLU -> BatchNorm2d), ending with a
  ConvTranspose2d -> Sigmoid output layer.

  Args:
      conv_filters, conv_kernels, conv_strides: the encoder's per-stage settings.
      latent_space_dim: size of the latent vector.
      d_model: number of features of the flattened bottleneck.
      bottleneck_shape: `(channels, height, width)` the projected vector is
          reshaped to. Defaults to `(conv_filters[-1], 8, 4)`, the layout that
          was previously hard-coded for the notebook's configuration.

  Raises:
      ValueError: if `d_model` does not match the size of `bottleneck_shape`.
  """

  def __init__(self,
                conv_filters,
                conv_kernels,
                conv_strides,
                latent_space_dim,
                d_model,
                bottleneck_shape=None):
    super(Decoder, self).__init__()

    if bottleneck_shape is None:
        bottleneck_shape = (conv_filters[-1], 8, 4)
    channels, height, width = bottleneck_shape
    if channels * height * width != d_model:
        # A mismatch would otherwise make `view` fail, or silently fold
        # features into the batch dimension.
        raise ValueError(
            f"d_model={d_model} does not match bottleneck_shape={tuple(bottleneck_shape)}")
    self.bottleneck_shape = (channels, height, width)

    self.return_layer = nn.Linear(latent_space_dim, d_model)

    self.layers = nn.ModuleList()
    for index in reversed(range(1, len(conv_filters))):
        index_output = index - 1
        self.layers.append(nn.Sequential(
            nn.ConvTranspose2d(conv_filters[index], conv_filters[index_output], conv_kernels[index],
                               stride=conv_strides[index], padding=1,
                               output_padding=self._output_padding(conv_strides[index])),
            nn.ReLU(),
            nn.BatchNorm2d(conv_filters[index_output])))

    self.output_layer = nn.Sequential(
        nn.ConvTranspose2d(conv_filters[0], 1, conv_kernels[0], stride=conv_strides[0], padding=1,
                           output_padding=self._output_padding(conv_strides[0])),
        nn.Sigmoid())

  @staticmethod
  def _output_padding(stride):
    """Return `stride - 1` per dimension.

    With kernel 3 and padding 1 this makes a transposed convolution scale
    each dimension by exactly its stride: stride 2 -> 1, stride (2, 1) -> (1, 0).
    """
    if isinstance(stride, int):
        return stride - 1
    return tuple(s - 1 for s in stride)

  def forward(self, x):
    """Decode a batch of latent vectors `x` into reconstructions."""
    x = self.return_layer(x)
    x = x.view(-1, *self.bottleneck_shape)
    for convT_layer in self.layers:
        x = convT_layer(x)
    return self.output_layer(x)

class VAE(Base):
  """Variational autoencoder built from `Encoder` and `Decoder`.

  `forward` returns `(reconstruction, z)`. The inherited `Base` training and
  validation steps unpack three values from the model call, which does not
  match that return value, so both steps are overridden here to compute the
  loss from the encoder statistics directly.
  """

  def __init__(self,
                input_shape,
                conv_filters,
                conv_kernels,
                conv_strides,
                latent_space_dim,
                d_model):
    super(VAE, self).__init__()
    self.encoder_step = Encoder(input_shape, conv_filters, conv_kernels, conv_strides, latent_space_dim, d_model)
    self.decoder_step = Decoder(conv_filters, conv_kernels, conv_strides, latent_space_dim, d_model)

  @staticmethod
  def _reparameterize(mu, logvar):
    """Sample z = mu + eps * std with eps ~ N(0, I) (reparameterisation trick)."""
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mu + eps * std

  def _encode_sample_decode(self, x):
    """Run the full pass once and return `(xrec, z, mu, logvar)`."""
    mu, logvar = self.encoder_step(x)
    z = self._reparameterize(mu, logvar)
    return self.decoder_step(z), z, mu, logvar

  def proyectar(self, x):
    """Project `x` into the latent space and return one sampled latent vector per item."""
    mu, logvar = self.encoder_step(x)
    return self._reparameterize(mu, logvar)

  def forward(self, x):
    """Return `(xrec, z)`: the reconstruction and the sampled latent code."""
    xrec, z, _, _ = self._encode_sample_decode(x)
    return xrec, z

  def training_step(self, images):
    """Return the loss for one training batch."""
    xrec, _, mu, logvar = self._encode_sample_decode(images)
    return funcion_perdida(images, xrec, mu, logvar)

  def validation_step(self, images):
    """Return the detached loss for one validation batch."""
    xrec, _, mu, logvar = self._encode_sample_decode(images)
    loss = funcion_perdida(images, xrec, mu, logvar)
    return {'val_loss': loss.detach()}

In [14]:
# Model hyper-parameters; each tuple holds one entry per convolutional stage.
in_chanel= 1  # passed to the encoder as the number of input channels
out_channels = (512, 256, 128, 64, 32)
Kernels = (3, 3, 3, 3, 3)
strides = (2, 2, 2, 2, (2,1))
letent_space = 128
d_model = 1024 # 32*8*4, the size of the (32, 8, 4) feature map the Decoder reshapes to
model_ac = to_device(VAE(in_chanel, out_channels, Kernels, strides, letent_space, d_model), device)

##Training

In [None]:
@torch.no_grad()
def evaluate(model_ac, val_loader):
    """Switch the model to eval mode and aggregate its validation steps over `val_loader`.

    Runs with gradient tracking disabled. The model is left in eval mode.
    """
    model_ac.eval()
    step_outputs = list(map(model_ac.validation_step, val_loader))
    return model_ac.validation_epoch_end(step_outputs)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group (None if there is none)."""
    first_group = next(iter(optimizer.param_groups), None)
    return None if first_group is None else first_group['lr']
#metrics=[self._calculate_reconstruction_loss, self._calculate_kl_loss]
def try_model(epochs, lr, model, train_loader, val_loader, weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    """Train `model` with a one-cycle learning-rate schedule and return the per-epoch history.

    Args:
        epochs: number of passes over `train_loader`.
        lr: maximum learning rate of the OneCycleLR schedule.
        model: module exposing `training_step` and `epoch_end`.
        train_loader, val_loader: iterables of batches; `train_loader` must support `len`.
        weight_decay: forwarded to the optimizer.
        grad_clip: if truthy, gradients are clipped element-wise to this value.
        opt_func: optimizer constructor.

    Returns:
        A list with one dict per epoch holding 'val_loss' and 'train_loss'.
    """
    torch.cuda.empty_cache()
    history = []

    optimizer = opt_func(model.parameters(), lr, weight_decay=weight_decay)

    # The schedule is stepped once per batch, hence steps_per_epoch.
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, lr, epochs=epochs, steps_per_epoch=len(train_loader))

    for epoch in range(epochs):

        model.train()
        train_losses = []

        for batch in train_loader:

            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()

            if grad_clip:
                torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()
            sched.step()

            # Keep only the value: storing the attached tensor would retain a
            # reference to every batch's autograd graph until the epoch ends.
            train_losses.append(loss.detach())

        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [None]:
# Earlier configuration, kept for reference:
#epochs = 100
#max_lr = 0.001
#grad_clip = 0.0001
#weight_decay = 1e-4
#opt_func = torch.optim.Adam

# Active training configuration (passed to try_model).
epochs = 150
max_lr = 0.01  # peak learning rate of the one-cycle schedule
grad_clip = 0.0001  # element-wise gradient clipping threshold
weight_decay = 1e-4
opt_func = torch.optim.Adam

In [None]:
history = [evaluate(model_ac, train_dl)]
history

  return F.conv_transpose2d(


[{'val_loss': 240865705984.0}]

In [None]:
history += try_model(epochs, max_lr, model_ac, train_dl, valid_dl,weight_decay, grad_clip,
                             opt_func=opt_func)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [0], train_loss: 66046369792.0000, val_loss: 24269238272.0000
Epoch [1], train_loss: 12746536960.0000, val_loss: 11839048704.0000
Epoch [2], train_loss: 9165429760.0000, val_loss: 9074173952.0000
Epoch [3], train_loss: 8429403136.0000, val_loss: 7821793792.0000
Epoch [4], train_loss: 7814683648.0000, val_loss: 7784536576.0000
Epoch [5], train_loss: 7669081600.0000, val_loss: 7440054272.0000
Epoch [6], train_loss: 7470108160.0000, val_loss: 7399279104.0000
Epoch [7], train_loss: 7346086912.0000, val_loss: 6845946368.0000
Epoch [8], train_loss: 7770352128.0000, val_loss: 7603406848.0000
Epoch [9], train_loss: 7534089216.0000, val_loss: 8174123520.0000
Epoch [10], train_loss: 6687391744.0000, val_loss: 6533010432.0000
Epoch [11], train_loss: 6614190592.0000, val_loss: 6404699136.0000
Epoch [12], train_loss: 6505620992.0000, val_loss: 6739302912.0000
Epoch [13], train_loss: 6573998080.0000, val_loss: 7191192576.0000
Epoch [14], train_loss: 6321093632.0000, val_loss: 6195986432.0000
E

In [None]:
save_path = "/content/drive/MyDrive/Data_AudioGen/weightsVAE_spectrogram.pth"
torch.save(model_ac.state_dict(), save_path)

##Analysis

In [170]:
save_path = "/content/drive/MyDrive/Data_AudioGen/weightsVAE_spectrogram.pth"
weights = torch.load(save_path,map_location=torch.device('cpu'))

In [171]:
class MinMaxNormaliser:
    """Linearly rescales arrays into the target range ``[min_val, max_val]``."""

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        """Map `array` so its minimum lands on `self.min` and its maximum on `self.max`.

        A constant array (for example the spectrogram of pure silence) has no
        spread to rescale; instead of dividing 0 by 0 and returning NaNs,
        every element is mapped to `self.min`.
        """
        array_min = array.min()
        value_range = array.max() - array_min
        if value_range == 0:
            # `* 0.0` keeps the input's shape and container type.
            return (array - array_min) * 0.0 + self.min
        norm_array = (array - array_min) / value_range
        norm_array = norm_array * (self.max - self.min) + self.min
        return norm_array

    def denormalise(self, norm_array, original_min, original_max):
        """Invert `normalise`, given the array's original minimum and maximum.

        If the normaliser was built with `min_val == max_val` the target range
        carries no information, so every element is mapped to `original_min`
        rather than dividing by zero.
        """
        target_range = self.max - self.min
        if target_range == 0:
            return norm_array * 0.0 + original_min
        array = (norm_array - self.min) / target_range
        array = array * (original_max - original_min) + original_min
        return array

In [172]:
class SoundGenerator:
    """Turns spectrograms into audio signals, optionally passing them through a VAE first."""

    def __init__(self, vae, hop_length):
        self.vae = vae
        self.hop_length = hop_length
        self._min_max_normaliser = MinMaxNormaliser(0, 1)

    def generate(self, spectrograms, min_max_values):
        """Reconstruct `spectrograms` with the VAE and convert the result to audio.

        The VAE is run in eval mode with gradients disabled, so layers such as
        BatchNorm use their learned running statistics instead of the
        statistics of this (small) batch. The previous training mode is
        restored afterwards.

        Returns:
            `(signals, latent_representations)`.
        """
        was_training = getattr(self.vae, "training", False)
        if hasattr(self.vae, "eval"):
            self.vae.eval()
        try:
            with torch.no_grad():
                generated_spectrograms, latent_representations = self.vae(spectrograms)
        finally:
            if was_training:
                self.vae.train()
        signals = self.convert_spectrograms_to_audio(generated_spectrograms, min_max_values)
        return signals, latent_representations

    def convert_spectrograms_to_audio(self, spectrograms, min_max_values):
        """Convert each normalised log spectrogram back into a time-domain signal.

        `min_max_values` supplies, per spectrogram, the dict with the "min"
        and "max" used to undo the normalisation.
        """
        signals = []
        for spectrogram, min_max_value in zip(spectrograms, min_max_values):
            # detach/cpu make the conversion work for tensors that track
            # gradients or live on another device.
            log_spectrogram = spectrogram.detach().cpu().numpy().reshape([256,64])
            denorm_log_spec = self._min_max_normaliser.denormalise(log_spectrogram, min_max_value["min"], min_max_value["max"])
            spec = librosa.db_to_amplitude(denorm_log_spec)
            signal = librosa.istft(spec, hop_length=self.hop_length)
            signals.append(signal)
        return signals

In [173]:
HOP_LENGTH = 256
SAVE_DIR_ORIGINAL = "/content/drive/MyDrive/Data_AudioGen/original"
SAVE_DIR_GENERATED = "/content/drive/MyDrive/Data_AudioGen/generated"
MIN_MAX_VALUES_PATH = "/content/drive/MyDrive/Data_AudioGen/fsdd/min_max_values.pkl"
SPECTROGRAMS_PATH = "/content/drive/MyDrive/Data_AudioGen/fsdd/spectrograms"

def load_fsdd(spectrograms_path):
    """Load every array file under `spectrograms_path` and remember where each came from.

    Returns:
        `(x_train, file_paths)`. `x_train` stacks the loaded arrays along a
        new leading axis with a singleton axis inserted at position 1, so N
        files of shape (256, 64) give (N, 1, 256, 64). `file_paths[i]` is the
        path of the file behind `x_train[i]`.
    """
    file_paths = [
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(spectrograms_path)
        for file_name in file_names
    ]
    x_train = np.array([np.load(file_path) for file_path in file_paths])
    return x_train[:, np.newaxis, :, :], file_paths


def select_spectrograms(spectrograms,
                        file_paths,
                        min_max_values,
                        num_spectrograms=2):
    """Randomly pick `num_spectrograms` spectrograms together with their min/max entries.

    Indexes are drawn with `np.random.choice` using its default settings.
    `min_max_values` is looked up by the file path of each picked spectrogram.
    """
    picked = np.random.choice(range(len(spectrograms)), num_spectrograms)
    picked_min_max = [min_max_values[file_paths[index]] for index in picked]
    return spectrograms[picked], picked_min_max


def save_signals(signals, save_dir, sample_rate=22050):
    """Write each signal to `<save_dir>/<position>.wav` at `sample_rate`."""
    for position, waveform in enumerate(signals):
        target = os.path.join(save_dir, f"{position}.wav")
        sf.write(target, waveform, sample_rate)


In [174]:
model_ac.load_state_dict(weights)

<All keys matched successfully>

##Generating audios

In [175]:
sound_generator = SoundGenerator(model_ac, HOP_LENGTH)

In [176]:
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(MIN_MAX_VALUES_PATH, "rb") as f:
        min_max_values = pickle.load(f)
specs, file_paths = load_fsdd(SPECTROGRAMS_PATH)
sampled_specs, sampled_min_max_values = select_spectrograms(specs, file_paths, min_max_values, 5)
# shuffle must stay False: `sampled_min_max_values` is aligned with the order of
# the sampled spectrograms, and a shuffled loader would return a different
# permutation on every `next(iter(...))`, pairing spectrograms with the wrong
# min/max values and originals with the wrong reconstructions.
sampled_specs = DataLoader(sampled_specs, batch_size=5, shuffle=False, num_workers=3, pin_memory=True)

# Fetch the batch once so the generated and the original signals come from the same data.
sampled_batch = next(iter(sampled_specs))
with torch.no_grad():
  signals, latent_space = sound_generator.generate(sampled_batch, sampled_min_max_values)
  original_signals = sound_generator.convert_spectrograms_to_audio(sampled_batch, sampled_min_max_values)
save_signals(signals, SAVE_DIR_GENERATED)
save_signals(original_signals, SAVE_DIR_ORIGINAL)

In [177]:
print("Original: sample 0")
Audio(original_signals[0], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 0


In [178]:
print("Generated: sample 0")
Audio(signals[0], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 0


In [179]:
print("Original: sample 1")
Audio(original_signals[1], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 1


In [180]:
print("Generated: sample 1")
Audio(signals[1], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 1


In [181]:
print("Original: sample 2")
Audio(original_signals[2], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 2


In [182]:
print("Generated: sample 2")
Audio(signals[2], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 2


In [183]:
print("Original: sample 3")
Audio(original_signals[3], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 3


In [184]:
print("Generated: sample 3")
Audio(signals[3], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 3


In [185]:
print("Original: sample 4")
Audio(original_signals[4], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 4


In [186]:
print("Generated: sample 4")
Audio(signals[4], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 4


##Using the latent space for the generation of samples

In [187]:
with torch.no_grad():
  _, latent_space = sound_generator.generate(next(iter(sampled_specs)), sampled_min_max_values)

In [188]:
original_signals = sound_generator.convert_spectrograms_to_audio(next(iter(sampled_specs)), sampled_min_max_values)

In [189]:
def generate_random_numbers(min_value, max_value, std_value, count):
    """Draw `count` normal samples centred midway between the bounds and clip them to the bounds.

    Samples come from `np.random.normal` with standard deviation `std_value`
    and are clipped to `[min_value, max_value]` with `np.clip`.

    NOTE(review): the calling cell invokes `.to(torch.float32)` on the result,
    which suggests a tensor comes back when the bounds are torch tensors —
    confirm before relying on the return type.
    """
    midpoint = (min_value + max_value) / 2
    samples = np.random.normal(loc=midpoint, scale=std_value, size=count)
    return np.clip(samples, min_value, max_value)

# Build one random latent vector per encoded sample, drawn from a clipped
# normal that spans that sample's own latent value range.
random_latent = []
for i in range(len(latent_space)):
  sample_latent = latent_space[i]
  # Plain floats keep NumPy from having to interpret torch tensors.
  min_value = sample_latent.min().item()
  max_value = sample_latent.max().item()
  std_value = sample_latent.std().item()
  count = sample_latent.numel()
  random_numbers = generate_random_numbers(min_value, max_value, std_value, count)
  # Explicit conversion: does not depend on np.clip handing back a tensor.
  random_latent.append(torch.as_tensor(random_numbers, dtype=torch.float32))

In [190]:
random_latent = torch.stack(random_latent)

In [191]:
# Decode in eval mode so BatchNorm uses its learned running statistics rather
# than the statistics of this 5-sample batch.
model_ac.eval()
with torch.no_grad():
  signals_random = model_ac.decoder_step(random_latent)

In [192]:
signals_rand = sound_generator.convert_spectrograms_to_audio(signals_random, sampled_min_max_values)

In [193]:
print("Original: sample 0")
Audio(original_signals[0], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 0


In [194]:
print("Generated: sample 0")
Audio(signals_rand[0], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 0


In [195]:
print("Original: sample 1")
Audio(original_signals[1], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 1


In [196]:
print("Generated: sample 1")
Audio(signals_rand[1], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 1


In [197]:
print("Original: sample 2")
Audio(original_signals[2], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 2


In [198]:
print("Generated: sample 2")
Audio(signals_rand[2], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 2


In [199]:
print("Original: sample 3")
Audio(original_signals[3], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 3


In [200]:
print("Generated: sample 3")
Audio(signals_rand[3], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 3


In [201]:
print("Original: sample 4")
Audio(original_signals[4], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Original: sample 4


In [202]:
print("Generated: sample 4")
Audio(signals_rand[4], rate=22050)  # 22050 Hz: the rate the audio was loaded and saved at

Generated: sample 4
