In [1]:
import auraloss
import collections
import librosa
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import plotly.graph_objects as go
import pretty_midi
import pytorch_lightning as pl
import pywt
import random
import scipy.signal
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchaudio
from torch.utils.data import DataLoader, Dataset
import wandb
from pytorch_lightning.loggers import WandbLogger
from tqdm import tqdm

# Report whether PyTorch can see a CUDA device before any GPU work is attempted.
print(torch.cuda.is_available())


True


# Set Seeds

In [2]:
# Single seed shared by every random number generator used in this notebook.
seed_value = 3407

# Python's and NumPy's global generators.
random.seed(seed_value)
np.random.seed(seed_value)

# PyTorch generators: the default one, then the current and all CUDA devices.
torch.manual_seed(seed_value)
torch.cuda.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Select the 'high' float32 matmul precision setting. Kept as the final
# statement: it returns None, so the cell produces no output.
torch.set_float32_matmul_precision('high')

# caching audio

1) Load the audio from the hard drive
2) Reduce the sample rate from 44100 Hz to 32000 Hz (primarily to save RAM)
3) Store the result in an in-memory data structure

In [3]:
# Cache every .flac clip found in `path` in memory, resampled to 32 kHz.
# `all_scenes` maps a running integer index to a dict with the clip's
# path, its resampled audio tensor and its (new) sample rate.
path = "../data/rendered_audio/rendered_audio/"
files = os.listdir(path)
all_scenes = {}
counter = 0
transform = torchaudio.transforms.Resample(44100, 32000)
# One resampler per source sample rate. The 44.1 kHz one is created up front;
# others are built on demand so that a file with a different native rate is
# not resampled as if it were 44.1 kHz.
resamplers = {44100: transform}

for file in tqdm(files):
    # Test the extension itself; a substring test would also accept names
    # such as "clip.flac.bak".
    if not file.endswith(".flac"):
        continue

    full_path = os.path.join(path, file)
    try:
        audio, source_sr = torchaudio.load(full_path)
        if source_sr not in resamplers:
            resamplers[source_sr] = torchaudio.transforms.Resample(source_sr, 32000)
        audio = resamplers[source_sr](audio)
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print("error", e)
        continue

    all_scenes[counter] = {'path': full_path, 'audio': audio, 'sr': 32000}
    counter += 1
            


 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 22268/27131 [08:25<01:24, 57.89it/s]

error Error opening '../data/rendered_audio/rendered_audio/d270f326-a3f6-4807-ac06-8716c9166ad1.flac': Format not recognised.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27131/27131 [10:06<00:00, 44.70it/s]


# torch data generator

In [4]:
class AudioDataGenerator(Dataset):
    """Dataset view over a container of cached audio records.

    Args:
        data: Indexable container (supports ``len`` and ``[]``) whose entries
            are mappings holding the audio under the ``'audio'`` key.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        """Return the number of records in the underlying container."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``'audio'`` entry of record ``idx``.

        A tensor index is first converted with ``tolist()`` so it can be used
        as a plain Python key.
        """
        key = idx.tolist() if torch.is_tensor(idx) else idx
        return self.data[key]['audio']

# torch data module

In [5]:
class AudioDataModule(pl.LightningDataModule):
    """Lightning data module that serves one training DataLoader over cached audio.

    Args:
        data: Container of audio records, handed unchanged to ``AudioDataGenerator``.
        batch_size: Batch size given to the DataLoader.
        num_workers: Number of DataLoader worker processes.
        persistent_workers: Forwarded to the DataLoader.
        shuffle: Whether the training DataLoader shuffles the dataset.
    """

    def __init__(self, data, batch_size=32, num_workers=0, persistent_workers=False, shuffle=True):
        super().__init__()
        self.data = data
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.persistent_workers = persistent_workers

    def setup(self, stage=None):
        """Wrap the stored data in a dataset; ``stage`` is not used."""
        # No train/validation/test split is performed; one would belong here.
        self.dataset = AudioDataGenerator(self.data)

    def train_dataloader(self):
        """Return the DataLoader used for training."""
        loader_options = {
            "batch_size": self.batch_size,
            "shuffle": self.shuffle,
            "num_workers": self.num_workers,
            "persistent_workers": self.persistent_workers,
        }
        return DataLoader(self.dataset, **loader_options)

    # val_dataloader() and test_dataloader() are not defined; add them once
    # validation and test data exist.

# The Model

In [6]:
class Autoencoder(pl.LightningModule):
    """1-D convolutional autoencoder for single-channel audio.

    The encoder is eight strided ``Conv1d`` layers taking 1 channel to 1024
    channels with a combined stride of 4 * 4 * 2**6 = 1024. The decoder is
    eight ``ConvTranspose1d`` layers taking 1024 channels back to 1. With the
    layer settings below, a 160000-sample input is encoded to 157 time steps
    and decoded to 160001 samples, which ``forward`` crops to 160000.

    Training minimises the unweighted sum of a random-resolution STFT loss,
    an SI-SDR loss and an L1 loss between reconstruction and input.

    NOTE(review): no activation function is applied between layers, so the
    network is a composition of affine maps - confirm this is intended.
    """

    # The decoder output is cropped to at most this many samples.
    OUTPUT_SAMPLES = 160000
    # Sample rate handed to the spectral loss and to the W&B audio previews.
    SAMPLE_RATE = 32000
    # Audio previews are logged on batches whose index is a multiple of this.
    PREVIEW_EVERY_N_BATCHES = 512
    # Upper bound on the number of clips of a batch uploaded as previews.
    MAX_PREVIEWS = 4

    def __init__(self):
        super().__init__()

        # Losses
        # NOTE(review): the device passed to the spectral loss is hard-coded
        # to "cuda" - confirm before running this module on a CPU-only host.
        self.loss_fn_1 = auraloss.freq.RandomResolutionSTFTLoss(
            sample_rate=self.SAMPLE_RATE,
            device="cuda"
        )
        self.loss_fn_2 = auraloss.time.SISDRLoss()
        self.loss_fn_3 = torch.nn.L1Loss()

        # Encoder
        self.enc_conv1 = nn.Conv1d(1, 8, kernel_size=33, stride=4, padding=16)
        self.enc_conv2 = nn.Conv1d(8, 16, kernel_size=17, stride=4, padding=8)
        self.enc_conv3 = nn.Conv1d(16, 32, kernel_size=9, stride=2, padding=4)
        self.enc_conv4 = nn.Conv1d(32, 64, kernel_size=9, stride=2, padding=4)
        self.enc_conv5 = nn.Conv1d(64, 128, kernel_size=9, stride=2, padding=4)
        self.enc_conv6 = nn.Conv1d(128, 256, kernel_size=9, stride=2, padding=4)
        self.enc_conv7 = nn.Conv1d(256, 512, kernel_size=9, stride=2, padding=4)
        self.enc_conv8 = nn.Conv1d(512, 1024, kernel_size=9, stride=2, padding=4)

        # Decoder. Kernel sizes, paddings and output paddings are not an exact
        # mirror of the encoder; they are the values that give 160001 output
        # samples for a 160000-sample input.
        self.dec_conv1 = nn.ConvTranspose1d(1024, 512, kernel_size=9, stride=2, padding=4, output_padding=1)
        self.dec_conv2 = nn.ConvTranspose1d(512, 256, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv3 = nn.ConvTranspose1d(256, 128, kernel_size=9, stride=2, padding=5, output_padding=0)
        self.dec_conv4 = nn.ConvTranspose1d(128, 64, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv5 = nn.ConvTranspose1d(64, 32, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv6 = nn.ConvTranspose1d(32, 16, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv7 = nn.ConvTranspose1d(16, 8, kernel_size=21, stride=4, padding=9, output_padding=0)
        self.dec_conv8 = nn.ConvTranspose1d(8, 1, kernel_size=37, stride=4, padding=22, output_padding=0)

    def forward(self, x):
        """Encode and decode a batch of audio.

        Args:
            x: Tensor of shape ``(batch, 1, samples)``.

        Returns:
            Tuple ``(reconstruction, encoded)``: the decoder output cropped to
            at most ``OUTPUT_SAMPLES`` samples on the last axis, and the
            1024-channel output of the last encoder layer.
        """
        encoder_layers = (
            self.enc_conv1, self.enc_conv2, self.enc_conv3, self.enc_conv4,
            self.enc_conv5, self.enc_conv6, self.enc_conv7, self.enc_conv8,
        )
        decoder_layers = (
            self.dec_conv1, self.dec_conv2, self.dec_conv3, self.dec_conv4,
            self.dec_conv5, self.dec_conv6, self.dec_conv7, self.dec_conv8,
        )

        # Encoder
        for layer in encoder_layers:
            x = layer(x)
        encoded = x

        # Decoder
        for layer in decoder_layers:
            x = layer(x)

        # Drop the surplus sample(s) produced by the transposed convolutions.
        x = x[:, :, :self.OUTPUT_SAMPLES]
        return x, encoded

    def compute_loss(self, outputs, ref_signals):
        """Return the unweighted sum of the STFT, SI-SDR and L1 losses."""
        loss = self.loss_fn_1(outputs, ref_signals) + self.loss_fn_2(outputs, ref_signals) + self.loss_fn_3(outputs, ref_signals)
        return loss

    def _log_audio_previews(self, audio, output_audio):
        """Upload a few input/reconstruction pairs of the batch to W&B."""
        # Skip quietly when no W&B run is active instead of raising from wandb.log.
        if wandb.run is None:
            return

        # A batch may hold fewer clips than MAX_PREVIEWS (e.g. the last batch
        # of an epoch); indexing past its end would raise IndexError.
        n_previews = min(self.MAX_PREVIEWS, audio.shape[0])
        for ii in range(n_previews):
            # Transpose the clip before handing it to wandb.Audio.
            input_signal = audio[ii].detach().cpu().numpy().T
            reconstructed_signal = output_audio[ii].detach().cpu().numpy().T
            wandb.log({f'audio_input_{ii}': [wandb.Audio(input_signal, caption="Input", sample_rate=self.SAMPLE_RATE)]})
            wandb.log({f'audio_reconstructed_{ii}': [wandb.Audio(reconstructed_signal, caption="Reconstructed", sample_rate=self.SAMPLE_RATE)]})

    def training_step(self, batch, batch_idx):
        """Reconstruct the batch, periodically log previews, and return the loss.

        Args:
            batch: Audio tensor as produced by the training DataLoader; it is
                used both as the model input and as the reconstruction target.
            batch_idx: Index of the batch within the current epoch.

        Returns:
            The scalar training loss, also logged as ``train_loss``.
        """
        audio = batch

        # Call the module rather than .forward() so registered hooks run.
        output_audio, encoded = self(audio)

        if batch_idx % self.PREVIEW_EVERY_N_BATCHES == 0:
            self._log_audio_previews(audio, output_audio)

        loss = self.compute_loss(output_audio, audio)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        """Return Adam (lr=0.001) with a StepLR scheduler (step_size=1, gamma=0.99)."""
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        # Multiplies the learning rate by 0.99 each time the scheduler is stepped.
        scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
        return [optimizer], [scheduler]

# Callbacks 

In [7]:
class SaveModelEveryNSteps(pl.Callback):
    """Callback that writes a trainer checkpoint every N training steps.

    Args:
        save_step_frequency: Save whenever ``trainer.global_step + 1`` is a
            multiple of this value. Must be a positive integer.
        save_path: Directory that receives the ``step_<n>.ckpt`` files; it is
            created if missing. Defaults to the location used originally.

    Raises:
        ValueError: If ``save_step_frequency`` is not positive.

    NOTE(review): every save writes a new file and nothing is ever removed,
    so disk usage grows for the whole run.
    """

    def __init__(self, save_step_frequency=512, save_path="D://Github/timbre-tools-hack//Feature_Extraction//models//AE//"):
        super().__init__()
        # Fail early instead of hitting a ZeroDivisionError in the first batch.
        if save_step_frequency <= 0:
            raise ValueError(f"save_step_frequency must be positive, got {save_step_frequency}")
        self.save_step_frequency = save_step_frequency
        self.save_path = save_path
        os.makedirs(self.save_path, exist_ok=True)

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        """Save a checkpoint when the 1-based step count hits the frequency."""
        step_number = trainer.global_step + 1
        if step_number % self.save_step_frequency == 0:
            checkpoint_path = os.path.join(self.save_path, f"step_{step_number}.ckpt")
            trainer.save_checkpoint(checkpoint_path)

# Train Loop

In [8]:
# Build the autoencoder; its architecture and losses are fixed inside the class.
model = Autoencoder()

In [9]:
# Log this run to the 'TT-AE' Weights & Biases project.
# NOTE(review): log_model='all' presumably uploads checkpoints throughout
# training - confirm this is wanted, given the disk-space error seen below.
wandb_logger = WandbLogger(project='TT-AE', log_model='all')

In [10]:
# Serve the cached clips in batches of 16, with no DataLoader worker processes.
audio_data_module = AudioDataModule(all_scenes, batch_size=16, num_workers=0, persistent_workers=False)

In [11]:
# Configure training: up to 1000 epochs on GPU, logging through wandb_logger
# and checkpointing through the SaveModelEveryNSteps callback.
trainer = pl.Trainer(
    max_epochs=1000,
    accelerator="gpu", 
    devices=-1,  # presumably "use every available device" - confirm against the Lightning docs
    logger=wandb_logger,
    callbacks=[SaveModelEveryNSteps()],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Actually fit it

In [12]:
# Start training. The run recorded below stopped with
# "OSError: [Errno 28] No space left on device".
trainer.fit(model, audio_data_module)

[34m[1mwandb[0m: Currently logged in as: [33mhephyrius[0m. Use [1m`wandb login --relogin`[0m to force relogin


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name      | Type                     | Params
--------------------------------------------------------
0  | loss_fn_1 | RandomResolutionSTFTLoss | 0     
1  | loss_fn_2 | SISDRLoss                | 0     
2  | loss_fn_3 | L1Loss                   | 0     
3  | enc_conv1 | Conv1d                   | 272   
4  | enc_conv2 | Conv1d                   | 2.2 K 
5  | enc_conv3 | Conv1d                   | 4.6 K 
6  | enc_conv4 | Conv1d                   | 18.5 K
7  | enc_conv5 | Conv1d                   | 73.9 K
8  | enc_conv6 | Conv1d                   | 295 K 
9  | enc_conv7 | Conv1d                   | 1.2 M 
10 | enc_conv8 | Conv1d                   | 4.7 M 
11 | dec_conv1 | ConvTranspose1d          | 4.7 M 
12 | dec_conv2 | ConvTranspose1d          | 1.2 M 
13 | dec_conv3 | ConvTranspose1d          | 295 K 
14 | dec_conv4 | ConvTranspose1d          | 73.8 K
15 | dec_conv5 | ConvTranspose1d          | 18.5 K
16 | dec_conv6 | ConvTranspose1d 

Training: |                                                                                                   …

OSError: [Errno 28] No space left on device

#### 