In [1]:
import auraloss
import collections
import librosa
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import plotly.graph_objects as go
import pretty_midi
import pytorch_lightning as pl
import pywt
import random
import scipy.signal
import sklearn
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torchaudio
from torch.utils.data import DataLoader, Dataset
import wandb
from pytorch_lightning.loggers import WandbLogger
from tqdm import tqdm
import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import matplotlib
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())


# Seed every random number generator used in this notebook with one shared
# value: PyTorch (CPU), Python's `random`, NumPy, and PyTorch CUDA
# (current device, then all devices).
seed_value = 3407
for seed_rng in (
    torch.manual_seed,
    random.seed,
    np.random.seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_rng(seed_value)

# Select the 'high' float32 matmul precision setting.
torch.set_float32_matmul_precision('high')

True


In [2]:
class Autoencoder(pl.LightningModule):
    """Strided 1-D convolutional autoencoder for single-channel signals.

    The encoder applies eight ``Conv1d`` layers back to back (no activation
    functions in between), growing the channel count from 1 to 1024 while
    striding along the last axis (strides 4, 4, 2, 2, 2, 2, 2, 2). The decoder
    applies eight ``ConvTranspose1d`` layers that shrink the channel count
    back to 1, and the result is cropped to at most ``OUTPUT_SAMPLES`` samples.

    The layer attribute names (``enc_conv1`` ... ``dec_conv8``) are kept
    unchanged on purpose: they define the ``state_dict`` keys, so renaming or
    re-wrapping them would stop existing checkpoints from loading.
    """

    # Maximum number of samples (last axis) kept from the decoder output.
    OUTPUT_SAMPLES = 160000

    def __init__(self):
        super().__init__()

        # Losses. Nothing in this class calls them; presumably they are used
        # by training code that is not part of this notebook.
        # NOTE(review): device is hard-coded to "cuda" — confirm this is
        # acceptable on CPU-only machines.
        self.loss_fn_1 = auraloss.freq.RandomResolutionSTFTLoss(
            sample_rate=32000,
            device="cuda"
        )
        self.loss_fn_2 = auraloss.time.SISDRLoss()
        self.loss_fn_3 = torch.nn.L1Loss()

        # Encoder
        self.enc_conv1 = nn.Conv1d(1, 8, kernel_size=33, stride=4, padding=16)
        self.enc_conv2 = nn.Conv1d(8, 16, kernel_size=17, stride=4, padding=8)
        self.enc_conv3 = nn.Conv1d(16, 32, kernel_size=9, stride=2, padding=4)
        self.enc_conv4 = nn.Conv1d(32, 64, kernel_size=9, stride=2, padding=4)
        self.enc_conv5 = nn.Conv1d(64, 128, kernel_size=9, stride=2, padding=4)
        self.enc_conv6 = nn.Conv1d(128, 256, kernel_size=9, stride=2, padding=4)
        self.enc_conv7 = nn.Conv1d(256, 512, kernel_size=9, stride=2, padding=4)
        self.enc_conv8 = nn.Conv1d(512, 1024, kernel_size=9, stride=2, padding=4)

        # Decoder. The paddings / output_padding differ per layer; they are
        # reproduced exactly because they fix the output length.
        self.dec_conv1 = nn.ConvTranspose1d(1024, 512, kernel_size=9, stride=2, padding=4, output_padding=1)
        self.dec_conv2 = nn.ConvTranspose1d(512, 256, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv3 = nn.ConvTranspose1d(256, 128, kernel_size=9, stride=2, padding=5, output_padding=0)
        self.dec_conv4 = nn.ConvTranspose1d(128, 64, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv5 = nn.ConvTranspose1d(64, 32, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv6 = nn.ConvTranspose1d(32, 16, kernel_size=9, stride=2, padding=4, output_padding=0)
        self.dec_conv7 = nn.ConvTranspose1d(16, 8, kernel_size=21, stride=4, padding=9, output_padding=0)
        self.dec_conv8 = nn.ConvTranspose1d(8, 1, kernel_size=37, stride=4, padding=22, output_padding=0)

    def _encoder_layers(self):
        """Return the encoder convolutions in the order they are applied."""
        # Built on demand (not stored as a container attribute) so that no
        # extra keys appear in the state_dict.
        return (
            self.enc_conv1, self.enc_conv2, self.enc_conv3, self.enc_conv4,
            self.enc_conv5, self.enc_conv6, self.enc_conv7, self.enc_conv8,
        )

    def _decoder_layers(self):
        """Return the decoder transposed convolutions in the order they are applied."""
        return (
            self.dec_conv1, self.dec_conv2, self.dec_conv3, self.dec_conv4,
            self.dec_conv5, self.dec_conv6, self.dec_conv7, self.dec_conv8,
        )

    def encode(self, x):
        """Run only the encoder.

        Args:
            x: Input tensor whose channel dimension is 1 (``enc_conv1`` has one
                input channel).

        Returns:
            The 1024-channel output of ``enc_conv8``.
        """
        for layer in self._encoder_layers():
            x = layer(x)
        return x

    def decode(self, encoded):
        """Run only the decoder on an existing embedding.

        Args:
            encoded: Tensor with 1024 channels, e.g. the second value returned
                by ``forward``.

        Returns:
            A tuple ``(reconstruction, encoded)``: the decoder output cropped
            to at most ``OUTPUT_SAMPLES`` samples along the last axis, and the
            ``encoded`` argument passed back unchanged.
        """
        x = encoded
        for layer in self._decoder_layers():
            x = layer(x)

        x = x[:, :, :self.OUTPUT_SAMPLES]
        return x, encoded

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Args:
            x: Input tensor whose channel dimension is 1.

        Returns:
            A tuple ``(reconstruction, encoded)`` as described in ``decode``.
        """
        # Reuse decode() so the decoder path exists in exactly one place.
        return self.decode(self.encode(x))


In [3]:
# Restore the trained weights. The notebook only runs inference, so put the
# module in eval mode right away (eval() returns the module itself).
model = Autoencoder.load_from_checkpoint('./final_model.ckpt').eval()

In [26]:
path = "../data/rendered_audio/rendered_audio/"
# os.listdir() returns entries in arbitrary order. Sort them so the fixed
# indices below select the same files on every machine and every run.
# NOTE(review): this can select different files than an unsorted listing did.
files = sorted(os.listdir(path))
fs = []
embeddings = []
# Resampler for the expected case: 44.1 kHz source files -> 32 kHz model input.
transform = torchaudio.transforms.Resample(44100, 32000)

# Positions (in the sorted listing) of the three files to embed.
file_0 = 90
file_1 = 10000
file_2 = 20000

# Inference only: disable autograd so the stored embeddings do not keep the
# encoder's computation graph alive.
with torch.no_grad():
    for file_index in [file_0, file_1, file_2]:
        full_path = os.path.join(path, files[file_index])
        audio, sample_rate = torchaudio.load(full_path)

        # Resample from the rate the file actually has instead of assuming
        # every file is 44.1 kHz.
        if sample_rate == 44100:
            audio = transform(audio)
        else:
            audio = torchaudio.transforms.Resample(sample_rate, 32000)(audio)

        # unsqueeze(0) adds a leading batch axis before the forward pass;
        # only the embedding (second return value) is kept.
        _, embedding = model(audio.to(model.device).unsqueeze(0))

        embeddings.append(embedding)

            


In [86]:
embedding_0 = embeddings[0]
embedding_1 = embeddings[1]
embedding_2 = embeddings[2]

# Weighted blend of the three embeddings. The third term uses embedding_2;
# it previously repeated embedding_1, which left embedding_2 unused.
# NOTE(review): the weights 1/2 + 1/3 + 1/4 sum to 13/12, not 1 — confirm
# that this scaling is intended.
embedding_mid = (embedding_0 / 2) + (embedding_1 / 3) + (embedding_2 / 4)

In [87]:
# Decode the first file's embedding; decode() returns (waveform, embedding).
output = model.decode(embedding_0)
# Play back at 32 kHz: the audio fed to the model was resampled to 32000 Hz,
# so playing the reconstruction at 44100 Hz would speed it up and raise its pitch.
ipd.Audio(output[0].detach().squeeze(0).cpu().numpy(), rate=32000)


In [88]:
# Decode the blended embedding; decode() returns (waveform, embedding).
output = model.decode(embedding_mid)
# Play back at 32 kHz: the audio fed to the model was resampled to 32000 Hz,
# so playing the reconstruction at 44100 Hz would speed it up and raise its pitch.
ipd.Audio(output[0].detach().squeeze(0).cpu().numpy(), rate=32000)


In [85]:
# Decode the second file's embedding; decode() returns (waveform, embedding).
output = model.decode(embedding_1)
# Play back at 32 kHz: the audio fed to the model was resampled to 32000 Hz,
# so playing the reconstruction at 44100 Hz would speed it up and raise its pitch.
ipd.Audio(output[0].detach().squeeze(0).cpu().numpy(), rate=32000)
