# Imports

- Select the `voice_isolate_env` kernel from the kernel picker at the top right of the screen before running any cell.

In [15]:
from data import CleanDataset, DataTransformer, NoiseGenerator
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch
from pathlib import Path
from autoencoder import UNet, autoencoder_loss
# from vae import CustomVAE, vae_loss, AttnParams

# Folder (relative to the notebook's working directory) that receives every
# audio file and spectrogram written by this notebook.
output_folder = Path('../playground_outputs')
output_folder.mkdir(exist_ok=True)

# Checkpoint holding the trained weights.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of model.pth before running on another machine.
model_path = Path('/Users/marcusbluestone/Desktop/MIT/Fall (25-26)/Voice_Isolation/outputs/model.pth')

# Prefer Apple-silicon MPS (the original setting), then CUDA, and finally fall
# back to the CPU so the notebook still runs on machines without an accelerator.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Download Dataset

In [None]:
# Tunable constants used by later cells.
sigma_noise = 0.01  # passed as `sigma` to ng.add_gaussian when the noisy batch is built
beta = 0            # only referenced by the commented-out CustomVAE configuration

# Ten items from the 'dev-clean' split, requested with chunk_size=50_000.
dataset = CleanDataset(
    split='dev-clean',
    chunk_size=50_000,
    count=10,
)

# Helpers: `dt` converts between waveforms, spectrograms and audio files;
# `ng` adds noise to waveforms.
dt = DataTransformer()
ng = NoiseGenerator()

# shuffle=False keeps the first batch the same on every run.
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=False,
)

# Test Model Results Audibly

In [17]:
# Take the first batch from the loader and write the untouched audio to disk.
waveform, sample_rate = next(iter(dataloader))
dt.waveform_to_audio(waveform, sample_rate, fname=output_folder / 'original')

# Spectrogram of the clean batch. Its two trailing dimensions are kept as W and H
# so a later cell can crop the model output back to this size.
amp_clean, phase_clean, _ = dt.waveform_to_spectrogram(waveform)
W, H = amp_clean.shape[1:]  # shape is 3-D; the leading dimension is not needed

# Corrupt the batch with Gaussian noise, then compute its spectrogram and save
# the noisy audio. `minmax_info` is unpacked into spectrogram_to_waveform when
# the reconstruction is turned back into audio.
noisy_waveform = ng.add_gaussian(waveform, sigma=sigma_noise)

amp_noisy, phase_noisy, minmax_info = dt.waveform_to_spectrogram(noisy_waveform)
dt.waveform_to_audio(noisy_waveform, sample_rate, output_folder / 'noisy')


In [18]:
# Alternative VAE configuration, kept for reference (needs the commented-out
# `vae` import in the imports cell):
# attn_params = AttnParams(num_heads=4, window_size=None, use_rel_pos_bias=False, dim_head=64)
# model = CustomVAE(in_channels=1, spatial_dims=2, use_attn=False, vae_latent_channels=16,
#                     attn_params=attn_params, vae_use_log_var = True, beta = beta, dropout_prob=0, blocks_down=(1,2,2,4),
#                     blocks_up = [1,1,1])

# Build the U-Net and restore the trained weights from `model_path`.
model = UNet(input_channels = 1, final_activation='tanh')

# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict contains, so a tampered checkpoint cannot run arbitrary code.
state = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state)

# This notebook only runs inference, so put the model in evaluation mode (this
# matters if the network contains dropout or normalisation layers). Both .to()
# and .eval() return the module, so the cell still displays the model summary.
model.to(device).eval()

UNet(
  (conv1): ConvBlock(
    (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (act): LeakyReLU(negative_slope=0.01, inplace=True)
  )
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): ConvBlock(
    (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (act): LeakyReLU(negative_slope=0.01, inplace=True)
  )
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): ConvBlock(
    (conv1): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (act): LeakyReLU(negative_slope=0.01, inplace=True)
  )
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): ConvBl

In [19]:
# Denoise the noisy batch, score it against the clean spectrogram, and save
# spectrogram images plus the reconstructed audio.

# Pad the noisy amplitude, add a channel dimension and move it to the model's
# device. Named `noisy_input` so the built-in `input` is not shadowed.
noisy_input = dt.add_padding(amp_noisy).unsqueeze(1).to(device)
target = amp_clean.to(device).unsqueeze(1)

# Inference only: evaluation mode (idempotent) and no autograd graph, which
# saves memory and makes explicit .detach() calls unnecessary below.
model.eval()
with torch.no_grad():
    output = model(noisy_input)
    # Crop back to the clean spectrogram size (presumably this undoes add_padding).
    amp_recon = output[:, :, :W, :H]
    loss = autoencoder_loss(amp_recon, target)

print("Loss", loss)

# Drop the channel dimension so amp_recon has the same layout as amp_clean / amp_noisy.
amp_recon = amp_recon[:, 0, :, :]

# The model only predicts amplitude, so the reconstruction reuses the noisy phase.
dt.save_spectrograms(
    amps=[amp_clean, amp_noisy, amp_recon],
    phases=[phase_clean, phase_noisy, phase_noisy],
    names=['original', 'noisy', 'recon'],
    out_dir=output_folder / 'spectrograms',
)

# Convert the reconstruction back to a waveform on the CPU and write it to disk.
waveforms_reconstr = dt.spectrogram_to_waveform(amp_recon.cpu(), phase_noisy.cpu(), *minmax_info)
dt.waveform_to_audio(waveforms_reconstr, sample_rate = sample_rate, fname = output_folder / 'reconstr')

Loss tensor(0.1156, device='mps:0', grad_fn=<MseLossBackward0>)
