In [1]:
import torch
import numpy as np
from vqvae import VQVAE
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn as nn 
import os
from vctk_dataset import VCTK_Dataset
import gc

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
with_gpu = torch.cuda.is_available()
device = torch.device("cuda" if with_gpu else "cpu")

print('We are now using %s.' % device)

train_dataset_end_idx = 35121 # About 80% of the data with completely different Speakers in train and val

# Both splits are read from the same local VCTK copy with identical settings.
vctk_root = r"C:\Users\JadHa\Desktop\Uni\Audio SP\VoiceConversion"
vctk_kwargs = dict(n_speakers=128, root=vctk_root, download=False)

# NOTE(review): the training split only covers indices 0..2048 instead of
# running up to train_dataset_end_idx -- presumably a reduced subset for quick
# experiments; confirm before a full training run.
dataset_train = VCTK_Dataset(start_idx=0, end_idx=2048, **vctk_kwargs)
# The validation split is the 512 indices that start at train_dataset_end_idx.
dataset_val = VCTK_Dataset(
    start_idx=train_dataset_end_idx,
    end_idx=train_dataset_end_idx + 512,
    **vctk_kwargs,
)

print("Segmented train dataset size : %d"%len(dataset_train))
print("Segmented validation dataset size : %d"%len(dataset_val))

# Model under training, moved to the selected device.
vqvae = VQVAE(in_channel=1).to(device)

# Report the number of trainable parameters.
n_trainable_params = sum(
    param.numel() for param in vqvae.parameters() if param.requires_grad
)
print(n_trainable_params)

# Adam over all model parameters; MSE serves as the reconstruction criterion.
optimizer = optim.Adam(params=vqvae.parameters(), lr=3e-4)

criterion = nn.MSELoss()

  from .autonotebook import tqdm as notebook_tqdm
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.
The torchaudio backend is switched to 'soundfile'. Note that 'sox_io' is not supported on Windows.


We are now using cuda.
Segmented train dataset size : 2048
Segmented validation dataset size : 512
1480448


In [2]:
from vctk_dataset import my_collate
# Training batches are reshuffled every epoch.
train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True, collate_fn=my_collate)
# Validation is iterated in a fixed order so every epoch evaluates the same
# batches; shuffling here only made the metric depend on how items happened to
# be grouped by my_collate.
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False, collate_fn=my_collate)

In [3]:
# Release cached GPU memory and run the garbage collector before pulling a batch.
torch.cuda.empty_cache()
gc.collect()

# Draw one training batch and print the shape of the first item of each of its
# first two fields (the training loop unpacks these as input and speaker_id).
sample = next(iter(train_loader))
for field_idx in (0, 1):
    print(sample[field_idx][0].shape)

torch.Size([1, 80, 199])
torch.Size([128])


In [4]:
from speechbrain.pretrained import HIFIGAN

# Load a pretrained HIFIGAN Vocoder.
# The vocoder is placed on the same device as the rest of the notebook: the
# previous hard-coded "cuda" ignored the CPU fallback chosen in the first cell.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-libritts-16kHz",
    savedir="vocoder_16khz",
    run_opts={"device": str(device)},
)

In [8]:
from IPython.display import Audio
# Release cached GPU memory and collect garbage before drawing a fresh batch.
torch.cuda.empty_cache()
gc.collect()

# Run the first item of a new training batch through the HiFi-GAN vocoder as a
# listening check.
sample = next(iter(train_loader))
first_item = sample[0][0]
wav = hifi_gan.decode_batch(first_item)

# Keep this as the last expression of the cell so the notebook renders an audio
# player; the rate matches the 16 kHz vocoder checkpoint loaded above.
waveform = wav.squeeze().cpu().numpy()
Audio(waveform, rate=16000)

torch.Size([1, 80, 171])
torch.Size([1, 1, 46336])


In [11]:
# Train the VQ-VAE and evaluate it on the validation split after every epoch.
#
# Per batch, the objective is the reconstruction loss (criterion) plus the
# batch-mean of the VQ loss returned by the model, weighted by vq_loss_weight.
# Reported losses are averaged over the batches of the epoch, so the training
# and validation numbers are comparable even though the loaders have different
# lengths.
#
# NOTE(review): the traceback recorded below this cell (conv1d receiving a 4-D
# tensor) is raised while calling the model and presumably originates inside
# the VQVAE forward pass -- this loop does not address it; check vqvae.py.
epochs = 40
vq_loss_weight = 0.25

checkpoint_dir = "saved_models"
checkpoint_path = os.path.join(checkpoint_dir, "vqvae_vctk.pt")
# torch.save does not create missing directories.
os.makedirs(checkpoint_dir, exist_ok=True)

for epoch in range(epochs):
    # Release cached GPU memory once per epoch. Doing this (plus a full garbage
    # collection) on every batch stalls training without freeing live tensors.
    torch.cuda.empty_cache()
    gc.collect()

    # Training
    vqvae.train()
    epoch_loss = 0.0
    # The batch is bound to `mel` rather than `input` so the builtin input()
    # is not shadowed for the rest of the notebook.
    for batch_idx, (mel, speaker_id) in enumerate(train_loader):
        mel = mel.to(device)
        speaker_id = speaker_id.to(device)
        optimizer.zero_grad()
        output, vq_loss = vqvae(mel, speaker_id)
        recon_loss = criterion(output, mel)
        vq_loss = vq_loss.mean()
        loss = recon_loss + vq_loss_weight * vq_loss
        loss.backward()
        optimizer.step()
        # .item() yields a plain Python float, so no tensor is accumulated.
        epoch_loss += loss.item()
    # Mean loss per batch; max() guards against an empty loader.
    epoch_loss /= max(len(train_loader), 1)
    print("Epoch %d , Training Loss : %.4f " %(epoch + 1, epoch_loss))

    # Validation
    vqvae.eval()
    epoch_loss = 0.0
    with torch.no_grad():
        for batch_idx, (mel, speaker_id) in enumerate(val_loader):
            mel = mel.to(device)
            speaker_id = speaker_id.to(device)
            output, vq_loss = vqvae(mel, speaker_id)
            recon_loss = criterion(output, mel)
            vq_loss = vq_loss.mean()
            loss = recon_loss + vq_loss_weight * vq_loss
            epoch_loss += loss.item()
    epoch_loss /= max(len(val_loader), 1)
    print("Epoch %d , Validation Loss : %.4f " %(epoch + 1, epoch_loss))

    # The checkpoint is overwritten after every epoch (latest weights only).
    chkpoint = {'model_state_dict': vqvae.state_dict()}
    torch.save(chkpoint, checkpoint_path)
    print('VQ-VAE is stored at folder:{}'.format(checkpoint_path))

torch.Size([1, 80, 183])


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 64, 20, 45]