### Demo

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import random
import yaml
from vocoder import Vocoder
from dataset import MNGU0Dataset, train_collate_fn, test_and_validation_collate_fn
from train import trainer_vocoder
from losses import MultiScaleSpectralLoss
from IPython.display import Audio
import soundfile as sf
import os
import json

Load Model

In [4]:
# Seed every RNG used in this notebook (torch CPU/CUDA, NumPy, Python) so the
# shuffled dataloaders below draw the same samples on each fresh run.
SEED = 324
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Load the experiment configuration.
# A malformed YAML file is allowed to raise yaml.YAMLError here: swallowing it
# would leave `config` undefined and fail on the next line with a confusing
# NameError instead of the real parse error.
yaml_name = 'bilstm.yaml'
with open('../yamls/' + yaml_name, "r") as stream:
    config = yaml.safe_load(stream)
device = config['device']

# Build the three dataset splits and their dataloaders.
# Validation/test use batch_size=1 with their own collate function.
print('Loading dataset...')
train_dataset = MNGU0Dataset('train', config)
val_dataset = MNGU0Dataset('val', config)
test_dataset = MNGU0Dataset('test', config)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    collate_fn=train_collate_fn,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=test_and_validation_collate_fn,
)
# shuffle=True so that the demo cell below picks a (seeded) random test item.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=test_and_validation_collate_fn,
)
print(f'number of training samples: {len(train_dataset)}')
print(f'number of validation samples: {len(val_dataset)}')

# Instantiate the vocoder from the config and restore the best checkpoint.
print('Loading model...')
model = Vocoder(
    hidden_dim=config['hidden_dim'],
    nharmonics=config['nharmonics'],
    nbands=config['nbands'],
    attenuate=config['attenuate'],
    fs=config['fs'],
    framesize=config['framesize'],
)
model = model.to(device)
# This notebook only runs inference: switch train-time layers (e.g. dropout)
# to evaluation behaviour so the demo output is deterministic.
model.eval()
checkpoint_path = os.path.join(
    config["saved_models_dir"], 'Vocoder_' + config['comment'] + '_best.pth'
)
# NOTE(review): torch.load unpickles the file, so only load checkpoints you
# trust (consider weights_only=True on PyTorch versions that support it).
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

Loading dataset...
number of training samples: 1069
number of validation samples: 60
Loading model...


<All keys matched successfully>

In [5]:
# Total number of trainable scalars: sum the element counts of every
# parameter tensor that has requires_grad set.
nparam = sum(
    weight.numel() for weight in model.parameters() if weight.requires_grad
)
print(nparam)

4032246


Play the ground-truth audio and the model's predicted audio for one utterance from the test split

In [10]:
# Draw one batch (batch_size=1) from the test dataloader. The collate function
# yields the reference audio `x` plus the three model inputs.
x, ema, f0, loudness = next(iter(test_dataloader))

# Move the model inputs to the model's device as float32; `x` is left where the
# dataloader produced it because it is only used for playback here.
ema, f0, loudness = (t.to(device).float() for t in (ema, f0, loudness))

print("Ground Truth")
# Play back at the sample rate the model was configured with rather than a
# hard-coded constant, so the demo stays correct if the config changes.
Audio(x.numpy().squeeze(), rate=config['fs'])

Ground Truth


In [9]:
# Run the sampled utterance through the model. Gradients are not needed for
# playback, so disable autograd to avoid building (and holding in memory) the
# computation graph for the whole forward pass.
with torch.no_grad():
    x_hat = model(f0, loudness, ema)

print("Prediction")
# Play back at the sample rate the model was configured with rather than a
# hard-coded constant.
Audio(x_hat.cpu().numpy().squeeze(), rate=config['fs'])



Prediction
