#  melgan

In [1]:
import re
import glob
import matplotlib.pyplot as plt
import numpy as np 
from tqdm import tqdm
from IPython.display import HTML, Audio, display
import random
import os
import sys
import itertools
import librosa
import time
import yaml
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
import torch
from mel2wav import MelVocoder

In [2]:
melgan = MelVocoder('logs/ssb_tacotron_130/',use_best = False, github=False)

In [3]:
from scipy import signal
emp = 0.99

class H() : 
    max_abs_value = 4.0
    min_level_db = -100

hparams = H()

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        # wav = signal.lfilter([1, -0.75], [1], wav)
        return signal.lfilter([1, -k], [1], wav)
    return wav

def denormalize(D) : 
    return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
                + hparams.min_level_db)

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)


# reconstruction and display

def recon_from_mel(load_mel, use_tacotron_process = False) : 
    print(load_mel.dtype)
    if use_tacotron_process : 
        load_mel = denormalize(load_mel) 
        load_mel = (load_mel + 20) / 20
        load_mel = _db_to_amp(load_mel + 20)
    print(load_mel.dtype)
    mel_tensor = torch.from_numpy(load_mel.T)[None]
    recon = melgan.inverse(mel_tensor).squeeze().cpu().numpy()
    print(f'mel_tensor : ({mel_tensor.size()}) -> recon_wav : ({recon.shape})')
    
    if use_tacotron_process : 
        recon = inv_preemphasis(recon, 0.90)

    return recon.astype(np.float32)

def display_waveform(wav) : 
    mel = melgan(torch.from_numpy(wav)[None])[0].detach().cpu().numpy()
    print(f'audio sample ({wav.shape}, {mel.shape}) : ')
    display(Audio(wav,rate=16000))

    fig, axs = plt.subplots(1,2, figsize=(20,3))
    axs[0].plot(wav)
    axs[1].imshow(mel, aspect='auto', cmap='coolwarm')
    
def write_waveform(wav, filename) : 
    librosa.output.write_wav(filename, wav, 16000)


In [None]:
# path='/Netdata/2020/TTS/wake_syn_mels/need_npy2wav.list'
# with open(path,'r') as p:
path ='/Netdata/shiyao/tf_multispeaker/yyg_synth_data/trial_1/mels/'
for file in os.listdir(path):
    t = os.path.join(path, file)
    a = np.load(t).squeeze().T
    a = a.astype(np.float32)
    print(a.shape)

    wav=recon_from_mel(a.T)
    write_waveform(wav, os.path.join('/Netdata/shiyao/tf_multispeaker/yyg_synth_data/trial_1/wav/',file.replace('.npy','.wav')))


In [1]:
logdir = 'logs-m2m-miya/test/logs-eval/wavs/'
syn_audios=[]
for file in os.listdir(logdir):
    if 'I' in file:
        print(file)
        wav,_=librosa.load(os.path.join(logdir,file),sr=16000)
        syn_audios.append(wav)

NameError: name 'os' is not defined

In [19]:
Audio(syn_audios[2], rate=16000)

In [None]:
logdir = 'logs-add-vcc/test/logs-eval/wavs/'
syn_audios=[]
for file in os.listdir(logdir):
    print(file)
    wav,_=librosa.load(os.path.join(logdir,file),sr=16000)
    syn_audios.append(wav)

In [21]:
Audio(syn_audios[2], rate=16000)

In [13]:
import os
import librosa
from tacotron.synthesizer import Synthesizer
from hparams import hparams
from tacotron.synthesize import generate_fast
import tensorflow as tf
from pypinyin import pinyin,lazy_pinyin,Style
import numpy as np
os.environ['CUDA_VISIBLE_DEVICES']='0'

spk_dir = os.path.join(os.path.dirname('new_vccdata/paper-sample.txt'), 'vcc-cmu-m2m.npy')
spk_uttid = np.load(spk_dir, allow_pickle=True).item()
def get_tar_embed(name):
        """
        spk=name[0:3]
        ids=self._spk_uttid[spk]
        choice = np.random.randint(0, len(ids))
        embed_name=spk+'-'+ids[choice]
        return embed_name
        """


        if 'vcc' in name:
            spklist=name.replace('embed-','').replace('.npy','').split('_')
            spk=spklist[0]+'_'+spklist[1]
            ids = spk_uttid[spk]

            #print(year_spk, ids)
            choice = np.random.randint(0, len(ids))
            embed=ids[choice]
            embed_name = 'embed-'+embed+'.npy'
            return embed_name
        elif 'arctic' in name:
            spk=name[0:3]
            ids=spk_uttid[spk]
            choice = np.random.randint(0, len(ids))
            embed_name=spk+'-'+ids[choice]
            return embed_name
        else:
            spklist=name.replace('embed-','').replace('.npy','').split('_')
            spk=spklist[0]
            ids = spk_uttid[spk]
            #print(spk, ids)
            choice = np.random.randint(0, len(ids))
            embed=ids[choice]
            embed_name = 'embed-'+embed+'.npy'
            return embed_name

hparams.tacotron_num_gpus=1
checkpoint = 'logs-2018/taco_pretrained'
checkpoint_path = tf.train.get_checkpoint_state(checkpoint).model_checkpoint_path
synth = Synthesizer()
synth.load(checkpoint_path, hparams,gta=False)

outdir = 'logs-2018/test/eval'
logdir = 'logs-2018/test/logs-eval'
os.makedirs(outdir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)
os.makedirs(os.path.join(logdir, 'wavs'), exist_ok=True)
os.makedirs(os.path.join(logdir, 'plots'), exist_ok=True)
melpath='new_vccdata/vcc2018_evaluation/mels/'
embedpath='new_vccdata/g-vector/'
with open('new_vccdata/vcc2018_evaluation/2018-eval.txt','r') as f:
    ff=f.readlines()[0:4]
    for line in ff:
        wlist = line.strip().split('|')
        print(wlist)
        sur_embed=[np.load(os.path.join(embedpath,wlist[1]))]
        name=get_tar_embed(wlist[3])
        tar_embed=[np.load(os.path.join(embedpath,name))]
        mel_path = [os.path.join(melpath,wlist[0])]
        print(mel_path)
        index=wlist[0].replace('mel-','').replace('.npy','')
        sn=index.split('_')
        tar=wlist[2].replace('mel-','').replace('.npy','')
        tn=tar.split('_')
        basename = [ sn[1]+'-'+sn[2]+'-'+tn[1]]
         #mel_filenames = ['logs-Tacotron/test/mel-eval/test_' + str(index)]
        wav = synth.synthesize(mel_path, sur_embed, tar_embed,  basename, outdir, logdir,None )

print('synthesis finish')

Constructing model: Tacotron
initialisation done /gpu:0
Initialized Tacotron model. Dimensions (? = dynamic shape): 
  Train mode:               False
  Eval mode:                False
  GTA mode:                 False
  Synthesis mode:           True
  Input:                    (?, ?, 80)
  device:                   0
  encoder out:              (?, ?, 1024)
  decoder out:              (?, ?, 80)
  residual out:             (?, ?, 512)
  projected residual out:   (?, ?, 80)
  mel out:                  (?, ?, 80)
  <stop_token> out:         (?, ?)
  Tacotron Parameters       24.541 Million.
Loading checkpoint: logs-2018/taco_pretrained/tacotron_model.ckpt-102000
INFO:tensorflow:Restoring parameters from logs-2018/taco_pretrained/tacotron_model.ckpt-102000
['mel-vcc2018_SF1_30001.npy', 'embed-vcc2018_SF1_30001.npy', 'mel-vcc2018_slt_30001.npy', 'slt-mel-arctic_b0535.npy']


FileNotFoundError: [Errno 2] No such file or directory: 'new_vccdata/g-vector/embed-vcc2018_SF1_30001.npy'

In [2]:
import librosa
import librosa.filters
import numpy as np
import tensorflow as tf
from scipy import signal
from scipy.io import wavfile

In [63]:
y,_=librosa.core.load('waveglow-mix.wav', sr=16000)
print(len(y))
y=librosa.effects.trim(y, top_db= hparams.trim_top_db, frame_length=hparams.trim_fft_size, hop_length=hparams.trim_hop_size)[0]
print(len(y))

72400
57216


In [80]:
from datasets import audio
from hparams import hparams
_mel_basis = None
_inv_mel_basis = None
def get_hop_size(hparams):
	hop_size = hparams.hop_size
	if hop_size is None:
		assert hparams.frame_shift_ms is not None
		hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
	return hop_size
def _linear_to_mel(spectogram, hparams):
	global _mel_basis
	if _mel_basis is None:
		_mel_basis = _build_mel_basis(hparams)
    
	return np.dot(_mel_basis, spectogram)

def _build_mel_basis(hparams):
	assert hparams.fmax <= hparams.sample_rate // 2
	return librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels,
							   fmin=hparams.fmin, fmax=hparams.fmax)

def _amp_to_db(x, hparams):
	min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
	return 20 * np.log10(np.maximum(min_level, x))
def _normalize(S, hparams):
	if hparams.allow_clipping_in_normalization:
		if hparams.symmetric_mels:
			return np.clip((2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,
			 -hparams.max_abs_value, hparams.max_abs_value)
		else:
			return np.clip(hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db)), 0, hparams.max_abs_value)

	assert S.max() <= 0 and S.min() - hparams.min_level_db >= 0
	if hparams.symmetric_mels:
		return (2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value
	else:
		return hparams.max_abs_value * ((S - hparams.min_level_db) / (-hparams.min_level_db))

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 15)

In [65]:
D=librosa.stft(y=y, n_fft=hparams.n_fft, hop_length=get_hop_size(hparams), win_length=hparams.win_size, pad_mode='constant')
D.shape
X=_linear_to_mel(np.abs(D)**hparams.magnitude_power, hparams)
X

array([[ 0.16492915,  0.15278311,  0.03522913, ...,  0.00281994,
         0.00370979,  0.01295438],
       [ 0.1792833 ,  0.15739952,  0.0234163 , ...,  0.02166474,
         0.01651033,  0.02399624],
       [ 0.12782892,  0.10772665,  0.03290429, ...,  0.34709217,
         0.25278702,  0.14477078],
       ..., 
       [ 0.00934764,  0.00888335,  0.00727561, ...,  0.00350095,
         0.00246946,  0.00152844],
       [ 0.00999921,  0.00808394,  0.00559155, ...,  0.00542392,
         0.00428026,  0.00226168],
       [ 0.00708625,  0.00763198,  0.00699472, ...,  0.01130883,
         0.00653545,  0.00270368]])

In [66]:
s=_amp_to_db(X,hparams) - hparams.ref_level_db



In [67]:
Y=_normalize(s, hparams)
Y.shape

(80, 287)

In [12]:
min_level = np.exp(hparams.min_level_db / 20 * np.log(10))

In [13]:
20 * np.log10(min_level)


-100.00000000000001

In [37]:
(2 * hparams.max_abs_value) * ((S - hparams.min_level_db) / (-hparams.min_level_db)) - hparams.max_abs_value,

NameError: name 'S' is not defined

In [76]:
import numpy
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the sample rate of the signal we are working with, in Hz. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
    """
    highfreq= highfreq or samplerate/2
    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"

    # compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq)
    print(lowmel)
    highmel = hz2mel(highfreq)
    print(highmel)
    melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
    print(melpoints)
    # our points are in Hz, but we use fft bins, so we have to convert
    #  from Hz to fft bin number
    bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
    print(len(bin))

    fbank = numpy.zeros([nfilt,nfft//2+1])
    for j in range(0,nfilt):
        for i in range(int(bin[j]), int(bin[j+1])):
            fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
        for i in range(int(bin[j+1]), int(bin[j+2])):
            fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
    return fbank

def hz2mel(hz):
    """Convert a value in Hertz to Mels

    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    return 2595 * numpy.log10(1+hz/700.)

def mel2hz(mel):
    """Convert a value in Mels to Hertz

    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    return 700*(10**(mel/2595.0)-1)


In [78]:
a=get_filterbanks(nfilt=80,nfft=800,samplerate=16000,lowfreq=55,highfreq=7600)
a.max()

85.2429256407
2786.97823588
[   85.24292564   118.59768256   151.95243947   185.30719639   218.66195331
   252.01671022   285.37146714   318.72622406   352.08098097   385.43573789
   418.79049481   452.14525172   485.50000864   518.85476556   552.20952247
   585.56427939   618.91903631   652.27379322   685.62855014   718.98330705
   752.33806397   785.69282089   819.0475778    852.40233472   885.75709164
   919.11184855   952.46660547   985.82136239  1019.1761193   1052.53087622
  1085.88563314  1119.24039005  1152.59514697  1185.94990389  1219.3046608
  1252.65941772  1286.01417464  1319.36893155  1352.72368847  1386.07844539
  1419.4332023   1452.78795922  1486.14271613  1519.49747305  1552.85222997
  1586.20698688  1619.5617438   1652.91650072  1686.27125763  1719.62601455
  1752.98077147  1786.33552838  1819.6902853   1853.04504222  1886.39979913
  1919.75455605  1953.10931297  1986.46406988  2019.8188268   2053.17358372
  2086.52834063  2119.88309755  2153.23785447  2186.59261138 

1.0