In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES']='6'

from synthesizer.inference import Synthesizer
from IPython.display import Audio
import tensorflow as tf

import numpy as np
from pathlib import Path
import librosa
from tqdm import tqdm
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import random
%matplotlib inline
from sklearn.manifold import TSNE

In [3]:
def _split_title_line(title_text, max_words=5):
	"""Wrap ``title_text`` onto several lines holding at most ``max_words`` words each."""
	words = title_text.split()
	return "\n".join(" ".join(words[i:i + max_words]) for i in range(0, len(words), max_words))


def plot_spectrogram(pred_spectrogram, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
	"""Draw a predicted spectrogram, optionally below a target spectrogram.

	Parameters
	----------
	pred_spectrogram : array
		Predicted spectrogram; rotated 90 degrees with ``np.rot90`` before display.
	title : str, optional
		Caption written under the plots. Nothing is written when it is None.
	split_title : bool, optional
		Wrap ``title`` onto several lines before drawing it.
	target_spectrogram : array, optional
		Reference spectrogram. When given it is drawn above the prediction.
	max_len : int, optional
		Keep only the first ``max_len`` entries along axis 0 of each spectrogram.
	auto_aspect : bool, optional
		Pass ``aspect="auto"`` to ``imshow``.
	"""
	if max_len is not None:
		pred_spectrogram = pred_spectrogram[:max_len]
		# The target is optional; slicing None would raise a TypeError.
		if target_spectrogram is not None:
			target_spectrogram = target_spectrogram[:max_len]

	# The visible notebook never defines ``split_title_line``, so the wrapping
	# is done by the private helper above.
	if split_title and title is not None:
		title = _split_title_line(title)

	fig = plt.figure(figsize=(10, 8))
	# Common caption for the whole figure
	if title is not None:
		fig.text(0.5, 0.18, title, horizontalalignment="center", fontsize=16)

	# imshow options shared by both panels
	imshow_kwargs = {"interpolation": "none"}
	if auto_aspect:
		imshow_kwargs["aspect"] = "auto"

	# Target spectrogram subplot
	if target_spectrogram is not None:
		ax1 = fig.add_subplot(311)
		ax2 = fig.add_subplot(312)
		im = ax1.imshow(np.rot90(target_spectrogram), **imshow_kwargs)
		ax1.set_title("Target Mel-Spectrogram")
		fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax1)
		ax2.set_title("Predicted Mel-Spectrogram")
	else:
		ax2 = fig.add_subplot(211)

	im = ax2.imshow(np.rot90(pred_spectrogram), **imshow_kwargs)
	fig.colorbar(mappable=im, shrink=0.65, orientation="horizontal", ax=ax2)

	plt.tight_layout()
	fig.show()

In [4]:
# Alternative GPU selection kept for reference (the import cell selects device '6').
# os.environ['CUDA_VISIBLE_DEVICES']='7'

# Override hyperparameters on the class-level Synthesizer.hparams object.
# A later cell constructs the Synthesizer, so these must be set first.
# NOTE(review): presumably these have to match the checkpoint's training setup — confirm.
Synthesizer.hparams.tacotron_num_gpus = 1

Synthesizer.hparams.postnet_num_layers = 5
Synthesizer.hparams.outputs_per_step=2


In [4]:
# Load the vocoder weights; infer_waveform() is used in later cells to turn a
# mel spectrogram into audio.
# NOTE(review): the absolute path is machine-specific.
from vocoder.inference import load_model, infer_waveform
wavrnn = load_model('/home/server/Real-Time-Voice-Cloning/vocoder/saved_models/vctk/vctk.pt')

Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at /home/server/Real-Time-Voice-Cloning/vocoder/saved_models/vctk/vctk.pt


In [5]:

# Directory holding the pretrained Tacotron checkpoint to restore.
checkpoints_dir = Path('synthesizer/saved_models/logs-best_of_ssb_2_gpus/taco_pretrained/')

# Synthesizer instance shared by every synthesis cell below.
synthesizer = Synthesizer(checkpoints_dir, low_mem=False)


Found synthesizer "best_of_ssb_2_gpus" trained to step 116000


In [41]:
# Sentences to synthesize; each entry yields one spectrogram.
# Other sentences tried before: 'He would not do that.', 'i am good today'
texts = ['There were many editions of these works still being used in the nineteenth century.']

# Utterance whose stored embedding conditions the synthesis.
# Other ids tried: validation p260_195 / p260_030, test p340_355 / p231_010, male p285_342.
# A g-vector alternative lives at dataset/vctk/synthesizer/embed_gvector2/embed-p287_117.wav.npy
name = 'p260_195'

embed = np.load(f'datasets/vctk_clean/synthesizer/embeds/embed-{name}.wav.npy')
# Same embedding repeated once per sentence.
embeds = np.stack([embed for _ in texts])
specs = synthesizer.synthesize_spectrograms(texts, embeds)
# Size of each spectrogram along axis 1, used by the next cell to cut the waveform.
breaks = [s.shape[1] for s in specs]

spec = np.concatenate(specs, axis=1)
assert spec is not None

# Waveform reconstructed with Synthesizer.griffin_lim.
wav = Synthesizer.griffin_lim(spec)

In [42]:
# Cut the waveform at the spectrogram boundaries, put 0.15 s of silence after
# each piece, then peak-normalize to 0.97.
# `breaks` is deliberately left untouched: overwriting it with silence arrays made
# a re-run of this cell produce an all-zero signal and NaN after normalization.
# NOTE: `wav` is still replaced in place, so with several sentences re-run the
# synthesis cell before re-running this one.
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
silence = np.zeros(int(0.15 * Synthesizer.sample_rate))
wav = np.concatenate([piece for w in wavs for piece in (w, silence)])
peak = np.abs(wav).max()
if peak > 0:
    # An all-zero signal is left as is instead of dividing by zero.
    wav = wav / peak * 0.97

In [43]:
# Play the waveform produced by the Griffin-Lim cells above.
Audio(wav,rate=Synthesizer.sample_rate)

In [44]:
# Vocode the spectrogram and play that result. The original cell played `wav`,
# which is the Griffin-Lim output already played by the previous cell.
neural_wav = infer_waveform(spec)
Audio(neural_wav, rate=Synthesizer.sample_rate)

{| ████████████████ 57000/57600 | Batch Size: 6 | Gen Rate: 9.9kHz | }

In [None]:
# Save the vocoder output as a 16 kHz wav file.
# NOTE(review): `librosa.output` was removed in librosa 0.8.0 — confirm the installed version still has it.
# NOTE(review): the file name says p340 while `name` above is p260_195 — confirm the intended target.
librosa.output.write_wav('/home/server/Real-Time-Voice-Cloning/temp/0308/vctk_p340.wav', neural_wav, sr=16000)

In [None]:
# Play the stored audio array for the utterance `name`, for comparison with the synthesis.
# NOTE(review): this reads from datasets/vctk/ while the embedding came from datasets/vctk_clean/ — confirm.
Audio(np.load('datasets/vctk/synthesizer/audio/audio-' + name + '.wav.npy'), rate=16000)

In [None]:
# Write the current `wav` array to vctk.wav at 16 kHz.
# librosa is already imported in the first cell; this import is redundant.
# NOTE(review): `librosa.output` was removed in librosa 0.8.0 — confirm the installed version still has it.
import librosa
librosa.output.write_wav('vctk.wav', wav, sr=16000)

## Mel-spectrogram extraction

In [18]:
# One tuple per manifest row: (mel filename, embedding filename, [text]).
# Rows are '|'-separated; fields 1 and 2 are the filenames and the last field is the text.
# Swap in .../synthesizer/val.txt to process the validation split instead.
pairs = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/test.txt') as testfile:
    pairs.extend(
        (fields[1], fields[2], [fields[-1]])
        for fields in (row.strip().split('|') for row in testfile)
    )

In [19]:


# Synthesize one mel spectrogram per manifest entry, conditioned on the
# embedding file listed for that entry, and save it under save_dir.
save_dir = 'synthesizer/saved_models/logs-vctk_clean_gvector3/syn_mels/'
os.makedirs(save_dir, exist_ok=True)
for mel_filename, embeded_filepath, texts in tqdm(pairs):
    try:
        embed = np.load('datasets/vctk_clean/synthesizer/embed_gvector3/' + embeded_filepath)
        embeds = np.stack([embed] * len(texts))
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        np.save(save_dir + mel_filename, spec)
    except FileNotFoundError:
        print('embedding not found: ' + embeded_filepath)
    except Exception as exc:
        # Still best-effort, but the real cause is reported. The old bare `except`
        # also swallowed KeyboardInterrupt and labelled everything "not found".
        print('synthesis failed for ' + mel_filename + ': ' + repr(exc))

100%|██████████| 1421/1421 [06:31<00:00,  3.63it/s]


## test set random embedding synthesis

In [22]:
# Group manifest rows by speaker: speaker id -> [(mel filename, embedding filename, [text]), ...].
# The speaker id is characters 4..7 of the mel filename (e.g. 'mel-p260_176.wav.npy' -> 'p260').
# Swap in .../synthesizer/val.txt to process the validation split instead.
spk2pairs = {}
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/test.txt') as testfile:
    for row in testfile:
        fields = row.strip().split('|')
        spk2pairs.setdefault(fields[1][4:8], []).append((fields[1], fields[2], [fields[-1]]))

In [23]:
# For every utterance, synthesize its text with the embedding of a *different*
# utterance from the same speaker, and record which embedding was used.
save_dir = 'synthesizer/saved_models/logs-vctk_clean_gvector3/syn_rande_mels/'
os.makedirs(save_dir, exist_ok=True)
# Dedicated seeded generator so the pairing is identical on every re-run.
rng = random.Random(0)

# The context manager flushes and closes the manifest; the original never closed it.
with open(save_dir + 'embed_mel_pairs.txt', 'w') as pwf:
    for spk in tqdm(spk2pairs.keys()):
        utterances = spk2pairs[spk]
        numutt = len(utterances)
        if numutt < 2:
            # No other utterance to borrow from. The old shuffle-until-different
            # loop never terminated in this case.
            print('skipping speaker with fewer than two utterances: ' + spk)
            continue
        for i in range(numutt):
            # Uniform draw over every index except i.
            embed_id = rng.randrange(numutt - 1)
            if embed_id >= i:
                embed_id += 1

            mel_filename, _, texts = utterances[i]
            _, embeded_filepath, _ = utterances[embed_id]
            try:
                embed = np.load('datasets/vctk_clean/synthesizer/embed_gvector3/' + embeded_filepath)
                embeds = np.stack([embed] * len(texts))
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = np.concatenate(specs, axis=1)
                np.save(save_dir + mel_filename, spec)
                pwf.write(mel_filename + '|' + embeded_filepath + '|' + texts[0] + ' \n')
            except FileNotFoundError:
                print('embedding not found: ' + embeded_filepath)
            except Exception as exc:
                # Best-effort, but report the real cause and let KeyboardInterrupt through.
                print('synthesis failed for ' + mel_filename + ': ' + repr(exc))

100%|██████████| 8/8 [06:37<00:00, 49.74s/it]


## Librispeech synthesis

In [24]:
# One tuple per LibriSpeech manifest row: (mel filename, embedding filename, [text]).
pairs = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/test.txt') as testfile:
    pairs.extend(
        (fields[1], fields[2], [fields[-1]])
        for fields in (row.strip().split('|') for row in testfile)
    )

In [25]:
from tqdm import tqdm

# Synthesize one mel spectrogram per LibriSpeech manifest entry, conditioned on
# the embedding file listed for that entry, and save it under save_dir.
save_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_mels_nj/'
os.makedirs(save_dir, exist_ok=True)
for mel_filename, embeded_filepath, texts in tqdm(pairs):
    try:
        embed = np.load('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/embeds_gvector/' + embeded_filepath)
        embeds = np.stack([embed] * len(texts))
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        np.save(save_dir + mel_filename, spec)
    except FileNotFoundError:
        print('embedding not found: ' + embeded_filepath)
    except Exception as exc:
        # Still best-effort, but the real cause is reported. The old bare `except`
        # also swallowed KeyboardInterrupt and labelled everything "not found".
        print('synthesis failed for ' + mel_filename + ': ' + repr(exc))



100%|██████████| 2489/2489 [17:06<00:00,  2.42it/s]


## Librispeech synthesis with random embedding from the same speaker

In [26]:
# Group LibriSpeech manifest rows by speaker.
# The speaker key is the second '-'-separated token of the mel filename.
spk2pairs = {}
with open('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/test.txt') as testfile:
    for row in testfile:
        fields = row.strip().split('|')
        speaker_id = fields[1].split('-')[1]
        spk2pairs.setdefault(speaker_id, []).append((fields[1], fields[2], [fields[-1]]))

In [27]:
# For every LibriSpeech utterance, synthesize its text with the embedding of a
# *different* utterance from the same speaker, and record which one was used.
save_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_rande_mels_nj/'
os.makedirs(save_dir, exist_ok=True)
# Dedicated seeded generator so the pairing is identical on every re-run.
rng = random.Random(0)

# The context manager closes the manifest even if the loop is interrupted.
with open(save_dir + 'embed_mel_pairs.txt', 'w') as pwf:
    for spk in tqdm(spk2pairs.keys()):
        utterances = spk2pairs[spk]
        numutt = len(utterances)
        if numutt < 2:
            # No other utterance to borrow from. The old shuffle-until-different
            # loop never terminated in this case.
            print('skipping speaker with fewer than two utterances: ' + spk)
            continue
        for i in range(numutt):
            # Uniform draw over every index except i.
            embed_id = rng.randrange(numutt - 1)
            if embed_id >= i:
                embed_id += 1

            mel_filename, _, texts = utterances[i]
            _, embeded_filepath, _ = utterances[embed_id]
            try:
                embed = np.load('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/embeds_gvector/' + embeded_filepath)
                embeds = np.stack([embed] * len(texts))
                specs = synthesizer.synthesize_spectrograms(texts, embeds)
                spec = np.concatenate(specs, axis=1)
                np.save(save_dir + mel_filename, spec)
                pwf.write(mel_filename + '|' + embeded_filepath + '|' + texts[0] + ' \n')
            except FileNotFoundError:
                print('embedding not found: ' + embeded_filepath)
            except Exception as exc:
                # Best-effort, but report the real cause and let KeyboardInterrupt through.
                print('synthesis failed for ' + mel_filename + ': ' + repr(exc))

100%|██████████| 8/8 [17:32<00:00, 131.62s/it]


## MCD calculation

In [28]:
def melcd(array1, array2):
    """Mel-cepstrum distortion between two equally shaped arrays.

    Parameters
    ----------
    array1, array2 : array, shape (`T`, `dim`) or shape (`dim`)
        Original and target features. Both must have the same shape.

    Returns
    -------
    mcd : scalar, number >= 0
        ``10 / ln(10) * sqrt(2 * sum(diff ** 2))``. For 2-D input this is
        computed per row and averaged over the rows.

    Raises
    ------
    ValueError
        If the shapes differ, or if the inputs are neither 1-D nor 2-D.
    """
    if array1.shape != array2.shape:
        raise ValueError(
            "The shapes of both arrays are different \
            : {} / {}".format(array1.shape, array2.shape))

    if array1.ndim not in (1, 2):
        raise ValueError("Dimension mismatch")

    squared_error = (array1 - array2) ** 2
    if array1.ndim == 1:
        # Single frame: one distortion value.
        return 10.0 / np.log(10) * np.sqrt(2.0 * np.sum(squared_error))

    # Sequence of frames: average of the per-frame distortions.
    per_frame = np.sqrt(2.0 * np.sum(squared_error, axis=1))
    return 10.0 / np.log(10) * np.mean(per_frame)

In [2]:
from fastdtw import fastdtw
def estimate_twf(orgdata, tardata, distance='melcd', fast=True, otflag=None):
    """time warping function estimator
    Parameters
    ---------
    orgdata : array, shape(`T_org`, `dim`)
        Array of source feature
    tardata : array, shape(`T_tar`, `dim`)
        Array of target feature
    distance : str, optional
        distance function
        `melcd` : mel-cepstrum distortion
    fast : bool, optional
        Use fastdtw instead of dtw
        Default set to `True`
    otflag : str,
        Perform alignment into either original or target length
        `org` : align into original length
        `tar` : align into target length
        Default set to None
    Returns
    ---------
    twf : array, shape(`2`, `T`)
        Time warping function between original and target
    mcd_distance : float or None
        Distance returned by `fastdtw`; `None` on every path that does not
        use `fastdtw` (`fast=False` or `otflag` given).
    Raises
    ---------
    ValueError
        If `distance` is anything other than 'melcd'.
    """

    if distance == 'melcd':
        def distance_func(x, y): return melcd(x, y)
    else:
        raise ValueError('other distance metrics than melcd does not support.')

    # Only the fastdtw branch produces a distance. Without this default the
    # final return raised UnboundLocalError on all other branches.
    mcd_distance = None

    if otflag is None:
        # use dtw or fastdtw
        if fast:
            mcd_distance, path = fastdtw(orgdata, tardata, dist=distance_func)
            twf = np.array(path).T
        else:
            # NOTE(review): `dtw` is not imported in the visible notebook.
            _, _, _, twf = dtw(orgdata, tardata, distance_func)
    else:
        # use dtw_c to align target/original feature vector
        # NOTE(review): `dtw_c` is not imported in the visible notebook.
        ldim = orgdata.shape[1] - 1
        if otflag == 'org':
            _, twf, _, _ = dtw_c.dtw_org_to_trg(tardata, orgdata,
                                                0, ldim, 5.0, 100.0, 100.0)
        else:
            _, twf, _, _ = dtw_c.dtw_org_to_trg(orgdata, tardata,
                                                0, ldim, 5.0, 100.0, 100.0)
        twf[:, 1] = np.array(range(twf.shape[0]))  # replace target index by frame number
        twf = twf.T
        if otflag == 'org':
            twf = twf[::-1, :]  # swap cols
            assert twf.shape[0] == orgdata.shape[0]
        else:
            assert twf.shape[1] == tardata.shape[0]

    return twf, mcd_distance

def normalize(v):
    """Scale ``v`` to unit norm; a zero-norm input is returned unchanged."""
    length = np.linalg.norm(v)
    return v if length == 0 else v / length

In [30]:
# Mel filenames (manifest field 1) of the split being scored.
# Swap in .../synthesizer/test.txt to score the test split instead.
mel_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/val.txt') as testfile:
    mel_files.extend(row.strip().split('|')[1] for row in testfile)
mel_files[1:3]

['mel-p260_176.wav.npy', 'mel-p260_189.wav.npy']

In [31]:
# Mel-cepstral distortion between each ground-truth mel and its synthesized
# counterpart, after aligning the two with estimate_twf.
# To score the matched-embedding run, use 'syn_mels/' and 'reports/mcd_report' instead.
ori_filepath = '/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/mels/'
syn_filepath = 'synthesizer/saved_models/logs-vctk_clean_gvector3/syn_rande_mels/'
mcds = []
# The context manager closes the report even if the loop is interrupted.
with open('synthesizer/saved_models/logs-vctk_clean_gvector3/reports/mcd_rande_report', 'w') as mcd_report:
    for mel in tqdm(mel_files):
        try:
            ori_mel = np.load(ori_filepath + mel) / Synthesizer.hparams.max_abs_value
            # Synthesized mels are transposed so axis 1 lines up with the originals.
            syn_mel = np.load(syn_filepath + mel).T / Synthesizer.hparams.max_abs_value
            assert ori_mel.shape[1] == syn_mel.shape[1]
            alignment, _ = estimate_twf(ori_mel, syn_mel)
            mcd = melcd(ori_mel[alignment[0]], syn_mel[alignment[1]])
        except FileNotFoundError:
            print('file not found: ' + mel)
            continue
        except Exception as exc:
            # Shape or alignment failures are no longer reported as "file not found".
            print('MCD failed for ' + mel + ': ' + repr(exc))
            continue
        mcds.append(mcd)
        mcd_report.write(mel + ' ' + str(mcd) + '\n')

100%|██████████| 800/800 [00:31<00:00, 25.75it/s]


In [32]:
# Average MCD over the files that were scored successfully in the previous cell.
np.mean(mcds)

9.311786162603786

## Librispeech MCD

In [None]:
# Mel filenames (manifest field 1) of the LibriSpeech test split.
mel_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/test.txt') as testfile:
    mel_files.extend(row.strip().split('|')[1] for row in testfile)
mel_files[1:3]

In [None]:
# Mel-cepstral distortion between each LibriSpeech ground-truth mel and its
# synthesized counterpart, after aligning the two with estimate_twf.
ori_filepath = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/mels/'
syn_filepath = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_mels_nj/'
mcds = []
for mel in tqdm(mel_files):
    try:
        ori_mel = np.load(ori_filepath + mel) / Synthesizer.hparams.max_abs_value
        # Synthesized mels are transposed so axis 1 lines up with the originals.
        syn_mel = np.load(syn_filepath + mel).T / Synthesizer.hparams.max_abs_value
        assert ori_mel.shape[1] == syn_mel.shape[1]
        alignment, _ = estimate_twf(ori_mel, syn_mel)
        mcds.append(melcd(ori_mel[alignment[0]], syn_mel[alignment[1]]))
    except FileNotFoundError:
        print('file not found: ' + mel)
    except Exception as exc:
        # Shape or alignment failures are no longer reported as "file not found".
        print('MCD failed for ' + mel + ': ' + repr(exc))
# This cell opens no report file, so the stale `mcd_report.close()` copied from
# the VCTK cell has been dropped.

In [None]:
# Average MCD over the LibriSpeech files scored successfully in the previous cell.
np.mean(mcds)

## Embedding extraction -> /home/server/workspace/tensor_spv_zx/get_gvector.ipynb

## embedding l2 distance 

In [None]:
# L2 distance between the stored embedding of each utterance and the embedding
# stored under the same filename in syn_embed_dir.
embed_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/test.txt') as testfile:
    for line in testfile:
        embed_files.append(line.strip().split('|')[2])
scores = []

org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/embeds/'
syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/synthesizer/saved_models/logs-vctk_clean_gvector3/syn_gvectors/'
# `embed_file` rather than `embed`, so the embedding array loaded by earlier cells is not clobbered.
for embed_file in embed_files:
    try:
        org_embed = np.load(org_embed_dir + embed_file)
        syn_embed = np.load(syn_embed_dir + embed_file)
        scores.append(np.linalg.norm(org_embed - syn_embed))
    except FileNotFoundError:
        print('file not found: ' + embed_file)
    except Exception as exc:
        # e.g. mismatching shapes; no longer reported as a missing file.
        print('could not score ' + embed_file + ': ' + repr(exc))

In [None]:
# Mean L2 distance over the embedding pairs that were scored.
print(np.mean(scores))

## Embedding cosine similarity

In [13]:
# Cosine similarity (dot product of the unit-normalized vectors) between the
# stored embedding of each utterance and the one saved in syn_embed_dir.
# Swap in .../synthesizer/val.txt for the validation split, or 'syn_gvectors/'
# for the matched-embedding run.
embed_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/test.txt') as testfile:
    for line in testfile:
        embed_files.append(line.strip().split('|')[2])
cos_scores = []

org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/embeds/'
syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/synthesizer/saved_models/logs-vctk_clean_gvector3/syn_rande_gvectors/'
# `embed_file` rather than `embed`, so the embedding array loaded by earlier cells is not clobbered.
for embed_file in embed_files:
    try:
        org_embed = np.load(org_embed_dir + embed_file)
        syn_embed = np.load(syn_embed_dir + embed_file)
        cos_scores.append(normalize(org_embed).dot(normalize(syn_embed).T))
    except FileNotFoundError:
        print('file not found: ' + embed_file)
    except Exception as exc:
        # e.g. mismatching shapes; no longer reported as a missing file.
        print('could not score ' + embed_file + ': ' + repr(exc))

In [14]:
# Mean cosine similarity over the embedding pairs that were scored.
print(np.mean(cos_scores))

0.33282506


## Librispeech cosine similarity

In [17]:
# Cosine similarity (dot product of the unit-normalized vectors) between the
# stored embedding of each LibriSpeech utterance and the one saved in syn_embed_dir.
# Other result directories evaluated with this cell:
#   .../synthesizer/syn_gvector_nj/, .../synthesizer/syn_gvector/, .../synthesizer/syn_gvector_rande/
embed_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/test.txt') as testfile:
    for line in testfile:
        embed_files.append(line.strip().split('|')[2])
cos_scores = []

org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/embeds_gvector/'
syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_gvector_rande_nj/'
# `embed_file` rather than `embed`, so the embedding array loaded by earlier cells is not clobbered.
for embed_file in embed_files:
    try:
        org_embed = np.load(org_embed_dir + embed_file)
        syn_embed = np.load(syn_embed_dir + embed_file)
        cos_scores.append(normalize(org_embed).dot(normalize(syn_embed).T))
    except FileNotFoundError:
        print('file not found: ' + embed_file)
    except Exception as exc:
        # e.g. mismatching shapes; no longer reported as a missing file.
        print('could not score ' + embed_file + ': ' + repr(exc))

In [18]:
# Mean cosine similarity over the LibriSpeech embedding pairs that were scored.
print(np.mean(cos_scores))

0.1385242


## EER calculation -> /home/server/workspace/tensor_spv_zx/get_eer.ipynb

# FINAL RESULTS

### TEST set:
 - MCD: 9.31  
 - COSSIM: 0.377186  
 - EER: 15.017  

### VAL set:
 - MCD: 8.43
 - COSSIM: 0.4268
 - EER: 11.38
 
# Using another embedding from the same speaker
### TEST set:
 - MCD 9.919744882840039
 
### VAL set:
 - MCD 9.510865505031804

## Wav synthesis

In [None]:
# One tuple per manifest row: (mel filename from field 1, output wav name from field 0).
# Swap in .../synthesizer/test.txt to process the test split instead.
pairs = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/val.txt') as testfile:
    pairs.extend(
        (fields[1], fields[0])
        for fields in (row.strip().split('|') for row in testfile)
    )

In [None]:
# Vocode every synthesized mel spectrogram and write it out as a 16 kHz wav.
save_dir = 'synthesizer/saved_models/logs-vctk_clean_gvector3/syn_wavs/'
mel_dir = 'synthesizer/saved_models/logs-vctk_clean_gvector3/syn_mels/'
os.makedirs(save_dir, exist_ok=True)
for mel_filename, wav_filepath in tqdm(pairs):
    try:
        mel = np.load(mel_dir + mel_filename)
        neural_wav = infer_waveform(mel)
        # NOTE(review): `librosa.output` was removed in librosa 0.8.0 — confirm the installed version.
        librosa.output.write_wav(save_dir + wav_filepath, neural_wav, sr=16000)
    except FileNotFoundError:
        print('mel not found: ' + mel_filename)
    except Exception as exc:
        # Vocoder or write failures are no longer reported as a missing mel.
        print('wav synthesis failed for ' + mel_filename + ': ' + repr(exc))

## Test set PCA visualization

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
%matplotlib inline
# 2-component PCA. Not referenced by the cells below, which build and use `pca_matrix` instead.
pca = PCA(n_components=2)

In [None]:
# Embedding filenames (manifest field 2) of the VCTK test split.
embed_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/test.txt') as testfile:
    embed_files.extend(row.strip().split('|')[2] for row in testfile)
embed_files[:2]

In [None]:
# Speaker id per embedding file: characters 6..9 of the filename
# (e.g. 'embed-p285_342.wav.npy' -> 'p285').
# NOTE: `spk` is reassigned by the LibriSpeech cells further down, so re-run this
# cell before re-running the VCTK plots.
spk = [i[6:10] for i in embed_files]
spk[:2]

In [None]:
# Speaker id -> index (as a string) that is appended to 'C' to form the scatter colour.
mapping = {speaker: str(position) for position, speaker in enumerate(set(spk))}
mapping

In [None]:
# Load the embedding stored in syn_embed_dir for every test utterance into one array,
# one row per entry of embed_files (same order).
syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/synthesizer/saved_models/logs-vctk_clean_gvector3/syn_gvectors/'
embeds = np.array([np.load(syn_embed_dir + x).tolist() for x in embed_files])
embeds.shape

In [None]:
# 2-component PCA used for the projections below; it is re-fitted by each fit_transform call.
pca_matrix = PCA(n_components=2)

In [None]:
# Fit the PCA on the `embeds` array and project it to 2-D.
pca_embed = pca_matrix.fit_transform(embeds)
pca_embed.shape

In [None]:
# Scatter of the 2-D PCA projection of `embeds`, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

for speaker, color_index in mapping.items():
    rows = np.where(np.array(spk) == speaker)[0]
    ax.scatter(pca_embed[rows, 0], pca_embed[rows, 1], c='C' + color_index, s=20, label=speaker)

ax.legend()
ax.grid()

In [None]:
# Load the embeddings stored in org_embed_dir for the same files, then fit one PCA on both sets.
# Rows [0, len(embed_files)) of the result come from `embeds`, the remaining rows from `org_embeds`.
org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/embeds/'
org_embeds = np.array([np.load(org_embed_dir + x).tolist() for x in embed_files])

pca_embed_mix = pca_matrix.fit_transform(np.concatenate([embeds, org_embeds]))



In [None]:
# Joint PCA projection: rows from `embeds` (syn_embed_dir) as dots, rows from
# `org_embeds` (org_embed_dir) as stars, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

spk_array = np.array(spk)
# Loop-invariant, so set once; the extra 20 on the left is kept from the original.
ax.set_xlim(np.min(pca_embed_mix[:, 0]) - 20, np.max(pca_embed_mix[:, 0]))
for i in mapping.keys():
    indexes = np.where(spk_array == i)[0]
    # org_embeds rows follow the len(spk) rows of `embeds` in the concatenated matrix.
    org_indexes = indexes + len(spk)
    # Distinct labels: both series used to share one legend entry name.
    ax.scatter(pca_embed_mix[indexes, 0], pca_embed_mix[indexes, 1], c='C' + mapping[i], s=20, label=i + ' (synthesized)')
    ax.scatter(pca_embed_mix[org_indexes, 0], pca_embed_mix[org_indexes, 1], c='C' + mapping[i], s=20, label=i + ' (original)', marker='*')
ax.legend()
ax.grid()

# TSNE

In [None]:


# Joint 2-D t-SNE of `embeds` followed by `org_embeds`.
# Fixed random_state so the stochastic layout is reproducible across kernel restarts.
tsne_embed_mix = TSNE(n_components=2, random_state=0).fit_transform(np.concatenate([embeds, org_embeds]))

In [None]:
# Joint t-SNE projection: rows from `embeds` as crosses, rows from `org_embeds`
# as triangles, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

spk_array = np.array(spk)
# Loop-invariant, so set once; the extra 25 on the left is kept from the original.
ax.set_xlim(np.min(tsne_embed_mix[:, 0]) - 25, np.max(tsne_embed_mix[:, 0]))
for i in mapping.keys():
    indexes = np.where(spk_array == i)[0]
    # org_embeds rows follow the len(spk) rows of `embeds` in the concatenated matrix.
    org_indexes = indexes + len(spk)
    # Distinct labels: both series used to share one legend entry name.
    ax.scatter(tsne_embed_mix[indexes, 0], tsne_embed_mix[indexes, 1], c='C' + mapping[i], s=15, label=i + ' (synthesized)', marker='x')
    ax.scatter(tsne_embed_mix[org_indexes, 0], tsne_embed_mix[org_indexes, 1], c='C' + mapping[i], s=15, label=i + ' (original)', marker='^')
ax.legend()
ax.grid()

# T-SNE for random embedding

In [None]:
# Same loading as for the PCA section, but the second set is read from the
# syn_rande_gvectors/ directory. Row order follows embed_files in both arrays.
Rorg_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/vctk_clean/synthesizer/embeds/'
Rorg_embeds = np.array([np.load(Rorg_embed_dir + x).tolist() for x in embed_files])
Rsyn_embed_dir = '/home/server/Real-Time-Voice-Cloning/synthesizer/saved_models/logs-vctk_clean_gvector3/syn_rande_gvectors/'
Rembeds = np.array([np.load(Rsyn_embed_dir + x).tolist() for x in embed_files])

In [None]:
# Joint 2-D t-SNE of `Rembeds` followed by `Rorg_embeds`.
# Fixed random_state so the stochastic layout is reproducible across kernel restarts.
Rtsne_embed_mix = TSNE(n_components=2, random_state=0).fit_transform(np.concatenate([Rembeds, Rorg_embeds]))

In [None]:
# Joint t-SNE projection for the random-embedding run: rows from `Rembeds` as
# crosses, rows from `Rorg_embeds` as triangles, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

spk_array = np.array(spk)
# Loop-invariant, so set once; the extra 25 on the left is kept from the original.
ax.set_xlim(np.min(Rtsne_embed_mix[:, 0]) - 25, np.max(Rtsne_embed_mix[:, 0]))
for i in mapping.keys():
    indexes = np.where(spk_array == i)[0]
    # Rorg_embeds rows follow the len(spk) rows of `Rembeds` in the concatenated matrix.
    org_indexes = indexes + len(spk)
    # Distinct labels: both series used to share one legend entry name.
    ax.scatter(Rtsne_embed_mix[indexes, 0], Rtsne_embed_mix[indexes, 1], c='C' + mapping[i], s=15, label=i + ' (synthesized)', marker='x')
    ax.scatter(Rtsne_embed_mix[org_indexes, 0], Rtsne_embed_mix[org_indexes, 1], c='C' + mapping[i], s=15, label=i + ' (original)', marker='^')
ax.legend()
ax.grid()

# TSNE Librispeech

In [None]:
# Embedding filenames (manifest field 2) of the LibriSpeech test split.
# Any row containing the substring '2230' anywhere is left out.
# NOTE(review): presumably meant to drop speaker 2230; the test matches the whole row — confirm.
lib_embed_files = []
with open('/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/test.txt') as testfile:
    lib_embed_files.extend(
        row.strip().split('|')[2] for row in testfile if '2230' not in row
    )
lib_embed_files[:2]

In [None]:
# Load, for every file in lib_embed_files, the embedding stored in embeds_gvector/
# and the one stored in syn_gvector_nj/. Row order follows lib_embed_files in both arrays.
lib_org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/embeds_gvector/'
lib_org_embeds = np.array([np.load(lib_org_embed_dir + x).tolist() for x in lib_embed_files])
lib_syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_gvector_nj/'
lib_embeds = np.array([np.load(lib_syn_embed_dir + x).tolist() for x in lib_embed_files])

In [None]:
# Joint 2-D t-SNE of `lib_embeds` followed by `lib_org_embeds`.
# Fixed random_state so the stochastic layout is reproducible across kernel restarts.
lib_embed_mix = TSNE(n_components=2, random_state=0).fit_transform(np.concatenate([lib_embeds, lib_org_embeds]))

In [None]:
# Speaker key per LibriSpeech embedding file: the second '-'-separated token of the filename.
# NOTE: this overwrites the `spk` list built for the VCTK plots above.
spk = [i.split('-')[1] for i in lib_embed_files]
spk[:2]

In [None]:
# Speaker key -> index (as a string) that is appended to 'C' to form the scatter colour.
lib_mapping = {speaker: str(position) for position, speaker in enumerate(set(spk))}
lib_mapping

In [None]:
# Joint LibriSpeech t-SNE projection: rows from `lib_embeds` as crosses, rows
# from `lib_org_embeds` as triangles, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

spk_array = np.array(spk)
# Loop-invariant, so set once; the extra 25 on the left is kept from the original.
ax.set_xlim(np.min(lib_embed_mix[:, 0]) - 25, np.max(lib_embed_mix[:, 0]))
for i in lib_mapping.keys():
    indexes = np.where(spk_array == i)[0]
    # lib_org_embeds rows follow the len(spk) rows of `lib_embeds` in the concatenated matrix.
    org_indexes = indexes + len(spk)
    # Distinct labels: both series used to share one legend entry name.
    ax.scatter(lib_embed_mix[indexes, 0], lib_embed_mix[indexes, 1], c='C' + lib_mapping[i], s=15, label=i + ' (synthesized)', marker='x')
    ax.scatter(lib_embed_mix[org_indexes, 0], lib_embed_mix[org_indexes, 1], c='C' + lib_mapping[i], s=15, label=i + ' (original)', marker='^')
ax.legend()
ax.grid()

# TSNE Librispeech with random embedding

In [None]:
# Same loading as in the previous section, but the second set is read from
# syn_gvector_rande_nj/. This overwrites lib_org_embeds and lib_embeds from that section.
lib_org_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/embeds_gvector/'
lib_org_embeds = np.array([np.load(lib_org_embed_dir + x).tolist() for x in lib_embed_files])
lib_syn_embed_dir = '/home/server/Real-Time-Voice-Cloning/datasets/LibriSpeech/synthesizer/syn_gvector_rande_nj/'
lib_embeds = np.array([np.load(lib_syn_embed_dir + x).tolist() for x in lib_embed_files])

In [None]:
# Speaker key per file (second '-'-separated token of the filename) and the
# speaker -> colour-index mapping used by the plot below.
spk = [filename.split('-')[1] for filename in lib_embed_files]
lib_mapping = {speaker: str(position) for position, speaker in enumerate(set(spk))}

In [None]:
# Joint 2-D t-SNE of `lib_embeds` followed by `lib_org_embeds` for the random-embedding run.
# Fixed random_state so the stochastic layout is reproducible across kernel restarts.
lib_embed_mix = TSNE(n_components=2, random_state=0).fit_transform(np.concatenate([lib_embeds, lib_org_embeds]))

In [None]:
# Joint LibriSpeech t-SNE projection for the random-embedding run: rows from
# `lib_embeds` as crosses, rows from `lib_org_embeds` as triangles, one colour per speaker.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)

spk_array = np.array(spk)
# Loop-invariant, so set once; the extra 25 on the left is kept from the original.
ax.set_xlim(np.min(lib_embed_mix[:, 0]) - 25, np.max(lib_embed_mix[:, 0]))
for i in lib_mapping.keys():
    indexes = np.where(spk_array == i)[0]
    # lib_org_embeds rows follow the len(spk) rows of `lib_embeds` in the concatenated matrix.
    org_indexes = indexes + len(spk)
    # Distinct labels: both series used to share one legend entry name.
    ax.scatter(lib_embed_mix[indexes, 0], lib_embed_mix[indexes, 1], c='C' + lib_mapping[i], s=15, label=i + ' (synthesized)', marker='x')
    ax.scatter(lib_embed_mix[org_indexes, 0], lib_embed_mix[org_indexes, 1], c='C' + lib_mapping[i], s=15, label=i + ' (original)', marker='^')
ax.legend()
ax.grid()