# Set up the environment

In [1]:
# Mount Google Drive inside the Colab runtime so the project files stored
# there become reachable through the local file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
# Work from the project folder on Drive: every path used below
# ('spmel_16khz/...', 'checkpoints/...', 'demo/Results/') is relative to it.
%cd /content/gdrive/MyDrive/Voice_coversion

/content/gdrive/MyDrive/Voice_coversion


# Import libraries

In [3]:
!pip install wavenet_vocoder

Collecting wavenet_vocoder
  Downloading wavenet_vocoder-0.1.1.tar.gz (13 kB)
Building wheels for collected packages: wavenet-vocoder
  Building wheel for wavenet-vocoder (setup.py) ... [?25l[?25hdone
  Created wheel for wavenet-vocoder: filename=wavenet_vocoder-0.1.1-py3-none-any.whl size=12680 sha256=ca18b569a0422c92378daeb53f3cf0b48a025604e583b80ceb40c2c1eabdc965
  Stored in directory: /root/.cache/pip/wheels/45/b9/b3/5961fda4d2ba5bc9a8d416844b30d590f597674a690162766f
Successfully built wavenet-vocoder
Installing collected packages: wavenet-vocoder
Successfully installed wavenet-vocoder-0.1.1


In [4]:
import os
import pickle
import numpy as np
import librosa
import torch
import soundfile as sf
from math import ceil
from collections import OrderedDict
from speaker_encoder import D_VECTOR
from model import Generator
from synthesis import build_model
from synthesis import wavegen

# Check the device

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
print(has_cuda)
if has_cuda:
  # Report which GPU was assigned to this runtime.
  print(torch.cuda.get_device_name(0))
device = torch.device('cuda' if has_cuda else 'cpu')

# Prepare the mel spectrogram

In [6]:
# Speaker ids: `origin` is the voice being converted, `target` is the voice
# it is converted into.
origin, target = 'p225', 'p228'

# Each speaker has its own sub-folder of precomputed mel-spectrograms.
mel_root = 'spmel_16khz/'
origin_mel_dir = mel_root + origin
target_mel_dir = mel_root + target

In [7]:
# The first item yielded by os.walk describes the top directory itself as
# (dirpath, dirnames, filenames); index 2 is therefore the list of files
# directly inside each speaker folder. Sorting makes the order deterministic.
origin_file_list = sorted(next(os.walk(origin_mel_dir))[2])
target_file_list = sorted(next(os.walk(target_mel_dir))[2])

# Load the speaker encoder

In [8]:
# Build the speaker encoder, then switch it to evaluation mode and place it
# on the selected device; it is only used for inference in this notebook.
C = D_VECTOR(dim_input=80, dim_cell=768, dim_emb=256)
C = C.eval().to(device)

In [9]:
# Restore the pretrained speaker-encoder weights.
# map_location=device remaps the stored tensors onto the device selected
# above, so the checkpoint also loads on a CPU-only runtime.
c_checkpoint = torch.load('checkpoints/Speaker/3000000-BL.ckpt', map_location=device)

# The first 7 characters of every parameter name are dropped before loading
# (presumably the 'module.' prefix added by nn.DataParallel -- TODO confirm).
new_state_dict = OrderedDict(
    (key[7:], val) for key, val in c_checkpoint['model_b'].items()
)
C.load_state_dict(new_state_dict)
print("Finish loading the speaker encoder")

Finish loading the speaker encoder


# Calculate the speaker embeddings

In [10]:
num_utterances = 10  # use at most this many files per speaker
len_crop = 128       # rows (time frames) of each crop fed to the encoder


def _speaker_embeddings(mel_dir, file_list):
  """Embed one random crop from each of a speaker's first utterances.

  Loads up to `num_utterances` arrays from `mel_dir`, skips those with fewer
  than `len_crop` rows, and runs a random `len_crop`-row window of each
  remaining one through the speaker encoder `C`.

  Args:
    mel_dir: directory holding the speaker's .npy files.
    file_list: names of the files inside `mel_dir`.

  Returns:
    A list with one embedding (NumPy array) per utterance that was used.

  Raises:
    ValueError: if none of the inspected utterances has `len_crop` rows.
  """
  embs = []
  # Slicing (instead of indexing 0..num_utterances-1) tolerates speakers that
  # have fewer than `num_utterances` files.
  for file_name in file_list[:num_utterances]:
    mel = np.load(os.path.join(mel_dir, file_name))
    # Skip utterances that are too short to hold a full crop.
    if mel.shape[0] < len_crop:
      continue
    # randint's upper bound is exclusive; the +1 keeps the last valid start
    # position reachable and avoids an error when the utterance has exactly
    # `len_crop` rows.
    left = np.random.randint(0, mel.shape[0] - len_crop + 1)
    crop = mel[np.newaxis, left:left + len_crop, :]
    # Follow the notebook-wide `device` rather than assuming a GPU.
    mel_spec = torch.from_numpy(crop).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
      emb = C(mel_spec)
    embs.append(emb.detach().squeeze().cpu().numpy())
  if not embs:
    raise ValueError(
        'No utterance in {} has at least {} frames'.format(mel_dir, len_crop))
  return embs


# The speaker embedding is the mean over the per-utterance embeddings.
org_embs = _speaker_embeddings(origin_mel_dir, origin_file_list)
org_speaker_emb = np.mean(org_embs, axis=0)
count = len(org_embs)
print(count)

# The target list is filled from the target utterances' own embeddings
# (previously the source embedding was appended here by mistake).
trg_embs = _speaker_embeddings(target_mel_dir, target_file_list)
trg_speaker_emb = np.mean(trg_embs, axis=0)
count = len(trg_embs)
print(count)

10
10


In [11]:
# The embeddings are computed: drop the raw checkpoint dict and move the
# speaker encoder back to the CPU.
del c_checkpoint
C = C.to('cpu')

# Load the generator

In [11]:
# Build the generator in evaluation mode on the selected device.
# NOTE(review): the positional arguments are presumably
# (dim_neck, dim_emb, dim_pre, freq) -- confirm against model.py.
G = Generator(32,256,512,32).eval().to(device)

# map_location follows `device` instead of a hard-coded 'cuda:0', so the
# checkpoint also loads when the notebook falls back to the CPU.
g_checkpoint = torch.load('checkpoints/AutoVC_custom_16khz/autovc_100000.ckpt', map_location=device)
G.load_state_dict(g_checkpoint['model_state_dict'])
print("Finish loading the generator")

Finish loading the generator


In [14]:
# The weights have been loaded into G; drop the checkpoint dict so its
# memory can be reclaimed.
del g_checkpoint

# Load the WaveNet

In [12]:
# Build the WaveNet vocoder on the selected device.
wavenet = build_model().to(device)

# map_location follows `device` instead of a hard-coded 'cuda:0', so the
# checkpoint also loads when the notebook falls back to the CPU.
wavenet_checkpoint = torch.load('checkpoints/Wavenet/checkpoint_step001000000_ema.pth', map_location=device)

# Left as the cell's last expression so the key-matching report is displayed.
wavenet.load_state_dict(wavenet_checkpoint["state_dict"])

<All keys matched successfully>

In [37]:
# The weights have been loaded into wavenet; drop the checkpoint dict so its
# memory can be reclaimed.
del wavenet_checkpoint

# Zero-pad the sequence so that its length is a multiple of the base (32 by default)

In [13]:
def pad_seq(x, base=32):
    """Zero-pad a 2-D array along axis 0 up to the next multiple of `base`.

    Args:
        x: 2-D NumPy array; rows are appended at the end of axis 0.
        base: the returned array's first dimension is a multiple of this.

    Returns:
        A tuple `(padded, len_pad)`: the padded array and the number of
        zero rows that were appended (0 when no padding was needed).
    """
    n_rows = x.shape[0]
    n_blocks = ceil(float(n_rows) / base)
    len_pad = int(base * n_blocks) - n_rows
    assert len_pad >= 0
    # Pad only after the last row; the second axis is left untouched.
    padded = np.pad(x, ((0, len_pad), (0, 0)), 'constant')
    return padded, len_pad

# Prepare the original mel spectrogram

In [14]:
# Index of the source utterance within the sorted file list. To convert a
# random utterance instead, draw it with np.random.randint(0, num_utterances).
select = 0
selected_file = origin_file_list[select]
print("Selected {}".format(selected_file))

# Load the mel-spectrogram and zero-pad it; len_pad records how many rows
# were appended so they can be removed again after conversion.
x_org, len_pad = pad_seq(np.load(os.path.join(origin_mel_dir, selected_file)))

# Add a leading batch axis to the utterance and to both speaker embeddings,
# then move everything to the selected device.
batched_utterance = x_org[np.newaxis, :, :]
batched_org_emb = org_speaker_emb[np.newaxis, :]
batched_trg_emb = trg_speaker_emb[np.newaxis, :]

uttr_org = torch.from_numpy(batched_utterance).to(device)
org_speaker = torch.from_numpy(batched_org_emb).to(device)
trg_speaker = torch.from_numpy(batched_trg_emb).to(device)

Selected p225_001_mic1.npy


# Create the new mel spectrogram

In [15]:
# Inference only: no_grad() keeps autograd from recording the forward pass.
with torch.no_grad():
  # G returns three values; only the second one is used below, as the
  # converted mel-spectrogram.
  _, result_mel, _ = G(uttr_org, org_speaker, trg_speaker)

In [16]:
# Remove the trailing `len_pad` entries along axis 2 (the padding added
# before conversion) and bring the result back to NumPy on the CPU.
# `-len_pad or None` evaluates to None when len_pad is 0, so the slice then
# keeps everything instead of collapsing to `:0`.
trim_stop = -len_pad or None
uttr_trg = result_mel[0, 0, :trim_stop, :].cpu().numpy()

In [17]:
# Length of the converted mel-spectrogram along its first axis.
print(uttr_trg.shape[0])

129


# Use the wavenet to generate wav files from the mel spectrogram

In [18]:
# Where the converted utterance is stored, e.g. 'p225_to_p228.wav'.
result_dir = 'demo/Results/'
output_name = ''.join([origin, '_to_', target, '.wav'])

# Turn the converted mel-spectrogram into a waveform with the WaveNet
# vocoder. This is the slow step: the recorded run needed close to five
# minutes for 33024 samples.
waveform = wavegen(model=wavenet, c=uttr_trg)

100%|██████████| 33024/33024 [04:49<00:00, 114.09it/s]


In [20]:
# Create the output folder if it is missing, so that the result of the
# minutes-long synthesis is not lost to a failed write.
os.makedirs(result_dir, exist_ok=True)

# 16000 is the sample rate in Hz, matching the 16 kHz features used above.
sf.write(result_dir + output_name, waveform, 16000)