# StarGANv2-VC Demo (ESD_PVC, 10 Speakers)

### Utils

In [1]:
# Move up one directory so the relative paths used below (Utils/, Models/, Vocoder/, Demo/) resolve.
# NOTE: not idempotent — re-running this cell without restarting the kernel moves up another level.
%cd ..

/workspace/StarGANv2-VC


In [2]:
pip install scipy==1.9.3

Note: you may need to restart the kernel to use updated packages.


In [3]:
# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import soundfile as sf

from Utils.ASR.models import ASRCNN
from Utils.JDC.model import JDCNet
from models import Generator, MappingNetwork, StyleEncoder

%matplotlib inline

In [4]:
# Speaker IDs used by this demo. A speaker's position in this list is the
# integer domain label passed to the style encoder / mapping network.
# NOTE(review): the two links below look inherited from the VCTK version of the
# demo and may not describe these IDs — confirm or replace.
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion

speakers = list(range(11, 21))

# Mel-spectrogram front end shared by every preprocess() call.
# NOTE(review): sample_rate is left at torchaudio's default while audio is
# loaded at 24 kHz elsewhere in this notebook — confirm this matches training.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_fft=2048, win_length=1200, hop_length=300, n_mels=80)

# Constants used by preprocess() to normalise log-mel values.
mean = -4
std = 4

def preprocess(wave):
    """Turn a NumPy waveform into a normalised log-mel tensor.

    Args:
        wave: NumPy array holding the waveform samples.

    Returns:
        The log-mel spectrogram from ``to_mel`` with an extra leading axis,
        shifted by ``mean`` and scaled by ``std``.
    """
    mel = to_mel(torch.from_numpy(wave).float())
    # Small offset keeps the logarithm finite where the mel energy is zero.
    log_mel = torch.log(mel.unsqueeze(0) + 1e-5)
    return (log_mel - mean) / std

def build_model(model_params={}):
    """Instantiate the three StarGANv2-VC networks used for inference.

    Args:
        model_params: Mapping with the hyper-parameters read below
            (``dim_in``, ``style_dim``, ``max_conv_dim``, ``w_hpf``,
            ``F0_channel``, ``latent_dim``, ``num_domains``).

    Returns:
        A ``Munch`` with ``generator``, ``mapping_network`` and
        ``style_encoder`` entries. Weights are freshly initialised here.
    """
    cfg = Munch(model_params)
    return Munch(
        generator=Generator(
            cfg.dim_in, cfg.style_dim, cfg.max_conv_dim,
            w_hpf=cfg.w_hpf, F0_channel=cfg.F0_channel),
        mapping_network=MappingNetwork(
            cfg.latent_dim, cfg.style_dim, cfg.num_domains,
            hidden_dim=cfg.max_conv_dim),
        style_encoder=StyleEncoder(
            cfg.dim_in, cfg.style_dim, cfg.num_domains, cfg.max_conv_dim),
    )

import numpy as np
import librosa, soundfile as sf

def load_wav_safe(path, sr=24000):
    """Read an audio file as a mono float32 waveform at sample rate ``sr``.

    The file is first read with ``soundfile``; if any step of that path
    raises, the function falls back to ``librosa.load``.

    Args:
        path: Location of the audio file.
        sr: Target sample rate in Hz.

    Returns:
        Tuple ``(waveform, sr)``.
    """
    try:
        samples, native_sr = sf.read(path, dtype='float32', always_2d=False)
        # Arrays with more than one dimension are averaged over axis 1 to get mono.
        mono = samples.mean(axis=1) if samples.ndim > 1 else samples
        resampled = (
            librosa.resample(mono, orig_sr=native_sr, target_sr=sr)
            if native_sr != sr
            else mono
        )
    except Exception:
        # Best-effort fallback: let librosa decode, downmix and resample.
        fallback, _ = librosa.load(path, sr=sr, mono=True)
        return fallback.astype(np.float32), sr
    return resampled, sr


def compute_style(speaker_dicts):
    """Compute one style embedding per entry of ``speaker_dicts``.

    Args:
        speaker_dicts: Mapping ``key -> (path, speaker)`` where ``speaker`` is
            the integer domain label. An empty ``path`` means "no reference
            audio": the style is then produced by the mapping network from a
            random latent code. Otherwise the audio at ``path`` is fed to the
            style encoder.

    Returns:
        Dict ``key -> (ref, label)`` with the style embedding and the label
        tensor that was used to compute it.
    """
    reference_embeddings = {}
    # Inference only: run both branches without autograd so the stored style
    # tensors do not keep a computation graph alive (previously only the
    # style-encoder branch was guarded).
    with torch.no_grad():
        for key, (path, speaker) in speaker_dicts.items():
            if path == "":
                label = torch.LongTensor([speaker]).to('cuda')
                latent_dim = starganv2.mapping_network.shared[0].in_features
                latent = torch.randn(1, latent_dim).to('cuda')
                ref = starganv2.mapping_network(latent, label)
            else:
                # Use the safe loader (soundfile first, librosa as fallback).
                wave, _ = load_wav_safe(path, sr=24000)
                # The full, untrimmed waveform feeds the encoder, as before; the
                # former librosa.effects.trim call was dropped because its
                # result was never used.
                mel_tensor = preprocess(wave).to('cuda')
                label = torch.LongTensor([speaker])
                ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)
            reference_embeddings[key] = (ref, label)
    return reference_embeddings

### Load models

In [5]:
# load F0 model

F0_model = JDCNet(num_class=1, seq_len=192)
# Load onto the CPU first (same as the StarGANv2 checkpoint below) so loading
# does not depend on the device the checkpoint was saved from; the model is
# moved to the GPU afterwards.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
params = torch.load("Utils/JDC/bst.t7", map_location='cpu')['net']
F0_model.load_state_dict(params)
F0_model = F0_model.to('cuda').eval()

In [6]:
# load vocoder
from parallel_wavegan.utils import load_model

vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to('cuda')
vocoder.remove_weight_norm()
# Switch to inference mode; assigning to `_` keeps the module repr out of the cell output.
_ = vocoder.eval()



In [7]:
# load starganv2

model_path = 'Models/ESD_PVC/epoch_00150.pth'

with open('Models/ESD_PVC/config_PVC.yml') as f:
    starganv2_config = yaml.safe_load(f)

starganv2 = build_model(model_params=starganv2_config["model_params"])

# The checkpoint is read onto the CPU; only its 'model_ema' entry is used.
params = torch.load(model_path, map_location='cpu')['model_ema']

# For each network: restore weights, switch to inference mode, move to the GPU.
for name in list(starganv2):
    starganv2[name].load_state_dict(params[name])
    starganv2[name].eval()
    starganv2[name] = starganv2[name].to('cuda')

### Conversion

In [8]:
# load input wave
selected_speakers = [16]
k = random.choice(selected_speakers)
wav_path = 'Demo/ESD_PVC-corpus/' + str(k) + '/00' + str(k) + '_000023.wav'
audio, source_sr = librosa.load(wav_path, sr=24000)

# Peak-normalise; skip the division for an all-zero signal, which would
# otherwise turn every sample into NaN (0 / 0).
peak = np.max(np.abs(audio))
if peak > 0:
    audio = audio / peak

# Convert with astype: assigning to `.dtype` does not convert values, it
# reinterprets the underlying bytes (and would corrupt non-float32 data).
audio = audio.astype(np.float32)

#### Convert by style encoder

In [10]:
# with reference, using style encoder

# Reference utterances per emotion, in the order they are indexed below.
reference_utterances = (
    ('Neutral', ('000002', '000008', '000013')),
    ('Angry', ('000351', '000352', '000363')),
    ('Happy', ('000706', '000713', '000717')),
    ('Sad', ('001053', '001061', '001065')),
    ('Surprise', ('001403', '001408', '001414', '001417')),
)

speaker_dicts = {}
for s in selected_speakers:
    k = s
    reference_paths = [
        f'Demo/ESD_PVC-corpus/{k}/{emotion}/00{k}_{utterance}.wav'
        for emotion, utterances in reference_utterances
        for utterance in utterances
    ]
    # Key is "<speaker>_<running index>"; the label is the speaker's position in `speakers`.
    for idx, path in enumerate(reference_paths):
        speaker_dicts[f"{s}_{idx}"] = (path, speakers.index(s))

reference_embeddings = compute_style(speaker_dicts)

In [11]:
# conversion
import os
import time
import IPython.display as ipd

SAMPLE_RATE = 24000
OUTPUT_DIR = "results/ESD_PVC_24k"
# sf.write does not create missing directories; make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

start = time.time()

source = preprocess(audio).to('cuda:0')
keys = []
converted_samples = {}
reconstructed_samples = {}
converted_mels = {}

# The F0 features depend only on the source utterance, so compute them once
# instead of once per reference style.
with torch.no_grad():
    f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))

for key, (ref, _) in reference_embeddings.items():
    with torch.no_grad():
        # Convert the source mel with the reference style, then vocode it.
        out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)
        c = out.transpose(-1, -2).squeeze()
        y_out = vocoder.inference(c)
        y_out = y_out.view(-1).cpu()

        # When a reference file exists, also pass it through mel -> vocoder
        # so it can be compared with the converted audio.
        ref_path = speaker_dicts[key][0] if key in speaker_dicts else ""
        if ref_path == "":
            recon = None
        else:
            wave, sr = librosa.load(ref_path, sr=SAMPLE_RATE)
            mel = preprocess(wave)
            c = mel.transpose(-1, -2).squeeze().to('cuda')
            recon = vocoder.inference(c)
            recon = recon.view(-1).cpu().numpy()

    converted_samples[key] = y_out.numpy()
    reconstructed_samples[key] = recon
    converted_mels[key] = out
    keys.append(key)

end = time.time()
print('total processing time: %.3f sec' % (end - start) )

for key, wave in converted_samples.items():
    print('Converted: %s' % key)
    display(ipd.Audio(wave, rate=SAMPLE_RATE))
    out_path = f"{OUTPUT_DIR}/{key}_converted.wav"
    sf.write(out_path, wave, SAMPLE_RATE)
    print(f"Saved: {out_path}")
    print('Reference (vocoder): %s' % key)
    if reconstructed_samples[key] is not None:
        display(ipd.Audio(reconstructed_samples[key], rate=SAMPLE_RATE))

# Source utterance passed through mel -> vocoder, followed by the raw file.
print('Original (vocoder):')
wave, sr = librosa.load(wav_path, sr=SAMPLE_RATE)
mel = preprocess(wave)
c = mel.transpose(-1, -2).squeeze().to('cuda')
with torch.no_grad():
    recon = vocoder.inference(c)
    recon = recon.view(-1).cpu().numpy()
display(ipd.Audio(recon, rate=SAMPLE_RATE))
print('Original:')
display(ipd.Audio(wav_path, rate=SAMPLE_RATE))

total processing time: 1.654 sec
Converted: 16_0


Saved: results/ESD_PVC_24k/16_0_converted.wav
Reference (vocoder): 16_0


Converted: 16_1


Saved: results/ESD_PVC_24k/16_1_converted.wav
Reference (vocoder): 16_1


Converted: 16_2


Saved: results/ESD_PVC_24k/16_2_converted.wav
Reference (vocoder): 16_2


Converted: 16_3


Saved: results/ESD_PVC_24k/16_3_converted.wav
Reference (vocoder): 16_3


Converted: 16_4


Saved: results/ESD_PVC_24k/16_4_converted.wav
Reference (vocoder): 16_4


Converted: 16_5


Saved: results/ESD_PVC_24k/16_5_converted.wav
Reference (vocoder): 16_5


Converted: 16_6


Saved: results/ESD_PVC_24k/16_6_converted.wav
Reference (vocoder): 16_6


Converted: 16_7


Saved: results/ESD_PVC_24k/16_7_converted.wav
Reference (vocoder): 16_7


Converted: 16_8


Saved: results/ESD_PVC_24k/16_8_converted.wav
Reference (vocoder): 16_8


Converted: 16_9


Saved: results/ESD_PVC_24k/16_9_converted.wav
Reference (vocoder): 16_9


Converted: 16_10


Saved: results/ESD_PVC_24k/16_10_converted.wav
Reference (vocoder): 16_10


Converted: 16_11


Saved: results/ESD_PVC_24k/16_11_converted.wav
Reference (vocoder): 16_11


Converted: 16_12


Saved: results/ESD_PVC_24k/16_12_converted.wav
Reference (vocoder): 16_12


Converted: 16_13


Saved: results/ESD_PVC_24k/16_13_converted.wav
Reference (vocoder): 16_13


Converted: 16_14


Saved: results/ESD_PVC_24k/16_14_converted.wav
Reference (vocoder): 16_14


Converted: 16_15


Saved: results/ESD_PVC_24k/16_15_converted.wav
Reference (vocoder): 16_15


Original (vocoder):


Original:


#### Convert by mapping network

In [12]:
# no reference, using mapping network

# compute_style draws a random latent code (torch.randn) for every entry with an
# empty path; seed the generator so Restart & Run All reproduces the same styles.
STYLE_SEED = 0
torch.manual_seed(STYLE_SEED)

selected_speakers = [11, 13, 14, 15, 16, 18, 19, 20]
# Empty path -> the style comes from the mapping network; the label is the
# speaker's position in `speakers`.
speaker_dicts = {str(s): ('', speakers.index(s)) for s in selected_speakers}

reference_embeddings = compute_style(speaker_dicts)

In [None]:
# conversion
import os
import time
import IPython.display as ipd

SAMPLE_RATE = 24000
# Same results folder as the style-encoder conversion (the previous
# "ESD__PVC_24k" had a doubled underscore and pointed at a different directory).
OUTPUT_DIR = "results/ESD_PVC_24k"
# sf.write does not create missing directories; make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

start = time.time()

source = preprocess(audio).to('cuda:0')
keys = []
converted_samples = {}
reconstructed_samples = {}
converted_mels = {}

# The F0 features depend only on the source utterance, so compute them once
# instead of once per target style.
with torch.no_grad():
    f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))

for key, (ref, _) in reference_embeddings.items():
    with torch.no_grad():
        # Convert the source mel with the sampled style, then vocode it.
        out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)
        c = out.transpose(-1, -2).squeeze()
        y_out = vocoder.inference(c)
        y_out = y_out.view(-1).cpu()

        # Entries with an empty path have no reference audio to reconstruct.
        ref_path = speaker_dicts[key][0] if key in speaker_dicts else ""
        if ref_path == "":
            recon = None
        else:
            wave, sr = librosa.load(ref_path, sr=SAMPLE_RATE)
            mel = preprocess(wave)
            c = mel.transpose(-1, -2).squeeze().to('cuda')
            recon = vocoder.inference(c)
            recon = recon.view(-1).cpu().numpy()

    converted_samples[key] = y_out.numpy()
    reconstructed_samples[key] = recon
    converted_mels[key] = out
    keys.append(key)

end = time.time()
print('total processing time: %.3f sec' % (end - start) )

for key, wave in converted_samples.items():
    print('Converted: %s' % key)
    display(ipd.Audio(wave, rate=SAMPLE_RATE))
    out_path = f"{OUTPUT_DIR}/{key}_converted_MN.wav"
    sf.write(out_path, wave, SAMPLE_RATE)
    print(f"Saved: {out_path}")
    print('Reference (vocoder): %s' % key)
    if reconstructed_samples[key] is not None:
        display(ipd.Audio(reconstructed_samples[key], rate=SAMPLE_RATE))

# Source utterance passed through mel -> vocoder, followed by the raw file.
print('Original (vocoder):')
wave, sr = librosa.load(wav_path, sr=SAMPLE_RATE)
mel = preprocess(wave)
c = mel.transpose(-1, -2).squeeze().to('cuda')
with torch.no_grad():
    recon = vocoder.inference(c)
    recon = recon.view(-1).cpu().numpy()
display(ipd.Audio(recon, rate=SAMPLE_RATE))
print('Original:')
display(ipd.Audio(wav_path, rate=SAMPLE_RATE))