- 上から順に実行してください
- ファイルのアップロードは左メニューのフォルダのアイコンから行えます

## 1. 音声ファイルの分割

- 全ての音声ファイルに対して行ってください
- すでに音声ファイルが分割されている場合は不要です
- 複数の話者の音声が含まれている場合、こちらの実行後に話者ごとに音声ファイルを分けてください

In [None]:
# Connect to Google Drive (mounted at the relative path 'drive')
from google.colab import drive
drive.mount('drive')

In [None]:
# Install inaSpeechSegmenter and pydub, which the audio-splitting step imports
!pip install inaSpeechSegmenter pydub

In [None]:
# ---
# Settings (set these before running)

# Path of the input wav file.
# To use a file on Google Drive, specify it as "drive/My Drive/...".
# Alternatively, upload the file to the runtime and specify its file name.
input_file = 'test.wav'

# Output folder plus file-name prefix of the generated wav files.
# With the value below the outputs are named
# drive/My Drive/segment_output/test_000000000.wav, .../test_000000001.wav, ...
output_file = 'drive/My Drive/segment_output/test_'

# ---

# Reference: https://tam5917.hatenablog.com/entry/2020/01/25/132113

import os

from inaSpeechSegmenter import Segmenter
from pydub import AudioSegment

# Create the output folder up front: the export below only opens the target
# file and fails when the folder is missing.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

seg = Segmenter(vad_engine='smn', detect_gender=False)

# Each entry is read below as (label, start, end).
segmentation = seg(input_file)

# Decode the input file once instead of once per detected segment.
source_audio = AudioSegment.from_wav(input_file)

speech_segment_index = 0
for segment in segmentation:
    segment_label = segment[0]

    # Only segments labelled as speech are written out.
    if segment_label != 'speech':
        continue

    # Segment boundaries are scaled by 1000 before slicing the audio
    # (presumably seconds -> milliseconds; pydub slices by milliseconds).
    start_time = segment[1] * 1000
    end_time = segment[2] * 1000

    new_output_file = '{}{:09}.wav'.format(output_file, speech_segment_index)
    source_audio[start_time:end_time].export(new_output_file, format="wav")

    speech_segment_index += 1

# Release the decoded audio once all segments have been exported.
del source_audio

## 2. 音声の分析処理

In [None]:
# Connect to Google Drive (mounted at the relative path 'drive')
from google.colab import drive
drive.mount('drive')

In [None]:
# Install pysptk and pyworld, which the analysis step imports
!pip install pysptk pyworld

In [None]:
# If the inputs need extracting: replace ****.zip with the target file name, then uncomment and run
#!unzip ****.zip -d inputs_wav

In [None]:
# ---
# Settings (set these before running)

# Path of the directory that contains the input voice files.
# To use a directory on Google Drive, specify it as "drive/My Drive/...".
# Alternatively, upload the files to the runtime and specify that directory name.
# When uploading, zipping the folder and unzipping it on the runtime is recommended.
input_voices_dir = 'inputs_wav'

# Extension of the voice files
ext = '.wav'

# Output zip file
# The ".zip" suffix may be omitted
output_zip_file = 'drive/My Drive/targets'

# Intermediate directory
# Normally this does not need to be changed.
# If processing takes a long time, pointing this at a directory on Google Drive
# keeps the files produced so far, so an interrupted run can be resumed.
gen_dir_name = 'inputs'

# ---

import os
import glob
import shutil
import subprocess
import struct
import pyworld
import numpy as np
import pysptk
import librosa


def _write_float32(path, values):
    """Write *values* to *path* as raw little-endian float32 data.

    The data is written to a temporary sibling file and then renamed, so an
    interrupted run never leaves a truncated file under the final name (the
    resume check below trusts the existence of the final '.mc' file).
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as write_file:
        write_file.write(np.asarray(values, dtype='<f4').tobytes())
    os.replace(tmp_path, path)


input_voices = glob.glob(input_voices_dir + '/**/*' + ext, recursive=True)
if not input_voices:
    # Fail early instead of silently producing an empty archive.
    raise FileNotFoundError('音声ファイルが見つかりません: ' + input_voices_dir + '/**/*' + ext)

# makedirs also handles nested paths such as a folder on Google Drive.
os.makedirs(gen_dir_name, exist_ok=True)

# Sampling rate every input is resampled to, and the lowest F0 treated as voiced.
fs = 24000
f0_floor = 71.0

# The all-pass constant depends only on the sampling rate, so compute it once.
alpha = pysptk.util.mcepalpha(fs)

for input_voice_index, input_voice in enumerate(input_voices):
    print('\r音声ファイルの解析中...(' + str(input_voice_index) + '/' + str(len(input_voices)) + ')', end='')

    input_x, _ = librosa.load(input_voice, sr=fs)

    # Four variants are generated per file: with/without the first 60 samples
    # dropped, and with/without the waveform sign flipped.
    for cut in [0, 60]:
        for reverse in [False, True]:
            name = os.path.basename(input_voice).replace('.', '__') + '_{}_{}'.format(cut, reverse)
            mc_path = gen_dir_name + '/' + name + '.mc'
            # The '.mc' file is written last, so its presence means this
            # variant was already fully processed (resume support).
            if os.path.isfile(mc_path):
                continue

            x = np.array(input_x[cut:], dtype='float64')
            if reverse:
                x *= -1

            f0, t = pyworld.harvest(
                x,
                fs,
                frame_period=5.0,
                f0_floor=f0_floor,
                f0_ceil=800.0,
            )
            f0 = pyworld.stonemask(x, f0, t, fs)

            # Pitch period in samples (fs / f0) for frames at or above the
            # F0 floor; all other frames stay 0.
            pitch = np.zeros_like(f0)
            np.divide(fs, f0, out=pitch, where=f0 >= f0_floor)
            _write_float32(gen_dir_name + '/' + name + '.pitch', pitch)

            sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=1024)
            mc = pysptk.sp2mc(sp, order=32, alpha=alpha)

            # Frames are stored back to back (row-major), one row per frame.
            _write_float32(mc_path, mc)

print('\r音声ファイルの解析完了')

print('圧縮中...', end='')
# make_archive appends '.zip' itself, so strip a user-supplied suffix.
if output_zip_file.endswith('.zip'):
    output_zip_file = output_zip_file[:-4]
shutil.make_archive(output_zip_file, 'zip', root_dir=gen_dir_name)
print('\r圧縮完了')

# The intermediate files are no longer needed once archived.
shutil.rmtree(gen_dir_name)

## 3. 音源の学習処理

- **[こちら](nvc_train_v4_tpu.ipynb)を開いて、そのコードを実行してください**
- この処理は自動では終了しません。また、再度実行すると途中から再開することができます。「4.」で確認して適宜実行や中断を行ってください

## 4. テスト再生

In [None]:
# Connect to Google Drive (mounted at the relative path 'drive')
from google.colab import drive
drive.mount('drive')

In [None]:
# Install pysptk and pyworld, which the playback cell imports
!pip install pysptk pyworld

# Fetch the nvc_train_v4 repository; the playback cell reads test.wav and phoneme.h5 from it
!git clone --depth 1 "https://github.com/NON906/nvc_train_v4.git"

In [None]:
# ---
# 設定パラメータ（※実行前に設定してください）

# 変換元の音声ファイル
input_voices = ['nvc_train_v4/test.wav']

# 入力音源ファイル
model_path = 'drive/My Drive/nvc/gen_000000xx0.h5'

# ---

%tensorflow_version 2.x
import pyworld
import numpy as np
import pysptk
import librosa
import IPython.display
from tensorflow.keras.layers import Dense, LSTM, Lambda, Input, Concatenate
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

phoneme_model_path = 'nvc_train_v4/phoneme.h5'

input_layer = Input(shape=(None, 34), name='phoneme_input')

f_layers = LSTM(128, return_sequences=True, name='phoneme_lstm0')(input_layer)
f_layers = LSTM(128, return_sequences=True, name='phoneme_lstm1')(f_layers)
loop_layers = []
for loop in range(64):
    f_layers_loop = Dense(2, name='phoneme_dense_l' + str(loop))(f_layers)
    f_layers_loop = Lambda(lambda x: K.l2_normalize(x, axis=-1), name='phoneme_norm_l' + str(loop))(f_layers_loop)
    loop_layers.append(f_layers_loop)
phoneme_layers = Concatenate(name='phoneme_concat')(loop_layers)

gen_layers_pitch = LSTM(64, return_sequences=True, name='gen_pitch_lstm0')(phoneme_layers)
gen_layers_pitch = LSTM(64, return_sequences=True, name='gen_pitch_lstm1')(gen_layers_pitch)
gen_layers_pitch = Dense(1, name='gen_pitch_dense')(gen_layers_pitch)

concat_layers = Concatenate(name='gen_concat_0')([phoneme_layers, gen_layers_pitch])

gen_layers_power = LSTM(64, return_sequences=True, name='gen_power_lstm0')(phoneme_layers)
gen_layers_power = LSTM(64, return_sequences=True, name='gen_power_lstm1')(gen_layers_power)
gen_layers_power = Dense(1, name='gen_power_dense')(gen_layers_power)

gen_layers = LSTM(128, return_sequences=True, name='gen_lstm0')(concat_layers)
gen_layers = LSTM(128, return_sequences=True, name='gen_lstm1')(gen_layers)
gen_layers = Dense(32, name='gen_dense')(gen_layers)

gen_layers = Concatenate(name='gen_concat_1')([gen_layers_power, gen_layers, gen_layers_pitch])

model = Model(inputs=input_layer, outputs=gen_layers, name='model')
model.load_weights(phoneme_model_path, by_name=True)
model.load_weights(model_path, by_name=True)

for input_voice in input_voices:
    fs = 24000
    x, _ = librosa.load(input_voice, sr=fs)
    x = np.array(x, dtype='float64')

    f0, t = pyworld.harvest(
        x,
        fs,
        frame_period=5.0,
        f0_floor=71.0,
        f0_ceil=800.0,
    )
    f0 = pyworld.stonemask(x, f0, t, fs)

    pitch = [0.0 for _ in range(f0.shape[0])]
    for loop in range(f0.shape[0]):
        if f0[loop] >= 71.0:
            pitch[loop] = fs / f0[loop]
        else:
            pitch[loop] = 0.0
        pitch[loop] = pitch[loop] / (fs / 71.0) * 4.0
    pitch = np.array(pitch, dtype='float64')

    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=1024)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(sp, order=32, alpha=alpha)

    ap = pyworld.d4c(x, f0, t, fs, fft_size=1024)

    input = np.concatenate([mc, pitch[..., np.newaxis]], axis=-1)[np.newaxis, ...]

    result = model.predict(input)

    synth_f0 = [0.0 for _ in range(f0.shape[0])]
    for loop in range(f0.shape[0]):
        if result[0, loop, -1] > 0.0:
            pitch_result = result[0, loop, -1] * (fs / 71.0) / 4.0
            synth_f0[loop] = fs / pitch_result
            if synth_f0[loop] > 800.0 or synth_f0[loop] < 71.0:
                synth_f0[loop] = 0.0
        else:
            synth_f0[loop] = 0.0
    synth_f0 = np.array(synth_f0, dtype='float64')

    synth_sp = pysptk.mc2sp(result[0, :, :-1], alpha, fftlen=1024)
    synth_sp = np.array(synth_sp, dtype='float64')

    synthesized = pyworld.synthesize(synth_f0, synth_sp, ap, fs)
    
    print(input_voice + ':')
    display(IPython.display.Audio(data=synthesized, rate=fs))

K.clear_session()

## 5. nvc4ファイルの作成

In [None]:
# Connect to Google Drive (mounted at the relative path 'drive')
from google.colab import drive
drive.mount('drive')

In [None]:
# ---
# Settings (set these before running)

# Zip file of analysed data to use as input
targets_zip_file = 'drive/My Drive/targets.zip'

# ---

# Extract the archive into the 'targets' directory, discarding unzip's standard output
!unzip "{targets_zip_file}" -d targets > /dev/null

In [None]:
# ---
# 設定パラメータ（※実行前に設定してください）

# 入力音源ファイル
input_file = 'drive/My Drive/nvc/gen_000000xx0.h5'

# 作成する音源の名前
nvc_name = 'サンプル'

# 出力ファイル
# 「.nvz4」は不要
output_nvc_file = 'drive/My Drive/nvc/target'

# ---


%tensorflow_version 2.x
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Dense, LSTM, Input, Concatenate, Lambda, Activation, Reshape
from tensorflow.keras.regularizers import l1
from tensorflow.keras import backend as K
import tensorflow as tf
from statistics import mean, variance
import json
import shutil
import uuid
import os
import glob
import struct


# ピッチの計算
pitch_list = []
input_voices = glob.glob('targets/**/*_0_1.00_1.00_False.pitch', recursive=True)
for input_voice in input_voices:
    read_data = open(input_voice, 'rb').read()
    read_array = struct.unpack('<' + str(len(read_data) // 4) + 'f', read_data)
    for val in read_array:
        if val > 0.0:
            pitch_list.append(val)
pitch_mean = mean(pitch_list)
pitch_variance = variance(pitch_list)


# jsonファイルの作成
dict_for_json = {
    'version' : 1,
    'name' : nvc_name,
    'uuid' : str(uuid.uuid4()),
    'pitch_mean' : pitch_mean,
    'pitch_variance' : pitch_variance,
}
os.makedirs('tmp', exist_ok=True)
_ = open('tmp/nvc.json', 'w').write(json.dumps(dict_for_json))


# tfliteモデルの作成
frame_length = 16

# gen
output_file = 'tmp/nvc_gen.tflite'

input_layer = Input(batch_shape=(1, frame_length, 129), name='gen_input')

rec_input = [Input(batch_shape=(1, 128), name='gen_rec_input_0'),
             Input(batch_shape=(1, 128), name='gen_rec_input_1'),
             Input(batch_shape=(1, 128), name='gen_rec_input_2'),
             Input(batch_shape=(1, 128), name='gen_rec_input_3')]

rec_output = [None for _ in range(4)]

gen_layers, rec_output[0], rec_output[1] = LSTM(128, return_sequences=True, return_state=True, unroll=True, name='gen_lstm0')(input_layer, initial_state=[rec_input[0], rec_input[1]])
gen_layers, rec_output[2], rec_output[3] = LSTM(128, return_sequences=True, return_state=True, unroll=True, name='gen_lstm1')(gen_layers, initial_state=[rec_input[2], rec_input[3]])
gen_layers = Dense(32, name='gen_dense')(gen_layers)

inputs_list = [input_layer] + rec_input
outputs_list = [gen_layers] + rec_output
model = Model(inputs=inputs_list, outputs=outputs_list, name='gen_model')

model.load_weights(input_file, by_name=True)

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
_ = open(output_file, 'wb').write(tflite_model)


# pitch
output_file = 'tmp/nvc_gen_pitch.tflite'

input_layer = Input(batch_shape=(1, frame_length, 128), name='gen_input')

rec_input = [Input(batch_shape=(1, 64), name='gen_rec_input_0'),
             Input(batch_shape=(1, 64), name='gen_rec_input_1'),
             Input(batch_shape=(1, 64), name='gen_rec_input_2'),
             Input(batch_shape=(1, 64), name='gen_rec_input_3')]

rec_output = [None for _ in range(4)]

gen_layers, rec_output[0], rec_output[1] = LSTM(64, return_sequences=True, return_state=True, unroll=True, name='gen_pitch_lstm0')(input_layer, initial_state=[rec_input[0], rec_input[1]])
gen_layers, rec_output[2], rec_output[3] = LSTM(64, return_sequences=True, return_state=True, unroll=True, name='gen_pitch_lstm1')(gen_layers, initial_state=[rec_input[2], rec_input[3]])
gen_layers = Dense(1, name='gen_pitch_dense')(gen_layers)

inputs_list = [input_layer] + rec_input
outputs_list = [gen_layers] + rec_output
model = Model(inputs=inputs_list, outputs=outputs_list, name='gen_pitch_model')

model.load_weights(input_file, by_name=True)

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
_ = open(output_file, 'wb').write(tflite_model)


# power
output_file = 'tmp/nvc_gen_power.tflite'

input_layer = Input(batch_shape=(1, frame_length, 128), name='gen_input')

rec_input = [Input(batch_shape=(1, 64), name='gen_rec_input_0'),
             Input(batch_shape=(1, 64), name='gen_rec_input_1'),
             Input(batch_shape=(1, 64), name='gen_rec_input_2'),
             Input(batch_shape=(1, 64), name='gen_rec_input_3')]

rec_output = [None for _ in range(4)]

gen_layers, rec_output[0], rec_output[1] = LSTM(64, return_sequences=True, return_state=True, unroll=True, name='gen_power_lstm0')(input_layer, initial_state=[rec_input[0], rec_input[1]])
gen_layers, rec_output[2], rec_output[3] = LSTM(64, return_sequences=True, return_state=True, unroll=True, name='gen_power_lstm1')(gen_layers, initial_state=[rec_input[2], rec_input[3]])
gen_layers = Dense(1, name='gen_power_dense')(gen_layers)

inputs_list = [input_layer] + rec_input
outputs_list = [gen_layers] + rec_output
model = Model(inputs=inputs_list, outputs=outputs_list, name='gen_power_model')

model.load_weights(input_file, by_name=True)

converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
_ = open(output_file, 'wb').write(tflite_model)


# ファイルをまとめる
if output_nvc_file[-5:] == '.nvz4':
    output_nvc_file = output_nvc_file[:-5]
shutil.make_archive(output_nvc_file, 'zip', root_dir='tmp')
os.rename(output_nvc_file + '.zip', output_nvc_file + '.nvz4')