# Environment Configuration

In [None]:
# Fetch the project sources and make the cloned repository the working directory.
# NOTE(review): re-running this cell while already inside VITS_Japanese clones a nested copy.
!git clone https://github.com/1595258509/VITS_Japanese.git
%cd VITS_Japanese
# Install the Python requirements into the kernel's environment.
%pip install -r requirements.txt
# Install the espeak system package, answering yes to apt prompts.
!sudo apt-get install espeak -y

In [None]:
# Build the monotonic_align extension in place, then return to the parent directory.
%cd monotonic_align
!python setup.py build_ext --inplace
%cd ..

# Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells reference /content/drive/MyDrive/yosuga_base.
drive.mount('/content/drive')

# Download and unpack dataset

In [None]:
api_token = {"username":"murlors","key":"71cb88bec124c1ef87c68f148fb7bb34"}
import json
import zipfile
import os
 
if not os.path.exists("/root/.kaggle"):
    os.makedirs("/root/.kaggle")
with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)
!chmod 600 /root/.kaggle/kaggle.json

%cd /content/VITS_Japanese

!kaggle datasets download -d murlors/yosugaharuka --force

In [None]:
# Extract the downloaded archive into the current directory, then delete the zip file.
!unzip yosugaharuka.zip
!rm -rf yosugaharuka.zip

# Train

In [None]:
import torch
# Show which PyTorch version the runtime provides.
print(torch.__version__)

In [None]:
# Show GPU status via nvidia-smi.
# NOTE(review): the absolute /opt/bin path is environment-specific — confirm it exists on the target runtime.
!/opt/bin/nvidia-smi

In [None]:
# Load the TensorBoard notebook extension and point it at the run directory on Google Drive.
%load_ext tensorboard
%tensorboard --logdir=/content/drive/MyDrive/yosuga_base

In [None]:
# Launch training with the yosuga_base config file (-c); -m presumably names the run/model directory.
# NOTE(review): where train_ms.py writes checkpoints and logs is not visible here — confirm it matches
# the /content/drive/MyDrive/yosuga_base paths used by the TensorBoard and checkpoint-loading cells.
!python train_ms.py -c configs/yosuga_base.json -m yosuga_base

# Text to Speech

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write

def get_text(text, hps):
    """Turn raw text into a LongTensor of symbol ids.

    The text is converted with ``text_to_sequence`` using the cleaners listed
    in ``hps.data.text_cleaners``. When ``hps.data.add_blank`` is truthy, the
    id sequence is passed through ``commons.intersperse`` with 0 as the
    inserted item before being wrapped in a tensor.
    """
    token_ids = text_to_sequence(text, hps.data.text_cleaners)
    token_ids = commons.intersperse(token_ids, 0) if hps.data.add_blank else token_ids
    return torch.LongTensor(token_ids)

In [None]:
# Load the hyperparameters of the yosuga_base configuration.
hps_ms = utils.get_hparams_from_file("./configs/yosuga_base.json")
# Build the synthesizer from config-derived sizes and move it to the GPU.
net_g_ms = SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model).cuda()
# Switch to evaluation mode; binding the result to _ keeps it out of the cell output.
_ = net_g_ms.eval()

# Load the checkpoint from Google Drive into the model.
# NOTE(review): the third argument is None — presumably the optimizer slot; confirm against utils.load_checkpoint.
_ = utils.load_checkpoint("/content/drive/MyDrive/yosuga_base/G_157000.pth", net_g_ms, None)

## Japanese

In [None]:
# Colab form inputs: speaker index (given as a string, converted to int below), text, and length scale.
speaker_id = "0" #@param [0, 1, 2, 3, 4, 5, 6]
speaker_id = int(speaker_id)
text = 'だめだね、だめよ、だめなのよ' #@param {type: 'string'}
length_scale = 1.0 #@param {type:"slider", min:0.1, max:3, step:0.05}
# Speaker id and symbol-id sequence for the model.
sid = torch.LongTensor([speaker_id]).cuda()
stn_tst = get_text(text, hps_ms)

# Run inference without gradient tracking and bring the result back to the CPU as a NumPy array.
with torch.no_grad():
    # Add a leading dimension of size 1 and record the sequence length.
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=length_scale)[0][0,0].data.cpu().float().numpy()
# Render an inline audio player at the configured sampling rate.
ipd.display(ipd.Audio(audio, rate=hps_ms.data.sampling_rate))

## Chinese (approximated via pinyin-to-kana conversion)

In [None]:
import json
import pypinyin

# pinyin-to-katakana map, read from a UTF-8 JSON file; pinyin2kana looks up each pinyin token in it
map_path = 'text/py2kn.json'
with open(map_path, 'r', encoding='utf-8') as f:
  py2kn_map = json.load(f)

def pinyin2kana(text_ch):
  '''Convert Chinese text to kana by way of pinyin.

  The text is split into tone-less pinyin tokens (``pypinyin.NORMAL``), each
  token is looked up in the module-level ``py2kn_map``, and the results are
  concatenated.

  Args:
    text_ch: Chinese input text.

  Returns:
    The concatenated kana string. Tokens without an entry in ``py2kn_map``
    (for example punctuation or other non-Chinese text that pypinyin passes
    through) are kept unchanged instead of raising ``KeyError``.
  '''
  # Chinese characters to pinyin: pypinyin returns one candidate list per
  # token, and only the first candidate is used.
  py_raw = pypinyin.pinyin(text_ch, style=pypinyin.NORMAL)
  # Remove zero-width spaces without mutating the lists pypinyin returned.
  pys = [py[0].replace('\u200b', '') for py in py_raw]

  # pinyin to kana: the inner join accepts a map entry that is either a
  # string or a sequence of strings; unmapped tokens fall back to themselves.
  return ''.join(''.join(py2kn_map.get(py, py)) for py in pys)


# Colab form input: speaker index (given as a string, converted to int below).
speaker_id_ch = "0" #@param [0, 1, 2, 3, 4, 5, 6]
speaker_id_ch = int(speaker_id_ch)

# Colab form inputs: Chinese text to speak and the length scale passed to inference.
text_ch = '你好，我是春日野穹。' #@param {type: 'string'}
length_scale_ch = 1.0 #@param {type:"slider", min:0.1, max:3, step:0.05}

# Convert the Chinese text with pinyin2kana, then build the speaker id and symbol-id sequence.
text_jp = pinyin2kana(text_ch)
sid_ch = torch.LongTensor([speaker_id_ch]).cuda()
stn_tst_ch = get_text(text_jp, hps_ms)

# Run inference without gradient tracking and bring the result back to the CPU as a NumPy array.
with torch.no_grad():
    x_tst_ch = stn_tst_ch.unsqueeze(0).cuda()
    x_tst_ch_lengths = torch.LongTensor([stn_tst_ch.size(0)]).cuda()
    audio_ch = net_g_ms.infer(x_tst_ch, x_tst_ch_lengths, sid=sid_ch, noise_scale=.667, noise_scale_w=0.8, length_scale=length_scale_ch)[0][0,0].data.cpu().float().numpy()
# Render an inline audio player at the configured sampling rate.
ipd.display(ipd.Audio(audio_ch, rate=hps_ms.data.sampling_rate))

## Voice conversion

In [None]:
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch

# Source speaker index and the recording to convert.
speaker_id = 0
audio, sampling_rate = load_wav_to_torch("./yosuga_wav/Kasugano_Sora/SR000023.wav")
# NOTE(review): sampling_rate is never compared with hps_ms.data.sampling_rate — confirm the file matches the model's rate.

# Scale by the configured maximum wav value, add a leading dimension of size 1, and move to the GPU.
y = audio / hps_ms.data.max_wav_value
y = y.unsqueeze(0).cuda()

# Spectrogram of the source audio using the filter/hop/window lengths from the config.
spec = spectrogram_torch(y, hps_ms.data.filter_length,
    hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
    center=False)
# Size of the spectrogram's last axis, and the source speaker id as a tensor.
spec_lengths = torch.LongTensor([spec.size(-1)]).cuda()
sid_src = torch.LongTensor([speaker_id]).cuda()

In [None]:
speaker_names = ['春日野穹','天女目瑛','依媛奈緒','渚一葉','乃木坂初佳','倉永梢','伊福部やひろ']
# Convert the source spectrogram to every speaker index (one index per name above).
with torch.no_grad():
    sid_tgt = [torch.LongTensor([idx]).cuda() for idx in range(len(speaker_names))]
    audio = [
        net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=tgt)[0][0,0].data.cpu().float().numpy()
        for tgt in sid_tgt
    ]
# Play the original recording first, then each converted voice except the source speaker's own.
print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps_ms.data.sampling_rate))
for i, speaker_name in enumerate(speaker_names):
  if i != speaker_id:
    print("Converted SID: %d %s" % (sid_tgt[i].item(), speaker_name))
    ipd.display(ipd.Audio(audio[i], rate=hps_ms.data.sampling_rate))

## jtts (Japanese text-to-speech helper)

In [None]:
def jtts(text):
  """Synthesize `text` with the loaded model and display an inline audio player.

  The speaker index comes from the Colab form field below. Inference uses
  noise_scale=.667, noise_scale_w=0.8 and length_scale=1, and the player is
  rendered at ``hps_ms.data.sampling_rate``. Nothing is returned.
  """
  speaker_id = "0" #@param [0, 1, 2, 3, 4, 5, 6]
  speaker_index = int(speaker_id)
  token_ids = get_text(text, hps_ms)
  with torch.no_grad():
    batch = token_ids.unsqueeze(0).cuda()
    batch_lengths = torch.LongTensor([token_ids.size(0)]).cuda()
    speaker = torch.LongTensor([speaker_index]).cuda()
    outputs = net_g_ms.infer(batch, batch_lengths, sid=speaker, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    waveform = outputs[0][0,0].data.cpu().float().numpy()
  player = ipd.Audio(waveform, rate=hps_ms.data.sampling_rate)
  ipd.display(player)

In [None]:
# Synthesize and play each sample sentence in order.
for sentence in (
    "人類の偉大さは数字にこそあると言っていい。",
    "足し、引き、掛け、割り。",
    "そんな基本から、何かの大きさ、距離の長さ、",
    "全てを測定する。",
    "ありとあらゆる存在は数字なしではただの虚無なのだ。",
    "生は夏の花の如き、死は秋の叶の如く",
):
  jtts(sentence)

In [None]:
jtts("吾輩は猫である。名前はまだない")
jtts("試験勉強頑張ってくださいね")
jtts("私の処女をもらってください")
jtts("なんでこんなに慣れてんのよ。私の方が先に好きだったのに")
jtts("大変に気分がいい")
jtts("私わあやちねねです")
jtts("私のおなにいを見ないでください!")
jtts("それわどうかな")
jtts("授業中に出したら、学校生活終わるなり")