In [None]:
#@markdown # Check GPU type
#@markdown ### Factory reset runtime if you don't have the desired GPU.

#@markdown ---

#@markdown ## It is recommended to not use the K80

# Print one line per GPU attached to this runtime (nvidia-smi's -L option).
!nvidia-smi -L
#@markdown All GPUs work properly, but vary in speed. K80 and P4 are not recommended.

#@markdown ---

In [None]:
#@markdown # Anti-Disconnect for Google Colab
#@markdown ## Run this to stop it from disconnecting automatically (will disconnect after 4+ hours, though.)

# Import the submodule explicitly so `IPython.display` is guaranteed to be
# loaded, and call its `display` directly instead of relying on the builtin
# that the notebook shell injects.
import IPython.display

# Once a minute, click the toolbar "connect" button.
# The null check stops the timer callback from throwing a TypeError on every
# tick when the selector no longer matches anything on the page.
js_code = '''
function ClickConnect(){
console.log("Working");
var connectButton = document.querySelector("colab-toolbar-button#connect");
if (connectButton) {
connectButton.click();
}
}
setInterval(ClickConnect,60000)
'''
IPython.display.display(IPython.display.Javascript(js_code))

In [None]:
#@markdown ## Mount your Google Drive

#Google Drive Authentication Token
from google.colab import drive
# Mount at an absolute path so the mount point does not depend on the current
# working directory; the training cell's default output_directory lives under
# /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
#@markdown # Download pretrained models and install tacotron 2

#make wavs folder
!mkdir '/content/wavs'

#get ntk japanese pretrained model
!gdown 1-5ULOICIs_BOndoqlVFB0BMjmvwiqGvE
#get french pretrained model
!gdown 1--lPwGhqFkqFZrd04Qhm90ndrepXifCf
#get talqu pretrained model
!gdown 1j986QrB1C-tY4GLq806xMBfMWVO3YKY8
#get mandarin pretrained model
!gdown 1lavjPjHtYAoe4qqOralsK9doCKIxd9s5

#download tacotron 2
!git clone -q https://github.com/NVIDIA/tacotron2
!pip install unidecode
!pip install tensorflow==1.15

In [None]:
#@markdown # Optional: Unzip file to unpack wavs
#@markdown ### If you have a lot of wav files, then zip them all into one file locally on your system, then upload it and copy the path. Otherwise, you may just upload your wavs to wavs/.
#@markdown ---

zip_file_path = "/content/cleaned.zip" #@param {type:"string"}
!unzip $zip_file_path -d '/content/wavs'

In [None]:
#@markdown # Patch cleaners.py

%%writefile /content/tacotron2/text/cleaners.py
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from .numbers import normalize_numbers


# Matches any run of one or more whitespace characters:
_whitespace_re = re.compile(r'\s+')

# (compiled pattern, expansion) pairs. Each pattern matches the abbreviation
# at a word boundary followed by a literal period, ignoring case:
_abbreviations = [
  (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
  for abbreviation, expansion in (
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
  )
]


def expand_abbreviations(text):
  '''Replaces every known "abbr." occurrence in text with its expansion.

  The patterns are applied one after another in table order, each on the
  result of the previous substitution.
  '''
  for pattern, expansion in _abbreviations:
    text = pattern.sub(expansion, text)
  return text


def expand_numbers(text):
  '''Returns text after passing it through normalize_numbers.'''
  normalized = normalize_numbers(text)
  return normalized


def lowercase(text):
  '''Returns a lower-cased copy of text.'''
  lowered = text.lower()
  return lowered


def collapse_whitespace(text):
  '''Replaces each run of whitespace in text with a single space.'''
  return _whitespace_re.sub(' ', text)


def convert_to_ascii(text):
  '''Returns the result of running text through unidecode.'''
  ascii_text = unidecode(text)
  return ascii_text


def basic_cleaners(text):
  '''Lowercases text, then collapses whitespace runs; no transliteration.'''
  return collapse_whitespace(lowercase(text))


def transliteration_cleaners(text):
  '''Transliterates text to ASCII, lowercases it, then collapses whitespace.'''
  ascii_text = convert_to_ascii(text)
  return collapse_whitespace(lowercase(ascii_text))


def english_cleaners(text):
  '''English pipeline: ASCII conversion, lowercasing, number and abbreviation
  expansion, and finally whitespace collapsing, applied in that order.'''
  pipeline = (
    convert_to_ascii,
    lowercase,
    expand_numbers,
    expand_abbreviations,
    collapse_whitespace,
  )
  for step in pipeline:
    text = step(text)
  return text

def return_text(text):
  '''Pass-through cleaner: hands text back without modifying it.'''
  unchanged = text
  return unchanged

In [None]:
#@markdown # Patch symbols.py

%%writefile /content/tacotron2/text/symbols.py
""" from https://github.com/keithito/tacotron """

'''
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
from text import cmudict

# Character sets from which the exported `symbols` list is assembled.
# Each character of a string becomes one entry of that list.
_pad        = '_'
# NOTE(review): the apostrophe in _punctuation also occurs twice in _letters,
# so `symbols` contains duplicate entries. hparams.py sets n_symbols to
# len(symbols), so confirm before deduplicating.
_punctuation = '!\'(),.:;? «»'
_special = '-'
# ASCII upper- and lowercase letters followed by non-ASCII phonetic characters,
# modifier marks and arrows (presumably IPA for the IPA-transcribed data; verify).
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# ARPAbet entries carry an "@" prefix so they stay distinct from the plain
# uppercase letters, some of which share the same spelling:
_arpabet = ['@' + arpabet_symbol for arpabet_symbol in cmudict.valid_symbols]

# Full symbol inventory, in order: pad, special, punctuation, letters, ARPAbet.
symbols = [_pad, *_special, *_punctuation, *_letters, *_arpabet]


In [None]:
#@markdown This is for your training configuration (hparams.py)

%%writefile /content/tacotron2/hparams.py
import tensorflow as tf
from text import symbols

# Path of the transcript list; create_hparams passes it as both
# training_files and validation_files.
transcription='/content/list_ntk.txt'#@param {type:'string'}
# Value that create_hparams passes as the batch_size hyperparameter.
batchsize=2#@param {type: 'integer'}

def create_hparams(hparams_string=None, verbose=False):
    """Build the HParams object holding the default training configuration.

    Args:
        hparams_string: optional override string; when truthy it is logged
            and handed to ``hparams.parse``.
        verbose: when true, the final hyperparameter values are logged.

    Returns:
        The ``tf.contrib.training.HParams`` instance.
    """

    ################################
    # Experiment Parameters        #
    ################################
    experiment_params = dict(
        epochs=5000,
        iters_per_checkpoint=100,
        seed=1234,
        dynamic_loss_scaling=True,
        fp16_run=False,
        distributed_run=False,
        dist_backend="nccl",
        dist_url="tcp://localhost:54321",
        cudnn_enabled=True,
        cudnn_benchmark=False,
        ignore_layers=['embedding.weight'],
    )

    ################################
    # Data Parameters             #
    ################################
    data_params = dict(
        load_mel_from_disk=False,
        training_files=transcription,
        validation_files=transcription,
        text_cleaners=['return_text'],
    )

    ################################
    # Audio Parameters             #
    ################################
    audio_params = dict(
        max_wav_value=32768.0,
        sampling_rate=22050,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    )

    ################################
    # Model Parameters             #
    ################################
    model_params = dict(
        n_symbols=len(symbols),
        symbols_embedding_dim=512,

        # Encoder parameters
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,

        # Decoder parameters
        n_frames_per_step=1,  # currently only 1 is supported
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,

        # Attention parameters
        attention_rnn_dim=1024,
        attention_dim=128,

        # Location Layer parameters
        attention_location_n_filters=32,
        attention_location_kernel_size=31,

        # Mel-post processing network parameters
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
    )

    ################################
    # Optimization Hyperparameters #
    ################################
    optimization_params = dict(
        use_saved_learning_rate=False,
        learning_rate=1e-3,
        weight_decay=1e-6,
        grad_clip_thresh=1.0,
        batch_size=batchsize,  # if you have the T4, set this to 14 or less
        mask_padding=True,  # set model's padded outputs to padded values
    )

    # Sections are unpacked in their original order, so the keyword sequence
    # seen by HParams matches a single flat call.
    hparams = tf.contrib.training.HParams(
        **experiment_params,
        **data_params,
        **audio_params,
        **model_params,
        **optimization_params
    )

    if hparams_string:
        tf.logging.info('Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.logging.info('Final parsed hparams: %s', hparams.values())

    return hparams

In [None]:
#@markdown # Begin training
#@markdown neuTalk_Japanese_pretrained.pt = neuTalk-formatted Japanese data \
#@markdown FlatBaseModel_frontVoiceIsAkitoTenohira_20210418.pt = TALQu-formatted Japanese data \
#@markdown neuTalk_French-IPA_pretrained.pt = French data transcribed in IPA \
#@markdown neuTalk_Mandarin_pretrained.pt = Mandarin data transcribed with the appropriate g2p

warm_start_model = "neuTalk_Japanese_pretrained.pt" #@param ["neuTalk_Japanese_pretrained.pt", "neuTalk_French-IPA_pretrained.pt", "FlatBaseModel_frontVoiceIsAkitoTenohira_20210418.pt", "neuTalk_Mandarin_pretrained.pt"]
output_directory = "/content/drive/MyDrive/jsut_loanwords128" #@param{type:'string'}

!python /content/tacotron2/train.py --log_directory='/content/logs' -c $warm_start_model --warm_start --output_directory=$output_directory