In [15]:
import os

# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Directory that receives everything written during training.
output_path = "tts_train_dir"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(output_path, exist_ok=True)

In [21]:
# Dataset description: which formatter parses the metadata, which metadata
# file lists the training items, and the directory the dataset lives in.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata_train.csv",
    path="../data/",
)

In [22]:
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig

# NOTE(review): the training log recorded in this notebook reports several
# phoneme and punctuation characters being discarded as "not found in the
# vocabulary" — consider supplying a character set that covers them.
config = GlowTTSConfig(
    # Batch sizes and data-loader worker counts for training and evaluation.
    batch_size=32,
    eval_batch_size=16,
    num_loader_workers=8,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=100,
    # Text is cleaned and converted to phonemes using the "de-de" language;
    # the phoneme cache is kept inside the training output directory.
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="de-de",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # Logging, precision, output location and checkpoint interval.
    print_step=1,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    save_step=1000,
)

In [23]:
from TTS.utils.audio import AudioProcessor
# Build the audio processor from `config`; the settings it ends up with are
# echoed in this cell's output.
ap = AudioProcessor.init_from_config(config)
# For a custom audio dataset with a different sample rate, override it here:
# ap.sample_rate = 22050

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [24]:
from TTS.tts.utils.text.tokenizer import TTSTokenizer
# init_from_config returns the tokenizer together with a config object; that
# object is rebound to `config`, so every later cell uses the returned version.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [25]:
from TTS.tts.datasets import load_tts_samples
# Evaluation-split settings are taken from the training config so the split
# matches what the rest of the pipeline was configured with.
split_options = {
    "eval_split": True,
    "eval_split_max_size": config.eval_split_max_size,
    "eval_split_size": config.eval_split_size,
}
train_samples, eval_samples = load_tts_samples(dataset_config, **split_options)

 | > Found 12283 files in J:\Text to Speech\data


In [26]:
from TTS.tts.models.glow_tts import GlowTTS
# Assemble the GlowTTS model from the config, audio processor and tokenizer.
# No speaker manager is supplied (speaker_manager=None).
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

In [27]:
from trainer import Trainer, TrainerArgs
# Trainer arguments are left at their defaults.
trainer_args = TrainerArgs()
# Wire the config, output directory, model and both sample sets together.
trainer = Trainer(
    trainer_args,
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 > Start Tensorboard: tensorboard --logdir=tts_train_dir\run-May-19-2023_04+08PM-c81bf18

 > Model has 28610257 parameters


In [28]:
# Start training.
# NOTE(review): the run recorded below stopped with a PermissionError
# (WinError 32) on the events.out.tfevents file in the run directory while
# phonemes were still being pre-computed. The root cause is not visible in
# this notebook — investigate before relying on this run.
trainer.fit()


[4m[1m > EPOCH: 0/100[0m
 --> tts_train_dir\run-May-19-2023_04+08PM-c81bf18


[*] Pre-computing phonemes...


  0%|          | 1/12161 [00:00<58:23,  3.47it/s]

das zɔl nuːn nɔɔʏ̯ə ʔaoːfɡaːbən bəkɔmən.
 [!] Character '̯' not found in the vocabulary. Discarding it.


  0%|          | 5/12161 [00:01<42:45,  4.74it/s]  

zaɪ̯nə kaʁʁiːʁeː vaːɐ t͡suː ʔɛndeː, nɔx bəfoːɐ ziː bəɡɔnən hatə.
 [!] Character '͡' not found in the vocabulary. Discarding it.


  4%|▍         | 468/12161 [01:09<24:20,  8.01it/s]

ʊnt vas zɪnt diː ʃøːnstən“ ɛɐʔaɪ̯ɡnɪsə ɪm leːbən?
 [!] Character '“' not found in the vocabulary. Discarding it.


  7%|▋         | 905/12161 [02:04<23:10,  8.10it/s]

vaɪ̯naxtən — ʔatvɛntskalɛndɐ zɛlbɐ bastəln ɪst ʃøːn.
 [!] Character '—' not found in the vocabulary. Discarding it.


  8%|▊         | 914/12161 [02:05<23:31,  7.97it/s]

baɪ̯də ɡeːɡnɐ zɪnt nɔx ʔʊnɡəʃlaːɡən ʔɪn diːzəʁ bʊndɛsliːɡa zɛzõː.
 [!] Character 'õ' not found in the vocabulary. Discarding it.


  8%|▊         | 937/12161 [02:08<21:30,  8.70it/s]

ɡʁyːnəs lɪçt, nɪçt dʊŋkeːlɡʁyːn ʔoːdɐ oːʁãːʃ.
 [!] Character 'ã' not found in the vocabulary. Discarding it.


 16%|█▌        | 1894/12161 [03:57<21:27,  7.97it/s]


PermissionError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: 'J:/Text to Speech/notebooks/tts_train_dir/run-May-19-2023_04+08PM-c81bf18\\events.out.tfevents.1684505289.Kai-PC'

In [None]:
!pip install tensorboard
!tensorboard --logdir=tts_train_dir

In [None]:
import glob, os

output_path = "tts_train_dir"
# Collect every checkpoint and JSON config found one directory level below
# the output directory.
ckpts = sorted(glob.glob(output_path + "/*/*.pth"))
configs = sorted(glob.glob(output_path + "/*/*.json"))

# The synthesis cell interpolates `test_ckpt` and `test_config` into the
# `tts` command line, so they have to exist. Use the most recently written
# checkpoint and a config from the same run directory.
test_ckpt = max(ckpts, key=os.path.getmtime) if ckpts else None
run_configs = (
    [c for c in configs if os.path.dirname(c) == os.path.dirname(test_ckpt)]
    if test_ckpt is not None
    else []
)
# Prefer a file named config.json when the run directory holds several JSON
# files (presumably the training config — verify against the run folder).
test_config = next(
    (c for c in run_configs if os.path.basename(c) == "config.json"),
    run_configs[0] if run_configs else None,
)
if test_ckpt is None or test_config is None:
    print(f"No checkpoint/config pair found under {output_path!r}; train the model first.")

In [None]:
 !tts --text "Text for TTS" \
    --model_path $test_ckpt \
    --config_path $test_config \
    --out_path out.wav

In [None]:
from IPython.display import Audio

# Embed an audio player for the synthesized file as this cell's output.
Audio("out.wav")