## Import

In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

In [2]:
# Import of basic packages
import numpy as np
import pandas as pd
import warnings
from joblib import dump, load
import torch
import os
import os
from datetime import datetime

# Silence every Python warning so library notices do not flood the cell outputs.
warnings.filterwarnings('ignore')

# Set random seed
RSEED = 0
# Defining the constant alone seeds nothing, so apply it to the RNGs imported above.
# torch.manual_seed returns a Generator object; calling it first keeps that object
# from being echoed as the cell's output (np.random.seed returns None).
torch.manual_seed(RSEED)
np.random.seed(RSEED)

## Configurations

#### Dataset Config

In [3]:
# Base directory for the run; the empty string makes every path joined onto it
# relative to the current working directory.
output_path = ""

# Dataset description: which transcript file to read and the dataset folder.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="transcript.txt",
    path=os.path.join(output_path, "NewTTSDataset/"),
)

#### Audio Config

In [4]:
# Audio settings passed to the model config (sample rate, window/hop sizes, mel range).
audio_config = VitsAudioConfig(
    sample_rate=44100,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)


#### Characters Config

In [5]:
# Symbol inventory: Latin letters and digits, a small punctuation set, and the special tokens.
character_config = CharactersConfig(
    characters_class="TTS.tts.models.vits.VitsCharacters",
    characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
    punctuations=" !,.?-",
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
)


#### Model Config

In [6]:
# Full VITS training configuration, grouped by concern.
config = VitsConfig(
    # --- run identity and data ---
    run_name="logs",
    output_path=output_path,
    datasets=[dataset_config],
    # --- audio and text front end ---
    audio=audio_config,
    characters=character_config,
    text_cleaner="basic_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    max_text_len=250,  # change this if you have a larger VRAM than 16GB
    # --- training schedule and data loading ---
    epochs=1000,
    batch_size=16,
    eval_batch_size=4,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    mixed_precision=True,
    cudnn_benchmark=False,
    # --- evaluation and test synthesis ---
    run_eval=True,
    test_delay_epochs=0,
    test_sentences=[
        ["I live in Berlin."],
        ["Today is a sunny day."],
        ["This is my voice speaking."],
    ],
    # --- logging and checkpoints ---
    print_step=25,
    print_eval=False,
    save_best_after=1000,
    save_checkpoints=True,
    save_all_best=True,
)


## Tokenizer

In [7]:
# Audio processor is used for feature extraction and audio I/O.
# It is created from `config`; the recorded output lists the resolved audio settings.
ap = AudioProcessor.init_from_config(config)

# Tokenizer is used to convert text to sequences of token IDs.
# init_from_config returns a (tokenizer, config) pair; `config` is rebound to the
# returned object here, so every later cell uses that returned config.
tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:44100
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


#### Formatter

In [8]:

def formatter(root_path, manifest_file, **kwargs):
    """Parse a pipe-separated transcript into a list of sample dicts.

    Each non-empty line of the manifest is expected to look like
    ``<file id>|<text>[|...]``; columns after the second are ignored.

    Args:
        root_path: Dataset root directory holding the manifest and a ``wavs`` sub-folder.
        manifest_file: Transcript file name, relative to ``root_path``.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A list of dicts with the keys ``text``, ``audio_file``, ``speaker_name``
        and ``root_path``. ``audio_file`` is ``<root_path>/wavs/<file id>.wav``.

    Raises:
        ValueError: If a non-empty line has fewer than two ``|``-separated columns.
    """
    txt_file = os.path.join(root_path, manifest_file)
    speaker_name = "my_speaker"
    items = []
    # utf-8-sig reads plain UTF-8 as well, but drops a leading BOM so it cannot
    # end up inside the first file id.
    with open(txt_file, "r", encoding="utf-8-sig") as ttf:
        for line_no, raw_line in enumerate(ttf, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no sample.
                continue
            cols = [col.strip() for col in line.split("|")]
            if len(cols) < 2:
                raise ValueError(
                    f"{txt_file}:{line_no}: expected '<file id>|<text>', got {line!r}"
                )
            file_id, text = cols[0], cols[1]
            # Anchor the wav path at root_path so it resolves regardless of the
            # notebook's current working directory.
            wav_file = os.path.join(root_path, "wavs", f"{file_id}.wav")
            items.append(
                {
                    "text": text,
                    "audio_file": wav_file,
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    return items

Sample data taken from: https://github.com/coqui-ai/coqui-voice-pack/releases/tag/v2
<br>Spreadsheet template for creating the transcript: https://docs.google.com/spreadsheets/d/1aJaZdtrtTm3aBEFJOk-8h0vvtiZgryIO8WoY9L60Sdc/edit?usp=sharing

In [9]:
# Load the samples through the custom formatter and hold out a small evaluation split.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    formatter=formatter,
    eval_split=True,
    eval_split_size=0.013,  # temporarily adjusted eval_split_size; revert later
)

 | > Found 77 files in D:\Github\children-stories\text-to-audio\NewTTSDataset


## Training

In [10]:
# Build the VITS model from the config, audio processor and tokenizer created above.
# speaker_manager=None: no speaker manager is supplied to the model.
model = Vits(config, ap, tokenizer, speaker_manager=None)

# Build the trainer with default TrainerArgs, the config returned by the tokenizer
# setup, and the same output_path used for the dataset and config paths.
# The train/eval samples come from the load_tts_samples call.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)


 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Num. of CPUs: 16
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=logs-July-25-2024_03+59PM-d013e1a

 > Model has 83047468 parameters


#### Save model

In [11]:
# NOTE(review): '/model' is an absolute path at the filesystem (or drive) root and has
# no file extension; confirm this location is intended and writable.
model_path = '/model'

# Save the model's state dict (its parameters) to model_path.
# NOTE(review): this cell sits before the trainer.fit() cell, so when cells run in order
# the saved weights are presumably the untrained ones; confirm whether it should run after training.
torch.save(model.state_dict(), model_path)

In [None]:
# Start training with the trainer constructed earlier in the notebook.
# NOTE(review): the recorded output ends in a Windows PermissionError on trainer_0_log.txt;
# presumably the log file was still open when the run folder was accessed; confirm the root cause.
trainer.fit()


[4m[1m > EPOCH: 0/1000[0m
 --> logs-July-25-2024_03+31PM-d013e1a




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 76


PermissionError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: 'd:/Github/children-stories/text-to-audio/logs-July-25-2024_03+31PM-d013e1a\\trainer_0_log.txt'