-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into multi_adapter
- Loading branch information
Showing
1 changed file
with
247 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,247 @@ | ||
# This config contains the default values for training FastPitch model with aligner using 44.1KHz sampling | ||
# rate. If you want to train model on other dataset, you can change config values according to your dataset. | ||
# Most dataset-specific arguments are in the head of the config file, see below. | ||
|
||
|
||
name: FastPitch | ||
|
||
train_dataset: ??? | ||
validation_datasets: ??? | ||
sup_data_path: ??? | ||
sup_data_types: [ "align_prior_matrix", "pitch" ] | ||
|
||
# Default values from librosa.pyin | ||
pitch_fmin: 65.40639132514966 | ||
pitch_fmax: 2093.004522404789 | ||
|
||
# Stats (per frame), train (these values depend on pitch_fmin and pitch_fmax) | ||
pitch_mean: 212.35873413085938 | ||
pitch_std: 68.52806091308594 | ||
|
||
sample_rate: 44100 | ||
n_mel_channels: 80 | ||
n_window_size: 2048 | ||
n_window_stride: 512 | ||
n_fft: 2048 | ||
lowfreq: 0 | ||
highfreq: 8000 | ||
window: hann | ||
|
||
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.01" | ||
heteronyms_path: "scripts/tts_dataset_files/heteronyms-030921" | ||
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv" | ||
|
||
model: | ||
learn_alignment: true | ||
bin_loss_warmup_epochs: 100 | ||
|
||
n_speakers: 1 | ||
max_token_duration: 75 | ||
symbols_embedding_dim: 384 | ||
pitch_embedding_kernel_size: 3 | ||
|
||
pitch_fmin: ${pitch_fmin} | ||
pitch_fmax: ${pitch_fmax} | ||
|
||
pitch_mean: ${pitch_mean} | ||
pitch_std: ${pitch_std} | ||
|
||
sample_rate: ${sample_rate} | ||
n_mel_channels: ${n_mel_channels} | ||
n_window_size: ${n_window_size} | ||
n_window_stride: ${n_window_stride} | ||
n_fft: ${n_fft} | ||
lowfreq: ${lowfreq} | ||
highfreq: ${highfreq} | ||
window: ${window} | ||
|
||
text_normalizer: | ||
_target_: nemo_text_processing.text_normalization.normalize.Normalizer | ||
lang: en | ||
input_case: cased | ||
whitelist: ${whitelist_path} | ||
|
||
text_normalizer_call_kwargs: | ||
verbose: false | ||
punct_pre_process: true | ||
punct_post_process: true | ||
|
||
text_tokenizer: | ||
_target_: nemo.collections.tts.torch.tts_tokenizers.EnglishPhonemesTokenizer | ||
punct: true | ||
stresses: true | ||
chars: true | ||
apostrophe: true | ||
pad_with_space: true | ||
g2p: | ||
_target_: nemo.collections.tts.torch.g2ps.EnglishG2p | ||
phoneme_dict: ${phoneme_dict_path} | ||
heteronyms: ${heteronyms_path} | ||
phoneme_probability: 0.5 | ||
|
||
train_ds: | ||
dataset: | ||
_target_: nemo.collections.tts.torch.data.TTSDataset | ||
manifest_filepath: ${train_dataset} | ||
sample_rate: ${model.sample_rate} | ||
sup_data_path: ${sup_data_path} | ||
sup_data_types: ${sup_data_types} | ||
n_fft: ${model.n_fft} | ||
win_length: ${model.n_window_size} | ||
hop_length: ${model.n_window_stride} | ||
window: ${model.window} | ||
n_mels: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
max_duration: null | ||
min_duration: 0.1 | ||
ignore_file: null | ||
trim: false | ||
pitch_fmin: ${model.pitch_fmin} | ||
pitch_fmax: ${model.pitch_fmax} | ||
pitch_norm: true | ||
pitch_mean: ${model.pitch_mean} | ||
pitch_std: ${model.pitch_std} | ||
use_beta_binomial_interpolator: true | ||
dataloader_params: | ||
drop_last: false | ||
shuffle: true | ||
batch_size: 32 | ||
num_workers: 12 | ||
|
||
|
||
validation_ds: | ||
dataset: | ||
_target_: nemo.collections.tts.torch.data.TTSDataset | ||
manifest_filepath: ${validation_datasets} | ||
sample_rate: ${model.sample_rate} | ||
sup_data_path: ${sup_data_path} | ||
sup_data_types: ${sup_data_types} | ||
n_fft: ${model.n_fft} | ||
win_length: ${model.n_window_size} | ||
hop_length: ${model.n_window_stride} | ||
window: ${model.window} | ||
n_mels: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
max_duration: null | ||
min_duration: null | ||
ignore_file: null | ||
trim: false | ||
pitch_fmin: ${model.pitch_fmin} | ||
pitch_fmax: ${model.pitch_fmax} | ||
pitch_norm: true | ||
pitch_mean: ${model.pitch_mean} | ||
pitch_std: ${model.pitch_std} | ||
use_beta_binomial_interpolator: true | ||
|
||
dataloader_params: | ||
drop_last: false | ||
shuffle: false | ||
batch_size: 32 | ||
num_workers: 8 | ||
|
||
preprocessor: | ||
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor | ||
features: ${model.n_mel_channels} | ||
lowfreq: ${model.lowfreq} | ||
highfreq: ${model.highfreq} | ||
n_fft: ${model.n_fft} | ||
n_window_size: ${model.n_window_size} | ||
window_size: false | ||
n_window_stride: ${model.n_window_stride} | ||
window_stride: false | ||
pad_to: 1 | ||
pad_value: 0 | ||
sample_rate: ${model.sample_rate} | ||
window: ${model.window} | ||
normalize: null | ||
preemph: null | ||
dither: 0.0 | ||
frame_splicing: 1 | ||
log: true | ||
log_zero_guard_type: add | ||
log_zero_guard_value: 1e-05 | ||
mag_power: 1.0 | ||
|
||
input_fft: #n_embed and padding_idx are added by the model | ||
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder | ||
n_layer: 6 | ||
n_head: 1 | ||
d_model: ${model.symbols_embedding_dim} | ||
d_head: 64 | ||
d_inner: 1536 | ||
kernel_size: 3 | ||
dropout: 0.1 | ||
dropatt: 0.1 | ||
dropemb: 0.0 | ||
d_embed: ${model.symbols_embedding_dim} | ||
|
||
output_fft: | ||
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder | ||
n_layer: 6 | ||
n_head: 1 | ||
d_model: ${model.symbols_embedding_dim} | ||
d_head: 64 | ||
d_inner: 1536 | ||
kernel_size: 3 | ||
dropout: 0.1 | ||
dropatt: 0.1 | ||
dropemb: 0.0 | ||
|
||
alignment_module: | ||
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder | ||
n_text_channels: ${model.symbols_embedding_dim} | ||
|
||
duration_predictor: | ||
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor | ||
input_size: ${model.symbols_embedding_dim} | ||
kernel_size: 3 | ||
filter_size: 256 | ||
dropout: 0.1 | ||
n_layers: 2 | ||
|
||
pitch_predictor: | ||
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor | ||
input_size: ${model.symbols_embedding_dim} | ||
kernel_size: 3 | ||
filter_size: 256 | ||
dropout: 0.1 | ||
n_layers: 2 | ||
|
||
optim: | ||
name: lamb | ||
lr: 1e-1 | ||
betas: [0.9, 0.98] | ||
weight_decay: 1e-6 | ||
|
||
sched: | ||
name: NoamAnnealing | ||
warmup_steps: 1000 | ||
last_epoch: -1 | ||
d_model: 1 # Disable scaling based on model dim | ||
|
||
trainer: | ||
num_nodes: 1 | ||
devices: -1 # number of gpus | ||
accelerator: gpu | ||
strategy: ddp | ||
precision: 16 | ||
max_epochs: 1500 | ||
accumulate_grad_batches: 1 | ||
gradient_clip_val: 1000.0 | ||
enable_checkpointing: False # Provided by exp_manager | ||
logger: False # Provided by exp_manager | ||
log_every_n_steps: 100 | ||
check_val_every_n_epoch: 5 | ||
|
||
exp_manager: | ||
exp_dir: null | ||
name: ${name} | ||
create_tensorboard_logger: True | ||
create_checkpoint_callback: True | ||
checkpoint_callback_params: | ||
monitor: v_loss | ||
resume_if_exists: false | ||
resume_ignore_no_checkpoint: false | ||
|