# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified by Myrtle.
# Note that, in general, modifying these config parameters outside the
# recommendations made in the training README may yield an RNN-T model that is
# incompatible with Myrtle's existing hardware-accelerated inference server.
tokenizer:
  sentpiece_model: /datasets/sentencepieces/SENTENCEPIECE.model
  labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
           "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
  sampling: 0.05
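# `sampling: 0.05` most likely sets the probability of SentencePiece subword
# regularization, i.e. sampling an alternative subword segmentation of each
# transcript during training rather than always taking the single best one.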
# For validation we don't trim silence: its effect on WER is very small, it pushes WER on clean and noisy speech
# in different directions, it would be difficult to implement in a streaming system, and a VAD would be better.
input_val:
  audio_dataset: &val_dataset
    sample_rate: &sample_rate 16000
    trim_silence: false
    normalize_transcripts: true
    standardize_wer: true
  filterbank_features: &val_features
    normalize: per_feature
    sample_rate: *sample_rate
    window_size: 0.02
    window_stride: 0.01
    window: hann
    n_fft: 512
    n_filt: &n_filt 80
    dither: 0.00001
    stats_path: /datasets/stats/STATS_SUBDIR
  frame_splicing: &val_splicing
    frame_stacking: 3
    frame_subsampling: 3
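# Taken together with the feature settings above: each 10 ms hop (window_stride)
# yields 80 filterbank values, frame_stacking: 3 concatenates 3 consecutive
# frames into a 240-dim vector (matching rnnt.in_feats below), and
# frame_subsampling: 3 reduces the rate from one vector per 10 ms to one per 30 ms.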
# For training we trim silence, keep only samples with duration <= max_duration
# and transcript length <= max_transcript_len, and apply augmentation
input_train:
  audio_dataset:
    <<: *val_dataset
    trim_silence: true
    max_duration: MAX_DURATION
    max_transcript_len: 450
    speed_perturbation:
      min_rate: 0.85
      max_rate: 1.15
      p: 1.0
  filterbank_features: *val_features
  frame_splicing: *val_splicing
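# The *val_features and *val_splicing aliases reuse the validation pipeline
# verbatim, so training and validation audio pass through identical
# featurization; only the dataset filtering and the augmentation below differ.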
  spec_augment:
    freq_masks: 2
    min_freq: 0
    max_freq: 20
    time_masks: 10
    min_time: 0
    max_time: 0.03
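# SpecAugment as configured: per utterance, 2 frequency masks each spanning
# between min_freq and max_freq (0-20) filterbank bins, plus 10 time masks.
# Given its magnitude, max_time: 0.03 is presumably a fraction of the utterance
# length rather than a frame count, i.e. each time mask covers up to 3% of the
# utterance.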
rnnt:
  in_feats: 240 # n_filt x frame_stacking
  enc_n_hid: 1024
  enc_pre_rnn_layers: 2
  enc_post_rnn_layers: 3
  enc_stack_time_factor: 2
  enc_dropout: 0.1
  enc_batch_norm: false
  enc_freeze: false
  pred_n_hid: 512
  pred_rnn_layers: 2
  pred_dropout: 0.3
  pred_batch_norm: false
  joint_n_hid: 512
  joint_dropout: 0.3
  joint_net_lr_factor: 1.0
  joint_apex_transducer: pack
  joint_apex_relu_dropout: true
  forget_gate_bias: 1.0
  custom_lstm: true
  quantize: false
  enc_rw_dropout: 0.0
  pred_rw_dropout: 0.0
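# Encoder shape sketch (assuming the usual RNN-T stack-time interpretation):
# the 2 pre-RNN layers run at the 30 ms spliced frame rate,
# enc_stack_time_factor: 2 then concatenates adjacent frame pairs, and the
# 3 post-RNN layers run at 60 ms per frame with hidden size enc_n_hid: 1024.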
grad_noise_scheduler:
  noise_level: 0.0
  decay_const: 0.55
  start_step: 2000
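# With noise_level: 0.0 gradient noise is effectively disabled. If enabled,
# decay_const: 0.55 matches the gamma of the annealed Gaussian gradient-noise
# schedule of Neelakantan et al. (2015), variance ~ eta / (1 + t)^0.55,
# presumably applied from start_step onwards.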