# megatron_t5_config.yaml
# Top-level name of this configuration.
name: megatron_t5

# Used when starting from a .nemo file.
restore_from_path: null
# Trainer settings: devices, precision, step/epoch limits, validation cadence and gradient clipping.
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False  # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: 1000  # PTL default. In practice, max_steps will be reached first.
  # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  max_steps: 100000
  log_every_n_steps: 10
  val_check_interval: 100
  limit_val_batches: 50
  limit_test_batches: 500
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  benchmark: False
# Experiment manager settings: log directories, W&B logger, resume behavior and checkpoint callback.
exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t5
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False  # saves nemo file during validation, not implemented for model parallel
    filename: 'megatron_t5--{val_loss:.2f}-{step}-{consumed_samples}'
    # Resolved through the `multiply` interpolation from the two parallel sizes set under `model`.
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
# Model settings: parallelism, architecture, tokenizer, precision, data and optimizer.
model:
  # model parallelism
  micro_batch_size: 4
  global_batch_size: 8  # will use more micro batches to reach global batch size
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null  # manually set the checkpoint file to load from
  pipeline_model_parallel_split_rank: 0  # rank at which decoder starts.

  # model architecture
  make_vocab_size_divisible_by: 128  # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True  # add embedding
  post_process: True  # add pooler

  # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
  megatron_amp_O2: False
  grad_allreduce_chunk_size_mb: 125

  seq_length: 512
  max_position_embeddings: ${.seq_length}
  # For perceiver models, this is the number of cross-attention blocks. Each layer has
  # 1 cross-attention and "num_self_attention_per_cross_attention" self-attention layers.
  num_layers: 12
  hidden_size: 768
  ffn_hidden_size: 3072  # Transformer FFN hidden size. Usually 4 * hidden_size.
  num_attention_heads: 12
  # Standard deviation of the zero mean normal distribution used for weight initialization.
  init_method_std: 0.02
  hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
  attention_dropout: 0.1  # Dropout probability in the attention layer.
  position_embedding_type: 'learned_absolute'  # Position embedding type. Options ['learned_absolute', 'relative']
  relative_attention_num_buckets: 32  # Relative position number of buckets for computing the bias
  relative_attention_max_distance: 128  # max_distance to keep relative distance in the attention_num_buckets.
  # Whether to only use relative position bias for self attention only.
  relative_position_bias_self_attention_only: True
  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
  kv_channels: null
  apply_query_key_layer_scaling: True  # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  persist_layer_norm: True  # Use of persistent fused layer norm kernel.
  # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
  gradient_as_bucket_view: True
  # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
  bias_activation_fusion: True
  grad_div_ar_fusion: True  # Fuse grad division into torch.distributed.all_reduce
  masked_softmax_fusion: True  # Use a kernel that fuses the attention softmax with its mask.
  # Use a kernel that fuses the bias addition, dropout and residual connection addition.
  bias_dropout_add_fusion: True
  bias: True  # Whether to use bias terms in all weight matrices.
  normalization: 'layernorm'  # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
  encoder_arch: 'transformer'  # Options: ['transformer', 'perceiver']
  decoder_arch: 'transformer'  # Options: ['transformer']
  activation: 'gelu'  # Options ['gelu', 'geglu', 'swiglu', 'reglu']
  # Whether to learn extra parameters that scale the output of each self-attention head.
  headscale: False
  transformer_block_type: 'pre_ln'  # Options ['pre_ln', 'post_ln', 'normformer']
  hidden_steps: 32  # Number of latent vectors to use for perceiver encoders
  num_self_attention_per_cross_attention: 1  # Number of self-attention layers for every cross-attention layer.
  share_token_embeddings: True  # If True share encoder/decoder embeddings
  # If True share decoder embeddings and decoder projection to logits
  share_decoder_tokens_head_embeddings: True

  tokenizer:
    library: 'megatron'
    type: 'BertWordPieceCase'
    model: null
    vocab_file: null
    merge_file: null
    num_sentinel_tokens: 100

  # precision
  native_amp_init_scale: 4294967296  # 2 ** 32
  native_amp_growth_interval: 1000
  fp32_residual_connection: False  # Move residual connections to fp32
  fp16_lm_cross_entropy: False  # Move the cross entropy unreduced loss calculation for lm head to fp16

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False  # Init weights on the CPU (slow for large models)
  onnx_safe: False  # Use work-arounds for known problems with Torch ONNX exporter.
  # Python logging level displays logs with severity greater than or equal to this
  apex_transformer_log_level: 30

  # not implemented in NeMo yet
  activations_checkpoint_method: null  # 'uniform', 'block'
  activations_checkpoint_num_layers: 1

  data:
    # Path to data must be specified by the user.
    # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
    # Or see example below:
    # data_prefix:
    #   - .5
    #   - /raid/data/pile/my-t5_00_text_document
    #   - .5
    #   - /raid/data/pile/my-t5_01_text_document
    data_prefix: ???
    # path to save index mapping .npy files, by default will save in the same location as data_prefix
    index_mapping_dir: null
    data_impl: mmap  # mmap, retmmap, text_mmap, csv_mmap
    # data_impl_kwargs:  # currently used only for text_mmap, csv_mmap (should be data_impl dependent)
    #   # defaults for text_memmap
    #   newline_int: 10  # byte-value of newline (Use ord('\n') to get value)
    #   header_lines: 0  # skip first N header lines
    #   workers: null  # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #   sort_dataset_paths: False  # if True datasets will be sorted by name
    #   # defaults for csv_memmap
    #   newline_int: 10  # byte-value of newline
    #   header_lines: 1  # skip first N header lines
    #   workers: null  # number of workers when creating missing index files (null defaults to cpu_num // 2)
    #   sort_dataset_paths: False  # if True datasets will be sorted by name
    #   data_col: 1  # column to use for data
    #   data_sep: ','  # string to split text into columns
    splits_string: 949,45,5
    seq_length: ${model.seq_length}
    seq_length_dec: 128
    skip_warmup: True
    num_workers: 0
    dataloader_type: single  # cyclic
    masked_lm_prob: 0.15
    dataset_type: 't5'
    short_seq_prob: 0.0
    max_ngram_size: 10
    mean_ngram_size: null
    geometric_dist: True
    permutation: False
    whole_word_masking: True
    favor_longer_ngrams: False
    # If true, a single training example cannot cross document boundaries,
    # increasing the fraction of <pad> tokens within a batch.
    respect_document_boundaries: True

  optim:
    name: fused_adam
    lr: 0.0001
    betas:
      - 0.9
      - 0.999
    eps: 1e-8
    weight_decay: 0.01
    sched:
      name: WarmupAnnealing
      min_lr: 0.00001
      last_epoch: -1
      warmup_ratio: 0.01