# @package _global_
defaults:
  - /experiment/pile/gpt3m-flash.yaml
  - override /datamodule: wikitext103
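
# This experiment config layers on top of the gpt3m-flash baseline (presumably the
# ~360M-parameter GPT-3-medium recipe from the base training harness) and swaps the
# datamodule to WikiText-103; the keys below override that base config.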

train:
  optimizer:
    lr: 0.0008
    betas: [0.9, 0.95]
    _target_: apex.optimizers.FusedAdam
    adam_w_mode: true
    weight_decay: 0.1
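  # Cosine decay with linear warmup: 200 warmup steps + 19800 decay steps matches the
  # trainer's 20000 max_steps; lr decays from 8e-4 down to lr_min = 8e-5.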
  scheduler:
    lr_min: 0.00008
    _target_: train.optim.timm_lr_scheduler.TimmCosineLRScheduler
    warmup_t: 200
    t_initial: 19800
    t_in_epochs: false
    warmup_prefix: true
    warmup_lr_init: 0.000001

trainer:
  # this interval is in terms of batch_idx not in terms of global_step, so we need
  # to multiply by accumulate_grad_batches
  val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}}
  max_steps: 20000

datamodule:
  batch_size: 8  # per gpu
  batch_size_eval: 32
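  # The effective global batch size is 8 per GPU x number of devices x
  # accumulate_grad_batches; those two factors come from the base config
  # and launch settings rather than this file.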

expt_name: 02-20-based-360m
name: ${.expt_name}

callbacks:
  model_checkpoint:
    dirpath: /var/cr01_data/sabri_data/checkpoints/${expt_name}

resume: True
do_test: True

model:
  _target_: based.models.gpt.GPTLMHeadModel
  _recursive_: false
  config:
    alt_mixer_layers:
      - 1
      - 6
      - 11
      - 16
      - 21
    alt_mixer_2_layers:
      - 2
      - 7
      - 12
      - 17
      - 22
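    # Layer assignment, as the keys suggest: layers 1, 6, 11, 16, 21 use alt_mixer
    # (linear attention), layers 2, 7, 12, 17, 22 use alt_mixer_2 (sliding-window
    # attention), and the remaining layers of the 27-layer stack fall back to the
    # default mixer (BaseConv) defined next.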
    mixer:
      _target_: based.models.mixers.convolution.BaseConv
      l_max: ${....datamodule.max_length}
      use_bias: True
      expand_proj: 4
      kernel_sizes: 3
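    # BaseConv: a short-convolution mixer with kernel size 3 and an expand_proj factor
    # of 4; l_max is tied to the datamodule's max_length via interpolation.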
    alt_mixer:
      _target_: based.models.mixers.linear_attention.LinearAttention
      l_max: ${....datamodule.max_length}
      feature_map:
        _target_: based.models.mixers.linear_attention.TaylorExp
        input_dim: ${..feature_dim}
      feature_dim: 16
      num_heads: 16
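    # Linear attention using the TaylorExp feature map (the BASED formulation, a
    # second-order Taylor approximation of softmax's exponential kernel), with
    # feature_dim 16 and 16 heads.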
    alt_mixer_2:
      _target_: based.models.mixers.slide_attention.SlidingAttention
      window_size: 128
      num_heads: 16
      causal: true
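    # Causal sliding-window attention restricted to a 128-token local window.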
    n_embd: 1024
    special_initializer: true
    n_head: 16
    n_layer: 27
    _target_: based.models.gpt.GPT2MixerConfig
    rms_norm: true
    fused_mlp: false
    attn_pdrop: 0
    embd_pdrop: 0
    n_positions: 2048
    resid_pdrop: 0
    mlp_fc1_bias: false
    mlp_fc2_bias: false
    fused_bias_fc: true
    out_proj_bias: false
    qkv_proj_bias: false
    use_flash_attn: true
    residual_in_fp32: true
    activation_function: "swiglu"
    rotary_emb_fraction: 1  # flagging
    fused_dropout_add_ln: true
    max_position_embeddings: 0  # flagging
    pad_vocab_size_multiple: 8
    reorder_and_upcast_attn: false
    scale_attn_by_inverse_layer_idx: false
    n_inner: ${eval:2 * ${.n_embd}}
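    # Derived shapes: head dim = n_embd / n_head = 1024 / 16 = 64, and n_inner
    # resolves to 2 * 1024 = 2048 for the swiglu MLP.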