# @package _global_
defaults:
  - /experiment/pile/gpt3m-flash.yaml
  - override /datamodule: wikitext103
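
# This experiment config layers on top of the gpt3m-flash baseline (presumably the
# ~360M-parameter GPT-3-medium recipe from the base training harness) and swaps the
# datamodule to WikiText-103; the keys below override that base config.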

train:
  optimizer:
    lr: 0.0008
    betas: [0.9, 0.95]
    _target_: apex.optimizers.FusedAdam
    adam_w_mode: true
    weight_decay: 0.1
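  # Cosine decay with linear warmup: 200 warmup steps + 19800 decay steps matches the
  # trainer's 20000 max_steps; lr decays from 8e-4 down to lr_min = 8e-5.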
  scheduler:
    lr_min: 0.00008
    _target_: train.optim.timm_lr_scheduler.TimmCosineLRScheduler
    warmup_t: 200
    t_initial: 19800
    t_in_epochs: false
    warmup_prefix: true
    warmup_lr_init: 0.000001

trainer:
  # this interval is in terms of batch_idx not in terms of global_step, so we need
  # to multiply by accumulate_grad_batches
  val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}}
  max_steps: 20000

datamodule:
  batch_size: 8  # per gpu
  batch_size_eval: 32
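  # The effective global batch size is 8 per GPU x number of devices x
  # accumulate_grad_batches; those two factors come from the base config
  # and launch settings rather than this file.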

expt_name: 02-20-based-360m
name: ${.expt_name}

callbacks:
  model_checkpoint:
    dirpath: /var/cr01_data/sabri_data/checkpoints/${expt_name}

resume: True
do_test: True

model:
  _target_: based.models.gpt.GPTLMHeadModel
  _recursive_: false
  config:
    alt_mixer_layers:
      - 1
      - 6
      - 11
      - 16
      - 21
    alt_mixer_2_layers:
      - 2
      - 7
      - 12
      - 17
      - 22
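    # Layer assignment, as the keys suggest: layers 1, 6, 11, 16, 21 use alt_mixer
    # (linear attention), layers 2, 7, 12, 17, 22 use alt_mixer_2 (sliding-window
    # attention), and the remaining layers of the 27-layer stack fall back to the
    # default mixer (BaseConv) defined next.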
    mixer:
      _target_: based.models.mixers.convolution.BaseConv
      l_max: ${....datamodule.max_length}
      use_bias: True
      expand_proj: 4
      kernel_sizes: 3
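    # BaseConv: a short-convolution mixer with kernel size 3 and an expand_proj factor
    # of 4; l_max is tied to the datamodule's max_length via interpolation.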
    alt_mixer:
      _target_: based.models.mixers.linear_attention.LinearAttention
      l_max: ${....datamodule.max_length}
      feature_map:
        _target_: based.models.mixers.linear_attention.TaylorExp
        input_dim: ${..feature_dim}
      feature_dim: 16
      num_heads: 16
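    # Linear attention using the TaylorExp feature map (the BASED formulation, a
    # second-order Taylor approximation of softmax's exponential kernel), with
    # feature_dim 16 and 16 heads.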
    alt_mixer_2:
      _target_: based.models.mixers.slide_attention.SlidingAttention
      window_size: 128
      num_heads: 16
      causal: true
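    # Causal sliding-window attention restricted to a 128-token local window.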
    n_embd: 1024
    special_initializer: true
    n_head: 16
    n_layer: 27
    _target_: based.models.gpt.GPT2MixerConfig
    rms_norm: true
    fused_mlp: false
    attn_pdrop: 0
    embd_pdrop: 0
    n_positions: 2048
    resid_pdrop: 0
    mlp_fc1_bias: false
    mlp_fc2_bias: false
    fused_bias_fc: true
    out_proj_bias: false
    qkv_proj_bias: false
    use_flash_attn: true
    residual_in_fp32: true
    activation_function: "swiglu"
    rotary_emb_fraction: 1  # flagging
    fused_dropout_add_ln: true
    max_position_embeddings: 0  # flagging
    pad_vocab_size_multiple: 8
    reorder_and_upcast_attn: false
    scale_attn_by_inverse_layer_idx: false
    n_inner: ${eval:2 * ${.n_embd}}
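    # Derived shapes: head dim = n_embd / n_head = 1024 / 16 = 64, and n_inner
    # resolves to 2 * 1024 = 2048 for the swiglu MLP.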