-
Notifications
You must be signed in to change notification settings - Fork 99
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
JonasGeiping
committed
Jun 13, 2024
1 parent
698e994
commit 2e7e57a
Showing
7 changed files
with
75 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| enabled: True | ||
| entity: YOURNAMEHERE | ||
| entity: jonasgeiping # change this obviously ;> | ||
| project: cramming-pretrain | ||
| tags: [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,17 +1,50 @@ | ||
|
|
||
|
|
||
| # Sanity checks for pytorch issue https://github.com/pytorch/pytorch/issues/96693 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 | ||
|
|
||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl.tf32_allowed=False seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.tf32_allowed=False seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True impl.tf32_allowed=False seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True impl.tf32_allowed=False seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune impl.tf32_allowed=False seed=233 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 | ||
|
|
||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune impl.tf32_allowed=False seed=233 | ||
| # python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs impl.tf32_allowed=False seed=233 | ||
|
|
||
| # all follow the same curve: | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| python pretrain.py name=DA6000amp_b8192_cb_o4_premade_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 | ||
|
|
||
|
|
||
| CUDA_VISIBLE_DEVICES=3 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=4 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 | ||
|
|
||
|
|
||
| CUDA_VISIBLE_DEVICES=5 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_max_autotune_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=5 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_max_autotune_no_cudagraphs_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=6 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=default seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2 | ||
| CUDA_VISIBLE_DEVICES=6 python pretrain.py name=DA6000amp_b8192_cb_o4_with_ramp_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=reduce-overhead seed=233 train.steps=40000 budget=2.4 impl.deterministic=True train.batch_size_ramp=20000 train.scheduler=triangle2 | ||
|
|
||
|
|
||
|
|
||
| # invoke cache skip + cudagraphs: +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True | ||
|
|
||
| CUDA_VISIBLE_DEVICES=3 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_det_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=triangle2 +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True | ||
| CUDA_VISIBLE_DEVICES=4 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_cb_o4_with_nondet_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=triangle2 +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True | ||
|
|
||
|
|
||
| CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_det_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False | ||
| CUDA_VISIBLE_DEVICES=5 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_det_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=True train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True | ||
| CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_nondet_default_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False | ||
| CUDA_VISIBLE_DEVICES=6 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py name=DA6000amp_b8192_const_with_nondet_reduce_overhead_40k arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233 train.steps=40000 budget=100 impl.deterministic=False train.batch_size_ramp=0 train.scheduler=constant +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True | ||
|
|
||
| # torch._dynamo.reset()? | ||
| # torch.compiler.reset()? |