defaults:
  - _self_
  - cluster: bcm # Set to bcm for BCM and BCP clusters. Set to k8s for a k8s cluster.
  - data_curation: common_crawl/curate_common_crawl
  - data_preparation: gpt3/download_gpt3_pile # steerlm/steerlm_data_prep1 or steerlm/steerlm_data_prep2_reg
  - training: gpt3/5b
  - conversion: gpt3/convert_gpt3
  - conversion_hf2nemo: hf_llama2/convert_llama2_nemo
  - fw_inference: null
  - external_conversion: null
  - fine_tuning: null
  - generic: null
  - peft: null
  - prompt_learning: null
  - adapter_learning: null
  - ia3_learning: null
  - evaluation: gpt3/evaluate_all
  - export: gpt3/export_gpt3
  - rlhf_rm: gpt3/2b_rm
  - rlhf_ppo: gpt3/2b_ppo
  - steerlm_reg: ac_sft/gpt_sft # either rw_sft/training_rm or ac_sft/gpt_sft
  - ptq: model/quantization
  - rag_indexing: bert/340m
  - rag_generating: gpt3/125m
  - override hydra/job_logging: stdout
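
# Any choice above can be overridden at launch time with Hydra's command-line
# composition syntax instead of editing this file. A minimal sketch, assuming
# the launcher entrypoint is main.py:
#   python main.py training=gpt3/175b stages=[data_preparation,training]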
hydra:
  run:
    dir: .
  output_subdir: null

debug: False
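
# The hydra block above keeps Hydra out of the way: run.dir stays at the
# current working directory and output_subdir: null skips saving the .hydra
# config snapshot, leaving output locations to the launcher settings below
# (see base_results_dir).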
stages:
  #- data_preparation
  #- training
  - conversion
  #- conversion_hf2nemo
  #- prompt_learning
  #- adapter_learning
  #- peft
  #- ia3_learning
  #- evaluation
  #- export
  #- steerlm_reg
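
# Uncommented stages form the pipeline for this launch; enabling several at
# once (e.g. data_preparation, training, conversion) should chain them in the
# order listed above.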
cluster_type: bcm # bcm, bcp, or k8s. If bcm or k8s, it must match the cluster entry in defaults above.
launcher_scripts_path: ??? # Path to the NeMo Megatron launcher scripts; should end with /launcher_scripts
data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
container_mounts: # List of additional paths to mount in the container. Each is mounted at the same path inside the container.
  - null
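# Example entry (hypothetical host path), replacing the null item:
#   - /lustre/fsw/my_datasets # appears at /lustre/fsw/my_datasets in the container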
container: nvcr.io/nvidia/nemo:24.05
wandb_api_key_file: null # File where the W&B API key is stored. The key must be on the first line.
wandb_api_bcp_secret_key: null # For BCP clusters, read the W&B API key directly from an environment variable set as a BCP secret. The value must match the name of that environment variable, e.g. WANDB_TOKEN.
bcp_no_redirect: True # If True, stdout and stderr are not redirected and appear in the standard logs. If False, stdout and stderr are redirected to individual per-rank files. Ignored for non-BCP clusters.
env_vars:
  NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology
  UCX_IB_PCI_RELAXED_ORDERING: null # Needed to improve Azure performance
  NCCL_IB_PCI_RELAXED_ORDERING: null # Needed to improve Azure performance
  NCCL_IB_TIMEOUT: null # InfiniBand Verbs timeout. Set to 22 for Azure
  NCCL_DEBUG: null # Logging level for NCCL. Set to "INFO" for debug information
  NCCL_PROTO: null # Protocol NCCL will use. Set to "simple" for AWS
  TRANSFORMERS_OFFLINE: 0
  TORCH_NCCL_AVOID_RECORD_STREAMS: 1
  NCCL_NVLS_ENABLE: 0
  NVTE_DP_AMAX_REDUCE_INTERVAL: 0 # Disable FP8 AMAX reduction in the data-parallel domain
  NVTE_ASYNC_AMAX_REDUCTION: 1 # Enable asynchronous FP8 AMAX reduction
  NVTE_FUSED_ATTN: 0 # Disable cuDNN fused attention until we've tested it more
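# Variables set to null above are expected to be left unset, so library
# defaults apply. Any extra key added here is exported to the job environment;
# a hypothetical example: HF_HOME: /shared/hf_cache to relocate the Hugging
# Face cache.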
# GPU Mapping
numa_mapping:
  enable: True # Set to False to disable all mapping (performance will suffer).
  mode: unique_contiguous # One of: all, single, single_unique, unique_interleaved, or unique_contiguous.
  scope: node # Either node or socket.
  cores: all_logical # Either all_logical or single_logical.
  balanced: True # Whether to assign an equal number of physical cores to each process.
  min_cores: 1 # Minimum number of physical cores per process.
  max_cores: 8 # Maximum number of physical cores per process. Can be null to use all available cores.
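# A worked example of the settings above: on a node with 64 physical cores and
# 8 local processes, a balanced split gives 8 cores per process, which already
# matches the max_cores: 8 cap; with 128 physical cores, the 16-core balanced
# share would be trimmed to 8.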
# Do not modify below, use the values above instead.
data_preparation_config: ${hydra:runtime.choices.data_preparation}
data_curation_config: ${hydra:runtime.choices.data_curation}
training_config: ${hydra:runtime.choices.training}
fine_tuning_config: ${hydra:runtime.choices.fine_tuning}
peft_config: ${hydra:runtime.choices.peft}
prompt_learning_config: ${hydra:runtime.choices.prompt_learning}
adapter_learning_config: ${hydra:runtime.choices.adapter_learning}
ia3_learning_config: ${hydra:runtime.choices.ia3_learning}
evaluation_config: ${hydra:runtime.choices.evaluation}
conversion_config: ${hydra:runtime.choices.conversion}
export_config: ${hydra:runtime.choices.export}
rlhf_rm_config: ${hydra:runtime.choices.rlhf_rm}
rlhf_ppo_config: ${hydra:runtime.choices.rlhf_ppo}
steerlm_reg_config: ${hydra:runtime.choices.steerlm_reg}
conversion_hf2nemo_config: ${hydra:runtime.choices.conversion_hf2nemo}
fw_inference_config: ${hydra:runtime.choices.fw_inference}
external_conversion_config: ${hydra:runtime.choices.external_conversion}
ptq_config: ${hydra:runtime.choices.ptq}
rag_indexing_config: ${hydra:runtime.choices.rag_indexing}
rag_generating_config: ${hydra:runtime.choices.rag_generating}
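
# The ${hydra:runtime.choices.<group>} resolver used above evaluates to the
# option name selected for that config group in the defaults list (or via a
# command-line override); with the defaults shipped here, training_config
# resolves to "gpt3/5b", presumably letting the launcher locate each stage's
# config file by name.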