defaults:
  - _self_
  - cluster: bcm # Set to bcm for BCM and BCP clusters. Set to k8s for a k8s cluster.
  - data_curation: common_crawl/curate_common_crawl
  - data_preparation: gpt3/download_gpt3_pile # steerlm/steerlm_data_prep1 or steerlm/steerlm_data_prep2_reg
  - training: gpt3/5b
  - conversion: gpt3/convert_gpt3
  - conversion_hf2nemo: hf_llama2/convert_llama2_nemo
  - fw_inference: null
  - external_conversion: null
  - fine_tuning: null
  - generic: null
  - peft: null
  - prompt_learning: null
  - adapter_learning: null
  - ia3_learning: null
  - evaluation: gpt3/evaluate_all
  - export: gpt3/export_gpt3
  - rlhf_rm: gpt3/2b_rm
  - rlhf_ppo: gpt3/2b_ppo
  - steerlm_reg: ac_sft/gpt_sft # either rw_sft/training_rm or ac_sft/gpt_sft
  - ptq: model/quantization
  - rag_indexing: bert/340m
  - rag_generating: gpt3/125m
  - override hydra/job_logging: stdout
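
# Any choice above can be overridden at launch time with Hydra's command-line
# composition syntax instead of editing this file. A minimal sketch, assuming
# the launcher entrypoint is main.py:
#   python main.py training=gpt3/175b stages=[data_preparation,training]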
hydra:
  run:
    dir: .
  output_subdir: null

debug: False
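
# The hydra block above keeps Hydra out of the way: run.dir stays at the
# current working directory and output_subdir: null skips saving the .hydra
# config snapshot, leaving output locations to the launcher settings below
# (see base_results_dir).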
stages:
  #- data_preparation
  #- training
  - conversion
  #- conversion_hf2nemo
  #- prompt_learning
  #- adapter_learning
  #- peft
  #- ia3_learning
  #- evaluation
  #- export
  #- steerlm_reg
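
# Uncommented stages form the pipeline for this launch; enabling several at
# once (e.g. data_preparation, training, conversion) should chain them in the
# order listed above.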
cluster_type: bcm # bcm, bcp, or k8s. If bcm or k8s, it must match the cluster entry in defaults above.
launcher_scripts_path: ??? # Path to the NeMo Megatron launcher scripts; should end with /launcher_scripts
data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
container_mounts: # List of additional paths to mount in the container. Each is mounted at the same path inside the container.
  - null
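# Example entry (hypothetical host path), replacing the null item:
#   - /lustre/fsw/my_datasets # appears at /lustre/fsw/my_datasets in the container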
container: nvcr.io/nvidia/nemo:24.05
wandb_api_key_file: null # File where the W&B API key is stored. The key must be on the first line.
wandb_api_bcp_secret_key: null # For BCP clusters, read the W&B API key directly from an environment variable set as a BCP secret. The value must match the name of that environment variable, e.g. WANDB_TOKEN.
bcp_no_redirect: True # If True, stdout and stderr are not redirected and appear in the standard logs. If False, stdout and stderr are redirected to individual per-rank files. Ignored for non-BCP clusters.
env_vars:
  NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology
  UCX_IB_PCI_RELAXED_ORDERING: null # Needed to improve Azure performance
  NCCL_IB_PCI_RELAXED_ORDERING: null # Needed to improve Azure performance
  NCCL_IB_TIMEOUT: null # InfiniBand Verbs timeout. Set to 22 for Azure
  NCCL_DEBUG: null # Logging level for NCCL. Set to "INFO" for debug information
  NCCL_PROTO: null # Protocol NCCL will use. Set to "simple" for AWS
  TRANSFORMERS_OFFLINE: 0
  TORCH_NCCL_AVOID_RECORD_STREAMS: 1
  NCCL_NVLS_ENABLE: 0
  NVTE_DP_AMAX_REDUCE_INTERVAL: 0 # Disable FP8 AMAX reduction in the data-parallel domain
  NVTE_ASYNC_AMAX_REDUCTION: 1 # Enable asynchronous FP8 AMAX reduction
  NVTE_FUSED_ATTN: 0 # Disable cuDNN fused attention until we've tested it more
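# Variables set to null above are expected to be left unset, so library
# defaults apply. Any extra key added here is exported to the job environment;
# a hypothetical example: HF_HOME: /shared/hf_cache to relocate the Hugging
# Face cache.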
# GPU Mapping
numa_mapping:
  enable: True # Set to False to disable all mapping (performance will suffer).
  mode: unique_contiguous # One of: all, single, single_unique, unique_interleaved, or unique_contiguous.
  scope: node # Either node or socket.
  cores: all_logical # Either all_logical or single_logical.
  balanced: True # Whether to assign an equal number of physical cores to each process.
  min_cores: 1 # Minimum number of physical cores per process.
  max_cores: 8 # Maximum number of physical cores per process. Can be null to use all available cores.
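# A worked example of the settings above: on a node with 64 physical cores and
# 8 local processes, a balanced split gives 8 cores per process, which already
# matches the max_cores: 8 cap; with 128 physical cores, the 16-core balanced
# share would be trimmed to 8.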
# Do not modify below, use the values above instead.
data_preparation_config: ${hydra:runtime.choices.data_preparation}
data_curation_config: ${hydra:runtime.choices.data_curation}
training_config: ${hydra:runtime.choices.training}
fine_tuning_config: ${hydra:runtime.choices.fine_tuning}
peft_config: ${hydra:runtime.choices.peft}
prompt_learning_config: ${hydra:runtime.choices.prompt_learning}
adapter_learning_config: ${hydra:runtime.choices.adapter_learning}
ia3_learning_config: ${hydra:runtime.choices.ia3_learning}
evaluation_config: ${hydra:runtime.choices.evaluation}
conversion_config: ${hydra:runtime.choices.conversion}
export_config: ${hydra:runtime.choices.export}
rlhf_rm_config: ${hydra:runtime.choices.rlhf_rm}
rlhf_ppo_config: ${hydra:runtime.choices.rlhf_ppo}
steerlm_reg_config: ${hydra:runtime.choices.steerlm_reg}
conversion_hf2nemo_config: ${hydra:runtime.choices.conversion_hf2nemo}
fw_inference_config: ${hydra:runtime.choices.fw_inference}
external_conversion_config: ${hydra:runtime.choices.external_conversion}
ptq_config: ${hydra:runtime.choices.ptq}
rag_indexing_config: ${hydra:runtime.choices.rag_indexing}
rag_generating_config: ${hydra:runtime.choices.rag_generating}
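
# The ${hydra:runtime.choices.<group>} resolver used above evaluates to the
# option name selected for that config group in the defaults list (or via a
# command-line override); with the defaults shipped here, training_config
# resolves to "gpt3/5b", presumably letting the launcher locate each stage's
# config file by name.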