In [1]:
import nemo_run as run
from typing import Optional
import pytorch_lightning as pl
from nemo.collections import llm
from nemo.collections.common.tokenizers import SentencePieceTokenizer

  from .autonotebook import tqdm as notebook_tqdm
      cm = get_cmap("Set1")
    


In [2]:
def slimpajama(
    gbs: int = 256,
    mbs: int = 4,
    seq_length: int = 8192,
) -> run.Config[pl.LightningDataModule]:
    """Build the configuration for the SlimPajama pre-training data module.

    Args:
        gbs: Value passed as ``global_batch_size``.
        mbs: Value passed as ``micro_batch_size``.
        seq_length: Value passed as ``seq_length``.

    Returns:
        A ``run.Config`` targeting ``llm.PreTrainingDataModule`` with the
        dataset path, tokenizer, split string, worker count and index-mapping
        directory fixed to the values below.
    """
    # Tokenizer is described as its own nested config.
    tokenizer_cfg = run.Config(SentencePieceTokenizer, model_path="/data/tokenizer/tokenizer.model")

    # Collect every data-module argument in one place before wrapping it.
    datamodule_kwargs = dict(
        paths=["/data/slimpajama_megatron/concatenated_chunk1.jsonl_text_document"],
        seq_length=seq_length,
        global_batch_size=gbs,
        micro_batch_size=mbs,
        tokenizer=tokenizer_cfg,
        split="99990,8,2",
        num_workers=2,
        index_mapping_dir="/data/index_mapping",
    )
    return run.Config(llm.PreTrainingDataModule, **datamodule_kwargs)

In [3]:
def configure_recipe(nodes: int = 1, gpus_per_node: int = 1):
    """Create the Llama 3 8B pretraining recipe and apply this notebook's overrides.

    Args:
        nodes: Forwarded to the recipe as ``num_nodes``.
        gpus_per_node: Forwarded to the recipe as ``num_gpus_per_node``.

    Returns:
        The object returned by ``llm.llama3_8b.pretrain_recipe`` after the
        model, trainer, data and checkpoint overrides below have been set.
    """
    pretrain = llm.llama3_8b.pretrain_recipe(
        name="llama_pretraining",
        dir="/checkpoints/llama-new",  # Path to store checkpoints
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
    )

    # Model overrides: a single layer with hidden size 128.
    model_cfg = pretrain.model.config
    model_cfg.num_layers = 1
    model_cfg.hidden_size = 128

    # Trainer overrides: 30 steps, validation check interval of 20,
    # context parallel size of 1.
    trainer_cfg = pretrain.trainer
    trainer_cfg.max_steps = 30
    trainer_cfg.val_check_interval = 20
    trainer_cfg.strategy.context_parallel_size = 1

    # Replace the recipe's data module with the SlimPajama configuration.
    pretrain.data = slimpajama(gbs=32, mbs=1)

    pretrain.log.ckpt.save_optim_on_train_end = True
    return pretrain

In [4]:
def local_executor_torchrun(nodes: int = 1, devices: int = 1) -> run.LocalExecutor:
    """Create a local executor that launches the job through torchrun.

    Args:
        nodes: Accepted so callers can pass the recipe's node count; it is not
            used when building the executor.
        devices: Passed as ``ntasks_per_node``. When it is an integer greater
            than 1, it also determines which GPU indices are listed in
            ``CUDA_VISIBLE_DEVICES``.

    Returns:
        A ``run.LocalExecutor`` with the ``torchrun`` launcher and the
        environment variables defined below.
    """
    # List GPU indices 0..devices-1 so the visible GPUs match the number of
    # tasks started per node. A fixed "0" would leave every task sharing the
    # single visible GPU when devices > 1. Any other value (1, or a
    # non-integer) keeps the previous setting of "0".
    if isinstance(devices, int) and not isinstance(devices, bool) and devices > 1:
        visible_gpus = ",".join(str(gpu_index) for gpu_index in range(devices))
    else:
        visible_gpus = "0"

    # Env vars for jobs are configured here
    env_vars = {
        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
        "NEMO_ENV_VARNAME_TESTING": "1",
        "CUDA_VISIBLE_DEVICES": visible_gpus,
    }

    executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
    return executor


In [5]:
def run_pretraining():
    """Configure the recipe, build a matching local executor, and run the job.

    The executor is sized from the recipe's own trainer settings
    (``num_nodes`` and ``devices``).
    """
    pretrain_job = configure_recipe()
    trainer_cfg = pretrain_job.trainer
    launcher = local_executor_torchrun(
        nodes=trainer_cfg.num_nodes,
        devices=trainer_cfg.devices,
    )
    run.run(pretrain_job, executor=launcher)

In [None]:
# Configure the recipe and executor, then launch the pretraining run.
run_pretraining()

Log directory is: /root/.nemo_run/experiments/nemo.collections.llm.api.pretrain/nemo.collections.llm.api.pretrain_1730156175/nemo.collections.llm.api.pretrain


Log directory is: /root/.nemo_run/experiments/nemo.collections.llm.api.pretrain/nemo.collections.llm.api.pretrain_1730156175/nemo.collections.llm.api.pretrain
Launched app: local_persistent://nemo_run/nemo.collections.llm.api.pretrain-s0ccr7w39dv59c
AppStatus:
    State: RUNNING
    Num Restarts: 0
    Roles: 
    Msg: <NONE>
    Structured Error Msg: <NONE>
    UI URL: file:///root/.nemo_run/experiments/nemo.collections.llm.api.pretrain/nemo.collections.llm.api.pretrain_1730156175/nemo.collections.llm.api.pretrain/nemo_run/nemo.collections.llm.api.pretrain-s0ccr7w39dv59c
    


Waiting for job nemo.collections.llm.api.pretrain-s0ccr7w39dv59c to finish [log=True]...


i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188] Starting elastic_operator with launch configs:
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   entrypoint       : nemo_run.core.runners.fdl_runner
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   min_nodes        : 1
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   max_nodes        : 1
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   nproc_per_node   : 1
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   run_id           : 8678
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   rdzv_backend     : c10d
i.pretrain/0 I1028 22:56:16.514000 140701987661632 torch/distributed/launcher/api.py:188]   rdzv_endpoint    : localhost:0
i.pretrain/0 I1028 22:56: