This is a very raw intro to using the `nemo_evaluator_launcher` API.

Prerequisites:
* Make sure the variables below are set (note that `API_KEY_NAME` is the _name_ of the key, not its value).

In [3]:
import warnings

warnings.simplefilter("ignore")

In [4]:
import tempfile

from omegaconf import OmegaConf

from nemo_evaluator_launcher.api import RunConfig, get_status, get_tasks_list, run_eval

# Endpoint settings that a later cell passes as `target.api_endpoint.url`,
# `target.api_endpoint.model_id` and `target.api_endpoint.api_key_name`.
MODEL_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
MODEL_ID = "nvdev/meta/llama-3.1-8b-instruct"
# This is the *name* of the API key, not its value.
API_KEY_NAME = "NGC_API_TOKEN_NV_DEV"

In [5]:
# Example of fetching the task list; only the first two entries are displayed to keep the output short.
get_tasks_list()[:2]

[['ifeval',
  'chat',
  'lm-evaluation-harness',
  'gitlab-master.nvidia.com/dl/joc/competitive_evaluation/nvidia-core-evals/ci-llm/lm-evaluation-harness:2025-08-08T17-26-da3556fe'],
 ['mmlu_prox',
  'chat',
  'lm-evaluation-harness',
  'gitlab-master.nvidia.com/dl/joc/competitive_evaluation/nvidia-core-evals/ci-llm/lm-evaluation-harness:2025-08-08T17-26-da3556fe']]

In [6]:
# Verify that an `ifeval` entry exists by matching on the first element of each task entry.
print([entry for entry in get_tasks_list() if entry[0] == "ifeval"])

[['ifeval', 'chat', 'lm-evaluation-harness', 'gitlab-master.nvidia.com/dl/joc/competitive_evaluation/nvidia-core-evals/ci-llm/lm-evaluation-harness:2025-08-08T17-26-da3556fe']]


In [7]:
# Create a RunConfig without any overrides and print it as YAML.
# `???` in the printed YAML is OmegaConf's marker for a mandatory value that has not been set.
cfg: RunConfig = RunConfig.from_hydra()
print(OmegaConf.to_yaml(cfg))

execution:
  type: local
  output_dir: ???
deployment:
  type: none
target:
  api_endpoint:
    url: ???
    model_id: ???
    api_key_name: <YOUR_API_KEY_NAME>
evaluation:
  tasks: []



In [8]:
# Example of another deployment type, selected through a hydra-style override.
# Consult the hydra.cc documentation for overrides; it is also strongly recommended to check
# the default composable configurations under `nemo_evaluator_launcher.configs`.
cfg: RunConfig = RunConfig.from_hydra(hydra_overrides=["deployment=vllm"])
print(OmegaConf.to_yaml(cfg))

execution:
  type: local
  output_dir: ???
deployment:
  type: vllm
  image: vllm/vllm-openai:latest
  checkpoint_path: ???
  served_model_name: ???
  port: 8000
  tensor_parallel_size: 8
  pipeline_parallel_size: 1
  data_parallel_size: 1
  extra_args: ''
  env_vars: {}
  endpoints:
    chat: /v1/chat/completions
    completions: /v1/completions
    health: /health
  command: vllm serve ${oc.select:deployment.hf_model_handle,/checkpoint} --tensor-parallel-size=${deployment.tensor_parallel_size}
    --pipeline-parallel-size=${deployment.pipeline_parallel_size} --data-parallel-size=${deployment.data_parallel_size}
    --port ${deployment.port} --trust-remote-code --served-model-name ${deployment.served_model_name}
    --enforce-eager --gpu-memory-utilization 0.95 ${deployment.extra_args}
target:
  api_endpoint:
    url: ???
    model_id: ???
    api_key_name: <YOUR_API_KEY_NAME>
evaluation:
  tasks: []



In [9]:
# Start from the default config and override the parameters needed for a real run.
cfg = RunConfig.from_hydra(
    # Hydra-style string overrides are mainly meant for the CLI; used here for demonstration only.
    hydra_overrides=[f"execution.output_dir={tempfile.mkdtemp()}"],
    # Nested overrides given as a plain dictionary.
    dict_overrides={
        "target": {
            "api_endpoint": dict(
                url=MODEL_URL,
                model_id=MODEL_ID,
                api_key_name=API_KEY_NAME,
            ),
        },
        "evaluation": {
            "tasks": [
                {"name": "ifeval", "overrides": {"config.params.limit_samples": 2}},
            ],
        },
    },
)
print(OmegaConf.to_yaml(cfg))

execution:
  type: local
  output_dir: /tmp/tmpi25vcoci
deployment:
  type: none
target:
  api_endpoint:
    url: https://integrate.api.nvidia.com/v1/chat/completions
    model_id: nvdev/meta/llama-3.1-8b-instruct
    api_key_name: NGC_API_TOKEN_NV_DEV
evaluation:
  tasks:
  - name: ifeval
    overrides:
      config.params.limit_samples: 2



In [10]:
# Launch the evaluation with the config built above; the returned id is passed to
# `get_status` in the next cell.
invocation_id = run_eval(cfg=cfg)


Commands for real-time monitoring:
  tail -f /tmp/tmpi25vcoci/20250919_172014-15b9f667/ifeval/logs/stdout.log

Follow all logs for this invocation:
  tail -f /tmp/tmpi25vcoci/20250919_172014-15b9f667/*/logs/stdout.log


In [11]:
# Query the status for the invocation id returned by `run_eval` and display the result.
stat = get_status(job_ids=[invocation_id])
stat

[{'invocation': '15b9f667',
  'job_id': '15b9f667.0',
  'status': 'running',
  'progress': {'progress': None},
  'data': {'output_dir': '/tmp/tmpi25vcoci/20250919_172014-15b9f667/ifeval',
   'container': 'ifeval-20250919_172014_863200',
   'eval_image': 'gitlab-master.nvidia.com/dl/joc/competitive_evaluation/nvidia-core-evals/ci-llm/lm-evaluation-harness:2025-08-08T17-26-da3556fe'}}]

In [12]:
# List the output directory reported for the first status entry.
!ls {stat[0]["data"]["output_dir"]}

artifacts  logs  run.sh


In [13]:
# List the `logs` sub-directory of that output directory.
!ls {stat[0]["data"]["output_dir"]}/logs

stage.pre-start  stage.running	stdout.log


In [14]:
!cat {stat[0]["data"]["output_dir"]}/logs/stdout.log | tail -n 15

Checking cached requests: 100% 2/2 [00:00<00:00, 10168.01it/s]
2025-09-19:15:20:21 INFO     [api.model:400] Cached requests: 0, Requests remaining: 2
2025-09-19:15:20:21 INFO     [models.api_models:690] Tokenized requests are disabled. Context + generation length is not checked.
Requesting API: 100% 2/2 [00:02<00:00,  1.45s/it]
fatal: not a git repository (or any of the parent directories): .git
2025-09-19:15:20:24 INFO     [loggers.evaluation_tracker:209] Saving results aggregated
2025-09-19:15:20:24 INFO     [loggers.evaluation_tracker:290] Saving per-sample results for: ifeval
local-chat-completions (base_url=https://integrate.api.nvidia.com/v1/chat/completions,model=nvdev/meta/llama-3.1-8b-instruct,tokenized_requests=False,,tokenizer_backend=None,num_concurrent=10,timeout=30,max_retries=5,stream=False), gen_kwargs: (temperature=1e-07,top_p=0.9999999), limit: 2.0, num_fewshot: None, batch_size: 1
|Tasks |Version|Filter|n-shot|        Metric         |   |Value|   |Stderr|
|------|---

In [15]:
!cat {stat[0]["data"]["output_dir"]}/logs/stdout.log | tail -n 15

          scores:
            prompt_level_strict_acc:
              stats:
                stderr: 0.0
              value: 1.0
target:
  api_endpoint:
    adapter_config: null
    api_key: API_KEY
    model_id: nvdev/meta/llama-3.1-8b-instruct
    stream: null
    type: chat
    url: https://integrate.api.nvidia.com/v1/chat/completions
 [nvidia_eval_commons.core.evaluate]
Container completed successfully


### Running with a custom config

In [16]:
# Load the example config `local_llama_3_1_8b_instruct` and print it as YAML.
# EXAMPLE_FOLDER is the directory passed to `from_hydra` as `config_dir`. It was previously
# defined as "./examples" but never used, while the call hard-coded "../"; the constant now
# carries the value that is actually passed, so the location is stated in one place.
# NOTE(review): the path is presumably resolved relative to the notebook's working directory — confirm.
EXAMPLE_FOLDER = "../"
cfg: RunConfig = RunConfig.from_hydra(
    config_dir=EXAMPLE_FOLDER, config_name="local_llama_3_1_8b_instruct"
)
print(OmegaConf.to_yaml(cfg))

execution:
  type: local
  output_dir: llama_3_1_8b_instruct_results
deployment:
  type: none
target:
  api_endpoint:
    model_id: meta/llama-3.1-8b-instruct
    url: https://integrate.api.nvidia.com/v1/chat/completions
    api_key_name: API_KEY
evaluation:
  overrides:
    config.params.request_timeout: 3600
    target.api_endpoint.adapter_config.use_reasoning: false
    target.api_endpoint.adapter_config.use_system_prompt: true
    target.api_endpoint.adapter_config.custom_system_prompt: '"Think step by step."'
  tasks:
  - name: ifeval
  - name: gpqa_diamond
    overrides:
      config.params.temperature: 0.6
      config.params.top_p: 0.95
      config.params.max_new_tokens: 8192
      config.params.parallelism: 32
    env_vars:
      HF_TOKEN: HF_TOKEN_FOR_GPQA_DIAMOND
  - name: mbpp
    overrides:
      config.params.temperature: 0.2
      config.params.top_p: 0.95
      config.params.max_new_tokens: 2048
      config.params.extra.n_samples: 5
      config.params.parallelism: 32