This is a very raw intro to using the `nemo_evaluator_launcher` API.

Prerequisites:
* Make sure the variables below are set (note that `API_KEY_NAME` is the _name_ of the key, not its value).

In [1]:
import tempfile

from omegaconf import OmegaConf

from nemo_evaluator_launcher.api import RunConfig, get_status, get_tasks_list, run_eval

# Endpoint URL; passed as `target.api_endpoint.url` when the run config is built below.
MODEL_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
# Model identifier; passed as `target.api_endpoint.model_id`.
MODEL_ID = "nvdev/meta/llama-3.1-8b-instruct"
# Name of the API key, NOT the key value itself; passed as `target.api_endpoint.api_key_name`.
# NOTE(review): presumably the name of an environment variable holding the key — confirm.
API_KEY_NAME = "NGC_API_TOKEN_NV_DEV"

In [2]:
# Peek at the first two entries returned by `get_tasks_list()`.
# Judging by the output, each entry looks like
# [task name, endpoint type, harness, container image] — confirm against the API docs.
tasks_preview = get_tasks_list()[:2]
tasks_preview

[['AA_AIME_2024',
  'chat',
  'simple-evals',
  'nvcr.io/nvidia/eval-factory/simple-evals:25.08'],
 ['AA_math_test_500',
  'chat',
  'simple-evals',
  'nvcr.io/nvidia/eval-factory/simple-evals:25.08']]

In [3]:
# Confirm that a task whose first field is "ifeval" is present in the task list.
ifeval_entries = [entry for entry in get_tasks_list() if entry[0] == "ifeval"]
print(ifeval_entries)

[['ifeval', 'chat', 'lm-evaluation-harness', 'nvcr.io/nvidia/eval-factory/lm-evaluation-harness:25.08']]


In [4]:
# Build a RunConfig with no overrides and dump it as YAML to see the defaults.
# Fields shown as `???` in the dump have no value yet.
cfg: RunConfig = RunConfig.from_hydra()
default_yaml = OmegaConf.to_yaml(cfg)
print(default_yaml)

execution:
  type: local
  output_dir: ???
deployment:
  type: none
target:
  api_endpoint:
    url: ???
    model_id: ???
    api_key_name: <YOUR_API_KEY_NAME>
evaluation: []



In [5]:
# Example of selecting another deployment type through a Hydra-style override.
# It is strongly recommended to check the default composable configurations under
# `nemo_evaluator_launcher.configs`.
vllm_overrides = ["deployment=vllm"]  # for the override syntax, consult the hydra.cc documentation
cfg: RunConfig = RunConfig.from_hydra(hydra_overrides=vllm_overrides)
vllm_yaml = OmegaConf.to_yaml(cfg)
print(vllm_yaml)

execution:
  type: local
  output_dir: ???
deployment:
  type: vllm
  image: vllm/vllm-openai:latest
  checkpoint_path: ???
  served_model_name: ???
  port: 8000
  tensor_parallel_size: 8
  pipeline_parallel_size: 1
  data_parallel_size: 1
  extra_args: ''
  endpoints:
    chat: /v1/chat/completions
    completions: /v1/completions
    health: /health
  command: vllm serve /checkpoint --tensor-parallel-size=${deployment.tensor_parallel_size}
    --pipeline-parallel-size=${deployment.pipeline_parallel_size} --data-parallel-size=${deployment.data_parallel_size}
    --port ${deployment.port} --trust-remote-code --served-model-name ${deployment.served_model_name}
    --enforce-eager --gpu-memory-utilization 0.95 ${deployment.extra_args}
target:
  api_endpoint:
    url: ???
    model_id: ???
    api_key_name: <YOUR_API_KEY_NAME>
evaluation: []



In [6]:
# Start from the default config and override the parameters needed for a real run.

# Fresh temporary directory used as `execution.output_dir`.
output_dir = tempfile.mkdtemp()

# Endpoint settings, taken from the constants defined at the top of the notebook.
endpoint_overrides = {
    "url": MODEL_URL,
    "model_id": MODEL_ID,
    "api_key_name": API_KEY_NAME,
}

# One evaluation task: `ifeval` with `config.params.limit_samples` set to 2
# (presumably to keep the demo run short — confirm).
ifeval_task = {"name": "ifeval", "overrides": {"config.params.limit_samples": 2}}

cfg = RunConfig.from_hydra(
    # hydra-style overrides will mainly be used via the CLI; shown here just for demonstration
    hydra_overrides=[f"execution.output_dir={output_dir}"],
    dict_overrides={
        "target": {"api_endpoint": endpoint_overrides},
        "evaluation": [ifeval_task],
    },
)
print(OmegaConf.to_yaml(cfg))

execution:
  type: local
  output_dir: /tmp/tmps3zkzzul
deployment:
  type: none
target:
  api_endpoint:
    url: https://integrate.api.nvidia.com/v1/chat/completions
    model_id: nvdev/meta/llama-3.1-8b-instruct
    api_key_name: NGC_API_TOKEN_NV_DEV
evaluation:
- name: ifeval
  overrides:
    config.params.limit_samples: 2



In [7]:
# Launch the evaluation described by `cfg`.
# The returned invocation id is what `get_status` is queried with in the next cell.
invocation_id = run_eval(cfg=cfg)

In [17]:
# Query the status of the invocation started above.
# The result is a list with one dict per job; the following cells read
# stat[0]["data"]["output_dir"] to locate the run's files.
# NOTE(review): those cells presumably should only be run once the status is final — confirm.
stat = get_status(job_ids=[invocation_id])
stat

[{'invocation': '722d6f9e',
  'job_id': '722d6f9e.0',
  'status': 'success',
  'data': {'output_dir': '/tmp/tmps3zkzzul/ifeval'}}]

In [18]:
# List the top level of the job's output directory reported by `get_status`.
!ls {stat[0]["data"]["output_dir"]}

artifacts  logs


In [19]:
# List the files in the `logs` subdirectory of the job's output directory.
!ls {stat[0]["data"]["output_dir"]}/logs

stage.exit  stage.pre-start  stage.running  stdout.log


In [20]:
!cat {stat[0]["data"]["output_dir"]}/logs/stdout.log | tail -n 15

              value: 1.0
        prompt_level_strict_acc:
          scores:
            prompt_level_strict_acc:
              stats:
                stderr: 0.0
              value: 1.0
target:
  api_endpoint:
    api_key: API_KEY
    model_id: nvdev/meta/llama-3.1-8b-instruct
    type: chat
    url: https://integrate.api.nvidia.com/v1/chat/completions

Container completed successfully


### Running with a custom config

In [21]:
# Load a ready-made example config by name instead of assembling one by hand.
# NOTE(review): EXAMPLE_FOLDER is assigned but not used in this cell, and its value
# ("./examples") differs from the `config_dir` actually passed ("../") — confirm which is intended.
EXAMPLE_FOLDER = "./examples"
cfg: RunConfig = RunConfig.from_hydra(
    config_name="local_llama_3_1_8b_instruct",
    config_dir="../",
)
example_yaml = OmegaConf.to_yaml(cfg)
print(example_yaml)

execution:
  type: local
  output_dir: llama_3_1_8b_instruct_results
  env_var_names:
  - HF_TOKEN
deployment:
  type: none
target:
  api_endpoint:
    model_id: meta/llama-3.1-8b-instruct
    url: https://integrate.api.nvidia.com/v1/chat/completions
    api_key_name: API_KEY
evaluation:
- name: ifeval
- name: gpqa_diamond
  overrides:
    config.params.temperature: 0.6
    config.params.top_p: 0.95
    config.params.max_new_tokens: 8192
    config.params.parallelism: 32
    target.api_endpoint.adapter_config.use_reasoning: false
    target.api_endpoint.adapter_config.use_system_prompt: true
    target.api_endpoint.adapter_config.custom_system_prompt: '''Think step by step.'''
- name: mbpp

