In [1]:
import os
import json
import requests
import random
from time import sleep, time
from openai import OpenAI
import asyncio

from config import *

In [2]:
# Export the values loaded from `config` as process environment variables.
# NOTE(review): presumably these are read by the "nvidia" Llama Stack client
# created in the next cell — confirm against the distribution's documentation.
os.environ.update(
    {
        # Dataset namespace and project
        "NVIDIA_DATASET_NAMESPACE": NMS_NAMESPACE,
        "NVIDIA_PROJECT_ID": PROJECT_ID,
        # Inference
        "NVIDIA_BASE_URL": NIM_URL,
        # Data Store (points at the Entity Store URL)
        "NVIDIA_DATASETS_URL": ENTITY_STORE_URL,
        # Customizer
        "NVIDIA_CUSTOMIZER_URL": CUSTOMIZER_URL,
        "NVIDIA_OUTPUT_MODEL_DIR": CUSTOMIZED_MODEL_DIR,
        # Evaluator
        "NVIDIA_EVALUATOR_URL": EVALUATOR_URL,
        # Guardrails
        "GUARDRAILS_SERVICE_URL": GUARDRAILS_URL,
    }
)


In [3]:
from llama_stack.core.library_client import LlamaStackAsLibraryClient

# Create the Llama Stack library client from the "nvidia" configuration name.
# NOTE(review): presumably "nvidia" selects the NVIDIA distribution — confirm.
client = LlamaStackAsLibraryClient("nvidia")
# Initialize the client before any other call is made on it.
client.initialize()

OTEL_EXPORTER_OTLP_ENDPOINT is not set, skipping telemetry


In [4]:
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.core.datatypes import Api

async def wait_eval_job(benchmark_id: str, job_id: str, polling_interval: int = 10, timeout: int = 6000):
    """Poll an evaluation job until it leaves the scheduled/in-progress states.

    Args:
        benchmark_id: Benchmark the evaluation job was started for.
        job_id: Identifier of the evaluation job to watch.
        polling_interval: Seconds to sleep between two status requests.
        timeout: Maximum number of seconds to keep waiting while the job is
            still scheduled or in progress.

    Returns:
        The last object returned by the eval provider's ``job_status`` call.
        Its ``status`` is neither scheduled nor in progress; callers must
        check it themselves to tell success from failure.

    Raises:
        RuntimeError: If the job is still scheduled or in progress after more
            than ``timeout`` seconds.
    """
    # Measure elapsed time on the monotonic clock so that wall-clock
    # adjustments cannot shorten or lengthen the timeout.
    from time import monotonic

    pending_states = (JobStatus.scheduled, JobStatus.in_progress)
    start_time = monotonic()

    # Access eval through impls
    eval_impl = client.async_client.impls[Api.eval]

    job_status = await eval_impl.job_status(benchmark_id=benchmark_id, job_id=job_id)

    print(f"Waiting for Evaluation job {job_id} to finish.")
    print(f"Job status: {job_status.status} after {monotonic() - start_time} seconds.")

    while job_status.status in pending_states:
        # Only a job that is still pending can time out. Checking here, rather
        # than right after the poll, means a job that finishes on the poll
        # that crosses the deadline is returned instead of raising.
        if monotonic() - start_time > timeout:
            raise RuntimeError(f"Evaluation Job {job_id} took more than {timeout} seconds.")

        await asyncio.sleep(polling_interval)
        job_status = await eval_impl.job_status(benchmark_id=benchmark_id, job_id=job_id)

        print(f"Job status: {job_status.status} after {monotonic() - start_time} seconds.")

    return job_status

In [5]:
# Echo the resolved configuration so the endpoints used by this run are
# recorded in the notebook output.
print(
    f"Data Store endpoint: {DATA_STORE_URL}",
    f"Entity Store endpoint: {ENTITY_STORE_URL}",
    f"Customizer endpoint: {CUSTOMIZER_URL}",
    f"Evaluator endpoint: {EVALUATOR_URL}",
    f"NIM endpoint: {NIM_URL}",
    f"Namespace: {NMS_NAMESPACE}",
    f"Base Model: {BASE_MODEL}",
    sep="\n",
)

Data Store endpoint: http://nemodatastore-sample.hacohen-nemo.svc.cluster.local:8000
Entity Store endpoint: http://nemoentitystore-sample.hacohen-nemo.svc.cluster.local:8000
Customizer endpoint: http://nemocustomizer-sample.hacohen-nemo.svc.cluster.local:8000
Evaluator endpoint: http://nemoevaluator-sample.hacohen-nemo.svc.cluster.local:8000
NIM endpoint: http://meta-llama3-1b-instruct.hacohen-nemo.svc.cluster.local:8000
Namespace: xlam-tutorial-ns
Base Model: meta/llama-3.2-1b-instruct


In [6]:
# Name of the customized model to evaluate. Later cells look for it verbatim
# in the NIM `/v1/models` listing and, prefixed with `nvidia/`, among the
# model identifiers known to the Llama Stack client.
CUSTOMIZED_MODEL = "nvidia-tool-calling-tutorial/test-llama-stack@v1"

In [7]:
# Collect the identifiers of every model known to the Llama Stack client and
# fail early if the customized model (with its `nvidia/` prefix) is missing.
models = client.models.list()
model_ids = [registered.identifier for registered in models]

if f"nvidia/{CUSTOMIZED_MODEL}" not in model_ids:
    raise AssertionError(f"Model {CUSTOMIZED_MODEL} not registered")

In [8]:
# Ask the NIM endpoint directly which models it serves.
# The timeout keeps the cell from hanging indefinitely on an unreachable
# service, and raise_for_status() reports an HTTP error as such instead of
# letting it surface later as a JSON or assertion failure.
resp = requests.get(f"{NIM_URL}/v1/models", timeout=30)
resp.raise_for_status()

# Note: `models` is rebound here to the list of model dicts returned by NIM.
models = resp.json().get("data", [])
model_names = [model["id"] for model in models]

assert CUSTOMIZED_MODEL in model_names, \
    f"Model {CUSTOMIZED_MODEL} not found"

In [9]:
# Display both listings: model ids served by NIM, then identifiers known to Llama Stack.
model_names, model_ids

(['meta/llama-3.2-1b-instruct',
  'nvidia-tool-calling-tutorial/test-llama-stack@v1'],
 ['nvidia/meta/llama-3.2-1b-instruct',
  'nvidia/nvidia-tool-calling-tutorial/test-llama-stack@v1'])

In [10]:
# Dataset repository id in "<namespace>/<dataset name>" form. It is reused for
# the Entity Store lookup and as the `dataset_id` of the benchmark.
repo_id = f"{NMS_NAMESPACE}/{DATASET_NAME}"
print(repo_id)

xlam-tutorial-ns/xlam-ft-dataset


In [11]:
# Gather the identifiers of the datasets known to the Llama Stack client and
# stop if the dataset used by this notebook is not among them.
datasets = client.datasets.list()
dataset_ids = [registered.identifier for registered in datasets]

if DATASET_NAME not in dataset_ids:
    raise AssertionError(f"Dataset {DATASET_NAME} not registered")

In [12]:
# Display the dataset identifiers returned by the Llama Stack client.
dataset_ids

['xlam-ft-dataset']

In [13]:
# Look the dataset up in the Entity Store. The timeout keeps the cell from
# hanging indefinitely when the service is unreachable.
response = requests.get(url=f"{ENTITY_STORE_URL}/v1/datasets/{repo_id}", timeout=30)
assert response.status_code in (200, 201), f"Status Code {response.status_code} Failed to fetch dataset {response.text}"

# The dataset record carries the location of its files.
print("Files URL:", response.json()["files_url"])

Files URL: hf://datasets/xlam-tutorial-ns/xlam-ft-dataset


In [14]:
# Identifier under which the benchmark is registered; it is also the key used
# later to look up this benchmark's entry in the job results' `scores`.
benchmark_id = "simple-tool-calling-1"

# Evaluation definition passed as the benchmark's `metadata` on registration.
# The task name ("custom-tool-calling") and metric name ("tool-calling-accuracy")
# are the keys used later to index the aggregated results.
simple_tool_calling_eval_config = {
    "type": "custom",
    "tasks": {
        "custom-tool-calling": {
            "type": "chat-completion",
            "dataset": {
                # Test split file inside the dataset repository
                "files_url": f"hf://datasets/{NMS_NAMESPACE}/{DATASET_NAME}/testing/xlam-test-single.jsonl",
                # The recorded results in this notebook report `count: 50` per score
                "limit": 50
            },
            "params": {
                # NOTE(review): the `{{ ... }}` templates presumably render fields
                # of each dataset row (`item`) into the request — confirm.
                "template": {
                    "messages": "{{ item.messages | tojson}}",
                    "tools": "{{ item.tools | tojson }}",
                    "tool_choice": "auto"
                }
            },
            "metrics": {
                "tool-calling-accuracy": {
                    "type": "tool-calling",
                    # Ground truth is taken from the row's `tool_calls` field
                    "params": {"tool_calls_ground_truth": "{{ item.tool_calls | tojson }}"}
                }
            }
        }
    }
}


In [15]:
# Register the benchmark with the Llama Stack client. The evaluation definition
# travels in `metadata`; no scoring functions are passed.
# NOTE(review): presumably scoring is driven by the "metrics" section of the
# config rather than by Llama Stack scoring functions — confirm.
response = client.benchmarks.register(
    benchmark_id=benchmark_id,
    dataset_id=repo_id,
    scoring_functions=[],
    metadata=simple_tool_calling_eval_config
)

In [18]:
import requests
import json

# Register the base model in the Entity Store under the "meta" namespace.
# NOTE(review): presumably needed so the evaluator can resolve BASE_MODEL
# ("meta/llama-3.2-1b-instruct" in the recorded run) — confirm.
model_payload = {
    "namespace": "meta",
    "name": "llama-3.2-1b-instruct",
    "description": "Base Llama 3.2 1B Instruct model",
    "type": "llm",
}

try:
    # The timeout keeps the cell from hanging indefinitely on an unreachable service.
    response = requests.post(
        f"{ENTITY_STORE_URL}/v1/models",
        json=model_payload,
        timeout=30,
    )
    response.raise_for_status()
    print("✓ Registered model in 'meta' namespace!")
    print(json.dumps(response.json(), indent=2))
except requests.HTTPError as e:
    # Best effort: report the rejection and let the notebook continue.
    # NOTE(review): a rejection may simply mean the model was registered by an
    # earlier run — confirm the status code the service returns in that case.
    print(f"Status: {e.response.status_code}")
    print(f"Response: {e.response.text}")


✓ Registered model in 'meta' namespace!
{
  "created_at": "2025-11-06T22:30:23.000366",
  "updated_at": "2025-11-06T22:30:23.000368",
  "name": "llama-3.2-1b-instruct",
  "namespace": "meta",
  "description": "Base Llama 3.2 1B Instruct model",
  "spec": null,
  "artifact": null,
  "base_model": null,
  "api_endpoint": null,
  "peft": null,
  "prompt": null,
  "guardrails": null,
  "schema_version": "1.0",
  "project": null,
  "custom_fields": {},
  "ownership": null
}


In [19]:
# import requests
# import json

# # Delete the existing model (already done, but just in case)
# try:
#     response = requests.delete(f"{ENTITY_STORE_URL}/v1/models/meta/llama-3.2-1b-instruct")
#     print(f"Delete status: {response.status_code}")
# except Exception as e:
#     print(f"Delete error (ok if 404): {e}")

# # Create with the API endpoint - let's see the exact error
# model_payload = {
#     "namespace": "meta",
#     "name": "llama-3.2-1b-instruct",
#     "description": "Base Llama 3.2 1B Instruct model",
#     "type": "llm",
#     "api_endpoint": NIM_URL  # Add the NIM endpoint
# }

# print(f"\nPayload:")
# print(json.dumps(model_payload, indent=2))
# print(f"\nNIM_URL value: {NIM_URL}")

# try:
#     response = requests.post(
#         f"{ENTITY_STORE_URL}/v1/models",
#         json=model_payload
#     )
#     response.raise_for_status()
#     print("✓ Created model with API endpoint!")
#     print(json.dumps(response.json(), indent=2))
# except requests.HTTPError as e:
#     print(f"✗ Status: {e.response.status_code}")
#     print(f"Error details: {e.response.text}")

In [23]:
from llama_stack.core.datatypes import Api
from llama_stack.apis.eval import BenchmarkConfig, EvalCandidate

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Create the benchmark config using proper data types
from llama_stack.apis.eval import ModelCandidate, SamplingParams

benchmark_config = BenchmarkConfig(
    eval_candidate=ModelCandidate(
        type="model",
        model=BASE_MODEL,
        sampling_params=SamplingParams()
    )
)

# Create evaluation job
response = await eval_impl.run_eval(
    benchmark_id=benchmark_id,
    benchmark_config=benchmark_config
)

job_id = response.job_id
print(f"Created evaluation job: {job_id}")



Created evaluation job: eval-5qTD3dDTcRfHKdbow1m7GX


In [24]:
job = await wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)

Waiting for Evaluation job eval-5qTD3dDTcRfHKdbow1m7GX to finish.
Job status: JobStatus.in_progress after 0.009132623672485352 seconds.
Job status: JobStatus.in_progress after 5.022626161575317 seconds.
Job status: JobStatus.in_progress after 10.036325931549072 seconds.
Job status: JobStatus.in_progress after 15.051011562347412 seconds.
Job status: JobStatus.in_progress after 20.0600528717041 seconds.
Job status: JobStatus.completed after 25.07395601272583 seconds.


In [29]:
import requests
import json

# Fetch the raw job record straight from the Evaluator service to inspect its
# status details. The timeout keeps the cell from hanging indefinitely when
# the service is unreachable.
response = requests.get(f"{EVALUATOR_URL}/v1/evaluation/jobs/{job_id}", timeout=30)
response.raise_for_status()
job_details = response.json()

print("Job status:", job_details.get("status"))
print("\nStatus details:")
if "status_details" in job_details:
    print(json.dumps(job_details["status_details"], indent=2))
    
# print("\nFull job details:")
# print(json.dumps(job_details, indent=2))

Job status: completed

Status details:
{
  "message": "Job completed successfully.",
  "task_status": {
    "custom-tool-calling": "completed"
  },
  "progress": 100.0
}


In [34]:
# import requests
# import json

# # List all models in the Entity Store
# response = requests.get(f"{ENTITY_STORE_URL}/v1/models")
# response.raise_for_status()
# models = response.json()

# print("Models in Entity Store:")
# print(json.dumps(models, indent=2))



Available client attributes:
['alpha', 'api_key', 'async_client', 'auth_headers', 'base_url', 'benchmarks', 'chat', 'close', 'completions', 'conversations', 'copy', 'custom_auth', 'datasets', 'default_headers', 'default_query', 'delete', 'embeddings', 'files', 'get', 'get_api_list', 'initialize', 'inspect', 'is_closed', 'loop', 'max_retries', 'models', 'moderations', 'patch', 'platform_headers', 'post', 'provider_data', 'providers', 'put', 'qs', 'request', 'responses', 'routes', 'safety', 'scoring', 'scoring_functions', 'shields', 'synthetic_data_generation', 'telemetry', 'timeout', 'tool_runtime', 'toolgroups', 'tools', 'user_agent', 'vector_io', 'vector_stores', 'with_options', 'with_raw_response', 'with_streaming_response']
Client does not have eval/evaluations/evaluation attributes


In [35]:
from llama_stack.core.datatypes import Api

# Access the eval provider implementation through the async client's impls mapping
eval_impl = client.async_client.impls[Api.eval]

# Retrieve the results of the job currently referenced by `job_id` (the
# base-model run when the cells execute in order) and pretty-print them as JSON
job_results = await eval_impl.job_result(benchmark_id=benchmark_id, job_id=job_id)
print(f"Job results: {json.dumps(job_results.model_dump(), indent=2)}")

Job results: {
  "generations": [],
  "scores": {
    "simple-tool-calling-1": {
      "score_rows": [],
      "aggregated_results": {
        "created_at": "2025-11-06T22:33:37.013669",
        "updated_at": "2025-11-06T22:33:37.013670",
        "id": "evaluation_result-25WgaVDdJxCiniWZkT6JHY",
        "job": "eval-5qTD3dDTcRfHKdbow1m7GX",
        "tasks": {
          "custom-tool-calling": {
            "metrics": {
              "tool-calling-accuracy": {
                "scores": {
                  "function_name_accuracy": {
                    "value": 0.02,
                    "stats": {
                      "count": 50,
                      "sum": 1.0,
                      "mean": 0.02
                    }
                  },
                  "function_name_and_args_accuracy": {
                    "value": 0.0,
                    "stats": {
                      "count": 50,
                      "sum": 0.0,
                      "mean": 0.0
                    }
     

In [36]:
# Pull the two tool-calling accuracy values for the base model out of the
# aggregated results of this benchmark; both live under the same metric path.
aggregated_results = job_results.scores[benchmark_id].aggregated_results
base_function_name_accuracy_score, base_function_name_and_args_accuracy = (
    aggregated_results["tasks"]["custom-tool-calling"]["metrics"]["tool-calling-accuracy"]["scores"][score_name]["value"]
    for score_name in ("function_name_accuracy", "function_name_and_args_accuracy")
)

print(f"Base model: function_name_accuracy: {base_function_name_accuracy_score}")
print(f"Base model: function_name_and_args_accuracy: {base_function_name_and_args_accuracy}")

Base model: function_name_accuracy: 0.02
Base model: function_name_and_args_accuracy: 0.0


In [37]:
from llama_stack.core.datatypes import Api
from llama_stack.apis.eval import BenchmarkConfig, EvalCandidate

# Access eval through impls
eval_impl = client.async_client.impls[Api.eval]

# Create the benchmark config using proper data types
from llama_stack.apis.eval import ModelCandidate, SamplingParams

benchmark_config = BenchmarkConfig(
    eval_candidate=ModelCandidate(
        type="model",
        model=CUSTOMIZED_MODEL,
        sampling_params=SamplingParams()
    )
)

# Create evaluation job
response = await eval_impl.run_eval(
    benchmark_id=benchmark_id,
    benchmark_config=benchmark_config
)

job_id = response.job_id
print(f"Created evaluation job: {job_id}")

Created evaluation job: eval-BuCn79PVV4nus5o5icY8Wz


In [38]:
job = await wait_eval_job(benchmark_id=benchmark_id, job_id=job_id, polling_interval=5, timeout=600)

Waiting for Evaluation job eval-BuCn79PVV4nus5o5icY8Wz to finish.
Job status: JobStatus.in_progress after 0.04833269119262695 seconds.
Job status: JobStatus.in_progress after 5.064541578292847 seconds.
Job status: JobStatus.in_progress after 10.078007936477661 seconds.
Job status: JobStatus.completed after 15.09185266494751 seconds.


In [39]:
from llama_stack.core.datatypes import Api

# Access the eval provider implementation through the async client's impls mapping
eval_impl = client.async_client.impls[Api.eval]

# Retrieve the results of the job currently referenced by `job_id` (the
# customized-model run when the cells execute in order) and pretty-print them
# as JSON. This rebinds `job_results`, replacing the base-model results.
job_results = await eval_impl.job_result(benchmark_id=benchmark_id, job_id=job_id)
print(f"Job results: {json.dumps(job_results.model_dump(), indent=2)}")

Job results: {
  "generations": [],
  "scores": {
    "simple-tool-calling-1": {
      "score_rows": [],
      "aggregated_results": {
        "created_at": "2025-11-06T22:41:32.282836",
        "updated_at": "2025-11-06T22:41:32.282837",
        "id": "evaluation_result-GojEP63Z9tWMtYHJZJq2vm",
        "job": "eval-BuCn79PVV4nus5o5icY8Wz",
        "tasks": {
          "custom-tool-calling": {
            "metrics": {
              "tool-calling-accuracy": {
                "scores": {
                  "function_name_accuracy": {
                    "value": 0.9,
                    "stats": {
                      "count": 50,
                      "sum": 45.0,
                      "mean": 0.9
                    }
                  },
                  "function_name_and_args_accuracy": {
                    "value": 0.68,
                    "stats": {
                      "count": 50,
                      "sum": 34.0,
                      "mean": 0.68
                    }
   

In [42]:
# Pull the two tool-calling accuracy values for the customized model out of the
# aggregated results of this benchmark; both live under the same metric path.
aggregated_results_custom = job_results.scores[benchmark_id].aggregated_results
custom_function_name_accuracy_score, custom_function_name_and_args_accuracy = (
    aggregated_results_custom["tasks"]["custom-tool-calling"]["metrics"]["tool-calling-accuracy"]["scores"][score_name]["value"]
    for score_name in ("function_name_accuracy", "function_name_and_args_accuracy")
)

print(f"Custom model: function_name_accuracy: {custom_function_name_accuracy_score}")
print(f"Custom model: function_name_and_args_accuracy: {custom_function_name_and_args_accuracy}")

Custom model: function_name_accuracy: 0.9
Custom model: function_name_and_args_accuracy: 0.68
