In [1]:
import os
import json
import requests
import random
from time import sleep, time
from openai import OpenAI
import asyncio, nest_asyncio

from config import *

In [2]:
# API Key for NVIDIA provider (required even for self-hosted services)
os.environ["NVIDIA_API_KEY"] = NDS_TOKEN

# Metadata associated with Datasets and Customization Jobs
os.environ["NVIDIA_DATASET_NAMESPACE"] = NMS_NAMESPACE
os.environ["NVIDIA_PROJECT_ID"] = PROJECT_ID

## Inference env vars
os.environ["NVIDIA_BASE_URL"] = NIM_URL

# Data Store env vars
os.environ["NVIDIA_DATASETS_URL"] = ENTITY_STORE_URL

## Customizer env vars
os.environ["NVIDIA_CUSTOMIZER_URL"] = CUSTOMIZER_URL
os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = CUSTOMIZED_MODEL_DIR

# Evaluator env vars
os.environ["NVIDIA_EVALUATOR_URL"] = EVALUATOR_URL

# Guardrails env vars
os.environ["GUARDRAILS_SERVICE_URL"] = GUARDRAILS_URL


In [3]:
print(f"Data Store endpoint: {DATA_STORE_URL}")
print(f"Entity Store endpoint: {ENTITY_STORE_URL}")
print(f"Customizer endpoint: {CUSTOMIZER_URL}")
print(f"Evaluator endpoint: {EVALUATOR_URL}")
print(f"NIM endpoint: {NIM_URL}")
print(f"Namespace: {NMS_NAMESPACE}")
print(f"Base Model for Customization: {BASE_MODEL}")

Data Store endpoint: http://nemodatastore-sample.hacohen-nemo.svc.cluster.local:8000
Entity Store endpoint: http://nemoentitystore-sample.hacohen-nemo.svc.cluster.local:8000
Customizer endpoint: http://nemocustomizer-sample.hacohen-nemo.svc.cluster.local:8000
Evaluator endpoint: http://nemoevaluator-sample.hacohen-nemo.svc.cluster.local:8000
NIM endpoint: http://meta-llama3-1b-instruct.hacohen-nemo.svc.cluster.local:8000
Namespace: xlam-tutorial-ns
Base Model for Customization: meta/llama-3.2-1b-instruct


In [4]:
from llama_stack.core.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()

OTEL_EXPORTER_OTLP_ENDPOINT is not set, skipping telemetry


In [5]:
from llama_stack.apis.common.job_types import JobStatus
from llama_stack.core.datatypes import Api
import asyncio

def wait_customization_job(job_id: str, polling_interval: int = 30, timeout: int = 5500):
    start_time = time.time()
    
    # Access post_training through impls
    post_training = client.async_client.impls[Api.post_training]
    
    # Get initial status using async
    loop = asyncio.get_event_loop()
    res = loop.run_until_complete(post_training.get_training_job_status(job_uuid=job_id))
    job_status = res.status

    print(f"Waiting for Customization job {job_id} to finish.")
    print(f"Job status: {job_status} after {time.time() - start_time} seconds.")

    while job_status in [JobStatus.scheduled.value, JobStatus.in_progress.value]:
        sleep(polling_interval)
        res = loop.run_until_complete(post_training.get_training_job_status(job_uuid=job_id))
        job_status = res.status

        print(f"Job status: {job_status} after {time.time() - start_time} seconds.")

        if time.time() - start_time > timeout:
            raise RuntimeError(f"Customization Job {job_id} took more than {timeout} seconds.")

    return job_status


# When creating a customized model, NIM asynchronously loads the model in its model registry.
# After this, we can run inference with the new model. This helper function waits for NIM to pick up the new model.
def wait_nim_loads_customized_model(model_id: str, polling_interval: int = 10, timeout: int = 300):
    found = False
    start_time = time.time()

    print(f"Checking if NIM has loaded customized model {model_id}.")

    while not found:
        sleep(polling_interval)

        res = requests.get(f"{NIM_URL}/v1/models")
        if model_id in [model["id"] for model in res.json()["data"]]:
            found = True
            print(f"Model {model_id} available after {time.time() - start_time} seconds.")
            break
        else:
            print(f"Model {model_id} not available after {time.time() - start_time} seconds.")

    if not found:
        raise RuntimeError(f"Model {model_id} not available after {timeout} seconds.")

    assert found, f"Could not find model {model_id} in the list of available models."


In [6]:
repo_id = f"{NMS_NAMESPACE}/{DATASET_NAME}"
print(repo_id)

xlam-tutorial-ns/xlam-ft-dataset


In [7]:
response = client.datasets.register(
    purpose="post-training/messages",
    dataset_id=DATASET_NAME,
    source={
        "type": "uri",
        "uri": f"hf://datasets/{repo_id}"
    },
    metadata={
        "format": "json",
        "description": "Tool calling xLAM dataset in OpenAI ChatCompletions format",
        "provider_id": "nvidia"
    }
)
print(response)

DatasetRegisterResponse(identifier='xlam-ft-dataset', metadata={'format': 'json', 'description': 'Tool calling xLAM dataset in OpenAI ChatCompletions format', 'provider_id': 'nvidia'}, provider_id='nvidia', purpose='post-training/messages', source=SourceUriDataSource(type='uri', uri='hf://datasets/xlam-tutorial-ns/xlam-ft-dataset'), type='dataset', provider_resource_id='xlam-ft-dataset', owner=None)


In [8]:
res = requests.get(url=f"{ENTITY_STORE_URL}/v1/datasets/{NMS_NAMESPACE}/{DATASET_NAME}")
assert res.status_code in (200, 201), f"Status Code {res.status_code} Failed to fetch dataset {res.text}"
dataset_obj = res.json()

In [9]:
print("Files URL:", dataset_obj["files_url"])
assert dataset_obj["files_url"] == f"hf://datasets/{repo_id}"

Files URL: hf://datasets/xlam-tutorial-ns/xlam-ft-dataset


In [10]:
from llama_stack.apis.post_training import LoraFinetuningConfig
import inspect

print("LoraFinetuningConfig signature:")
print(inspect.signature(LoraFinetuningConfig))

if hasattr(LoraFinetuningConfig, 'model_fields'):
    print("\nLoraFinetuningConfig fields:")
    for field_name, field_info in LoraFinetuningConfig.model_fields.items():
        required = field_info.is_required()
        print(f"  {field_name}: required={required}, default={field_info.default}")


LoraFinetuningConfig signature:
(*, type: Literal['LoRA'] = 'LoRA', lora_attn_modules: list[str], apply_lora_to_mlp: bool, apply_lora_to_output: bool, rank: int, alpha: int, use_dora: bool | None = False, quantize_base: bool | None = False) -> None

LoraFinetuningConfig fields:
  type: required=False, default=LoRA
  lora_attn_modules: required=True, default=PydanticUndefined
  apply_lora_to_mlp: required=True, default=PydanticUndefined
  apply_lora_to_output: required=True, default=PydanticUndefined
  rank: required=True, default=PydanticUndefined
  alpha: required=True, default=PydanticUndefined
  use_dora: required=False, default=False
  quantize_base: required=False, default=False


In [11]:
import time
from llama_stack.core.datatypes import Api
from llama_stack.apis.post_training import (
    TrainingConfig,
    DataConfig,
    OptimizerConfig,
    LoraFinetuningConfig,
    DatasetFormat,
    OptimizerType,
)

unique_suffix = int(time.time())

# Access post_training through impls
post_training = client.async_client.impls[Api.post_training]

# Create proper config objects with all required fields
data_config = DataConfig(
    batch_size=16,
    dataset_id=DATASET_NAME,
    shuffle=True,
    data_format=DatasetFormat.instruct
)

optimizer_config = OptimizerConfig(
    optimizer_type=OptimizerType.adamw,
    lr=0.0001,
    weight_decay=0.01,
    num_warmup_steps=100
)

training_config = TrainingConfig(
    n_epochs=2,
    data_config=data_config,
    optimizer_config=optimizer_config
)

# LoRA configuration with correct fields
algorithm_config = LoraFinetuningConfig(
    lora_attn_modules=[],
    apply_lora_to_mlp=True,
    apply_lora_to_output=False,
    rank=8,
    alpha=16,
    use_dora=False,
    quantize_base=False
)

# Convert to dict to work around the bug
training_config_dict = training_config.model_dump()

# Now call the supervised_fine_tune method with dict
res = await post_training.supervised_fine_tune(
    job_uuid=f"finetune-{unique_suffix}",
    model="meta/llama-3.2-1b-instruct@v1.0.0+A100",
    training_config=training_config_dict,  # Pass as dict
    algorithm_config=algorithm_config,
    hyperparam_search_config=None,
    logger_config=None,
    checkpoint_dir="",
)
print(res)

  warn_unsupported_params(training_config, supported_params["training_config"], "TrainingConfig")
  warn_unsupported_params(training_config["data_config"], supported_params["data_config"], "DataConfig")
  warn_unsupported_params(
  warn_unsupported_params(algorithm_config, supported_params["lora_config"], "LoRA config")


job_uuid='cust-A9bozbaPwTSBXzFCjYVs3' status=<JobStatus.in_progress: 'in_progress'> created_at=datetime.datetime(2025, 11, 6, 21, 58, 14, 363803) updated_at=datetime.datetime(2025, 11, 6, 21, 58, 14, 363806) id='cust-A9bozbaPwTSBXzFCjYVs3' namespace='default' project='test-project' dataset='xlam-tutorial-ns/xlam-ft-dataset' output_model='nvidia-tool-calling-tutorial/test-llama-stack@v1' config='meta/llama-3.2-1b-instruct@v1.0.0+A100' hyperparameters={'finetuning_type': 'lora', 'training_type': 'sft', 'batch_size': 16, 'epochs': 2, 'learning_rate': 0.0001, 'weight_decay': 0.01, 'lora': {'adapter_dim': 8, 'alpha': 16, 'adapter_dropout': None, 'target_modules': None}, 'sequence_packing_enabled': False} status_details={'created_at': '2025-11-06T21:58:15.222066', 'updated_at': '2025-11-06T21:58:15.222066', 'elapsed_time': 0.0, 'steps_completed': 0, 'epochs_completed': 0, 'percentage_done': 0.0, 'status_logs': [{'updated_at': '2025-11-06T21:58:15.222066', 'message': 'created'}]} config_snaps

In [12]:
job = res.model_dump()

# To job track status
JOB_ID = job["id"]

# This will be the name of the model that will be used to send inference queries to
CUSTOMIZED_MODEL = job["output_model"]
print(JOB_ID)
print(CUSTOMIZED_MODEL)

cust-A9bozbaPwTSBXzFCjYVs3
nvidia-tool-calling-tutorial/test-llama-stack@v1


In [13]:
job_status = wait_customization_job(job_id=JOB_ID)

Waiting for Customization job cust-A9bozbaPwTSBXzFCjYVs3 to finish.
Job status: JobStatus.scheduled after 0.017461538314819336 seconds.


In [14]:
response = requests.get(f"{ENTITY_STORE_URL}/v1/models", params={"filter[namespace]": NMS_NAMESPACE, "sort" : "-created_at"})

assert response.status_code == 200, f"Status Code {response.status_code}: Request failed. Response: {response.text}"
print("Response JSON:", json.dumps(response.json(), indent=4))

Response JSON: {
    "object": "list",
    "data": [],
    "pagination": {
        "page": 1,
        "page_size": 1000,
        "current_page_size": 0,
        "total_pages": 0,
        "total_results": 0
    },
    "sort": "-created_at",
    "filter": {
        "namespace": "xlam-tutorial-ns"
    }
}


In [15]:
wait_nim_loads_customized_model(model_id=CUSTOMIZED_MODEL)

Checking if NIM has loaded customized model nvidia-tool-calling-tutorial/test-llama-stack@v1.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 10.008836269378662 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 20.014547109603882 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 30.019490718841553 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 40.024625301361084 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 50.029613733291626 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 60.03464388847351 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 70.04071593284607 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 80.04659008979797 seconds.
Model nvidia-tool-calling-tutorial/test-llama-stack@v1 not available after 90.0517487

In [16]:
# Verify the model is in NIM
resp = requests.get(f"{NIM_URL}/v1/models")
models = resp.json().get("data", [])
model_names = [model["id"] for model in models]

print("Available models in NIM:")
for name in model_names:
    print(f"  - {name}")

assert CUSTOMIZED_MODEL in model_names, f"Model {CUSTOMIZED_MODEL} not found"

Available models in NIM:
  - meta/llama-3.2-1b-instruct
  - nvidia-tool-calling-tutorial/test-llama-stack@v1


In [17]:
from llama_stack.core.datatypes import Api

# Get the NVIDIA inference provider directly
inference_router = client.async_client.impls[Api.inference]
nvidia_provider = inference_router.routing_table.impls_by_provider_id.get("nvidia")

if nvidia_provider:
    # Get fresh list of models from NVIDIA provider
    models_from_provider = await nvidia_provider.list_models()
    print("Models from NVIDIA provider:")
    for model in models_from_provider:
        print(f"  - {model.provider_resource_id}")
    
    # Now update the routing table with these models
    models_routing_table = client.async_client.impls[Api.models]
    await models_routing_table.update_registered_models(
        provider_id="nvidia",
        models=models_from_provider
    )
    print("\nRouting table updated!")
    
    # # Now try to register the model
    # from llama_stack.apis.models import ModelType
    # result = await models_routing_table.register_model(
    #     model_id=CUSTOMIZED_MODEL,
    #     model_type=ModelType.llm,
    #     provider_id="nvidia",
    #     provider_model_id=CUSTOMIZED_MODEL,
    # )
    # print("\nRegistration successful!")
    # print(result)
else:
    print("NVIDIA provider not found")


Models from NVIDIA provider:
  - meta/llama-3.2-1b-instruct
  - nvidia-tool-calling-tutorial/test-llama-stack@v1

Routing table updated!


In [18]:
# from llama_stack.apis.models.models import ModelType

# client.models.register(
#     model_id=CUSTOMIZED_MODEL,
#     model_type=ModelType.llm,
#     provider_id="nvidia",
# )

# Processed data will be stored here
DATA_ROOT = os.path.join(os.getcwd(), "sample_data")
CUSTOMIZATION_DATA_ROOT = os.path.join(DATA_ROOT, "customization")
VALIDATION_DATA_ROOT = os.path.join(DATA_ROOT, "validation")
EVALUATION_DATA_ROOT = os.path.join(DATA_ROOT, "evaluation")

os.makedirs(DATA_ROOT, exist_ok=True)
os.makedirs(CUSTOMIZATION_DATA_ROOT, exist_ok=True)
os.makedirs(VALIDATION_DATA_ROOT, exist_ok=True)
os.makedirs(EVALUATION_DATA_ROOT, exist_ok=True)

In [19]:
train_fp = f"{CUSTOMIZATION_DATA_ROOT}/training.jsonl"
assert os.path.exists(train_fp), f"The training data at '{train_fp}' does not exist. Please ensure that the data was prepared successfully."

val_fp = f"{VALIDATION_DATA_ROOT}/validation.jsonl"
assert os.path.exists(val_fp), f"The validation data at '{val_fp}' does not exist. Please ensure that the data was prepared successfully."

test_fp = f"{EVALUATION_DATA_ROOT}/xlam-test-single.jsonl"
assert os.path.exists(test_fp), f"The test data at '{test_fp}' does not exist. Please ensure that the data was prepared successfully."

In [20]:
def read_jsonl(file_path):
    """Reads a JSON Lines file and yields parsed JSON objects"""
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace
            if not line:
                continue  # Skip empty lines
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue


test_data = list(read_jsonl(test_fp))

print(f"There are {len(test_data)} examples in the test set")

There are 713 examples in the test set


In [21]:
 # Randomly choose
test_sample = random.choice(test_data)

# Transform tools to format expected by Llama Stack client
for i, tool in enumerate(test_sample['tools']):
    # Extract properties we will map to the expected format
    tool = tool.get('function', {})
    tool_name = tool.get('name')
    tool_description = tool.get('description')
    tool_params = tool.get('parameters', {})
    tool_params_properties = tool_params.get('properties', {})

    # Create object of parameters for this tool
    transformed_parameters = {}
    for name, property in tool_params_properties.items():
        transformed_param = {
            'param_type': property.get('type'),
            'description': property.get('description')
        }
        if 'default' in property:
            transformed_param['default'] = property['default']
        if 'required' in property:
            transformed_param['required'] = property['required']

        transformed_parameters[name] = transformed_param

    # Update this tool in-place using the expected format
    test_sample['tools'][i] = {
        'tool_name': tool_name,
        'description': tool_description,
        'parameters': transformed_parameters
    }

# Visualize the inputs to the LLM - user query and available tools
test_sample['messages']
test_sample['tools']


[{'tool_name': 'getpeople',
  'description': 'Fetches a list of artificial intelligence influencers, entrepreneurs, and top researchers from the specified API endpoint.',
  'parameters': {'page': {'param_type': 'integer',
    'description': 'The page number to retrieve.',
    'default': '1'}}},
 {'tool_name': 'movies_get_opening',
  'description': 'Fetches the list of opening movies for a given country using the Flixster API.',
  'parameters': {'countryid': {'param_type': 'string',
    'description': "The country code for which to fetch opening movies. Defaults to 'usa'. Examples include 'afg', 'alb', 'dza', etc.",
    'default': 'usa'}}},
 {'tool_name': 'overview',
  'description': 'Fetches summary statistics from the Papercliff API regarding the number of keywords found, and the number of articles and agencies reviewed.',
  'parameters': {'is_from': {'param_type': 'string',
    'description': "Narrows down the results to articles published after the provided date-time. The format sho

In [22]:
from llama_stack.core.datatypes import Api

# Use the registered model ID
REGISTERED_MODEL_ID = "nvidia/nvidia-tool-calling-tutorial/test-llama-stack@v1"

# Transform tools back to OpenAI format
openai_tools = []
for tool in test_sample['tools']:
    # Check if it's already in OpenAI format (has 'function' key)
    if 'function' in tool:
        openai_tools.append(tool)
    else:
        # Convert from Llama Stack format to OpenAI format
        openai_tool = {
            "type": "function",
            "function": {
                "name": tool.get('tool_name'),
                "description": tool.get('description'),
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "required": []
                }
            }
        }
        # Convert parameters
        for param_name, param_info in tool.get('parameters', {}).items():
            openai_tool["function"]["parameters"]["properties"][param_name] = {
                "type": param_info.get('param_type'),
                "description": param_info.get('description', '')
            }
            if param_info.get('default') is not None:
                openai_tool["function"]["parameters"]["properties"][param_name]["default"] = param_info['default']
            if param_info.get('required', False):
                openai_tool["function"]["parameters"]["required"].append(param_name)
        
        openai_tools.append(openai_tool)

print("OpenAI formatted tools:")
print(openai_tools[0] if openai_tools else "No tools")

OpenAI formatted tools:
{'type': 'function', 'function': {'name': 'getpeople', 'description': 'Fetches a list of artificial intelligence influencers, entrepreneurs, and top researchers from the specified API endpoint.', 'parameters': {'type': 'object', 'properties': {'page': {'type': 'integer', 'description': 'The page number to retrieve.', 'default': '1'}}, 'required': []}}}


In [23]:
from llama_stack.core.datatypes import Api
from llama_stack.apis.inference import OpenAIChatCompletionRequestWithExtraBody

# Access inference through impls
inference = client.async_client.impls[Api.inference]

# Create request with OpenAI-formatted tools
request = OpenAIChatCompletionRequestWithExtraBody(
    model=REGISTERED_MODEL_ID,
    messages=test_sample["messages"],
    tools=openai_tools,  # Use the converted tools
    tool_choice="auto",
    stream=False,
    max_tokens=512,
    temperature=0.1,
    top_p=0.7,
)

# Make the chat completion call
completion = await inference.openai_chat_completion(params=request)

print("Tool calls from model:")
if hasattr(completion, 'choices') and len(completion.choices) > 0:
    print(completion.choices[0].message.tool_calls)
else:
    print(completion)


Tool calls from model:
[ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-13119f0fdcf649e0bf0a73499a42982a', function=Function(arguments='{"page": 2}', name='getpeople'), type='function')]


In [24]:
test_sample['tool_calls']

[{'type': 'function',
  'function': {'name': 'getpeople', 'arguments': {'page': 2}}}]

In [25]:
print(f"Name of your custom model is: {CUSTOMIZED_MODEL}")

Name of your custom model is: nvidia-tool-calling-tutorial/test-llama-stack@v1
