In [1]:
from typing import TYPE_CHECKING


# Type-checker-only section: TYPE_CHECKING is False at runtime, so nothing in
# this block executes and it does not create the container objects itself.
if TYPE_CHECKING:
    from math_rag.application.containers import ApplicationContainer
    from math_rag.infrastructure.containers import InfrastructureContainer

    # Bare annotations (no assignment) so static tooling knows the types of the
    # container names that later cells use.
    application_container: ApplicationContainer
    infrastructure_container: InfrastructureContainer

In [2]:
# NOTE(review): RESET is not read anywhere else in this notebook; presumably the
# hook loaded below consumes it — confirm.
RESET = False
# Load the project's IPython extension. Later cells use `infrastructure_container`
# without defining it, so it is presumably injected by this hook — confirm.
%load_ext hooks.notebook_hook

2025-06-26 20:25:24,449 - INFO - datasets - config.py:54 - PyTorch version 2.6.0 available.


In [3]:
# Clients obtained from the infrastructure container:
# - hpc_client is used below for gpu_statistics()
# - pbs_pro_client is used below for queue_status_full(job_id)
hpc_client = infrastructure_container.hpc_client()
pbs_pro_client = infrastructure_container.pbs_pro_client()

In [4]:
# Fetch one snapshot of GPU usage; per the log output below, this runs `gpustat`
# over SSH and parses its stdout.
gpu_stats = await hpc_client.gpu_statistics()

2025-06-26 20:25:29,823 - INFO - math_rag.infrastructure.clients.ssh_client - ssh_client.py:53 - Command `gpustat | awk 'NR>=3 {print $1"_"$2"_"$3"_"$4"_"$5"_"$6"_"$7}'` in `run` returned stdout: 561561_x8000c2s4b0n1_GPU-0_0_%_5_MiB
x8000c2s4b0n1_GPU-1_0_%_5_MiB_


In [5]:
# Print every job id, followed by its indented per-GPU entries.
for job_entry in gpu_stats.entries:
    print(f'job_id={job_entry.job_id}')

    for gpu_entry in job_entry.sub_entries:
        print(f'    {gpu_entry}')

job_id=561561
    node='x8000c2s4b0n1' gpu='GPU-0' used_percent=0 mem_used=5242880
    node='x8000c2s4b0n1' gpu='GPU-1' used_percent=0 mem_used=5242880


In [6]:
# Clients used by the cleanup cell at the end of the notebook:
# - pushgateway_client.delete_job(...) removes a pushed job from the Pushgateway
# - prometheus_hpc_admin_client.delete_series(...) removes stored series
pushgateway_client = infrastructure_container.pushgateway_client()
prometheus_hpc_admin_client = infrastructure_container.prometheus_hpc_admin_client()

# Base URL handed to prometheus_client.push_to_gateway in the push loops.
# NOTE(review): `.provided()` presumably resolves the configured value — confirm.
pushgateway_base_url = infrastructure_container.config.pushgateway.base_url.provided()

In [7]:
# Pushgateway job name that the cleanup cell passes to pushgateway_client.delete_job.
PUSHGATEWAY_JOB = 'pbs_monitor'
# PBS job to monitor: queried via pbs_pro_client.queue_status_full and used as the
# `job_id` label value on the resource gauges (matches the job in the gpustat output above).
job_id = '561561'

In [8]:
import asyncio

from prometheus_client import CollectorRegistry, Gauge, push_to_gateway


# Registry holding all per-job resource gauges; push_resources pushes it as a whole.
registry = CollectorRegistry()


def _job_gauge(name: str, documentation: str) -> Gauge:
    """Create a gauge labelled by ``job_id`` and register it on ``registry``."""
    return Gauge(name, documentation, ['job_id'], registry=registry)


cpu_percent_gauge = _job_gauge('pbs_job_cpu_percent', 'CPU percentage of PBS job')
cpu_time_gauge = _job_gauge('pbs_job_cpu_seconds_total', 'Total CPU time of PBS job in seconds')
num_cpus_gauge = _job_gauge('pbs_job_num_cpus', 'Number of CPUs allocated to PBS job')
mem_bytes_gauge = _job_gauge('pbs_job_mem_bytes', 'Resident memory bytes of PBS job')
vmem_bytes_gauge = _job_gauge('pbs_job_vmem_bytes', 'Virtual memory bytes of PBS job')
wall_time_gauge = _job_gauge('pbs_job_wall_seconds_total', 'Wall clock time of PBS job in seconds')


async def push_resources():
    """Poll PBS for the monitored job's resource usage and push it to the Pushgateway.

    Loops until the task is cancelled or the cell is interrupted. On each
    iteration it reads ``resources_used`` for the notebook-level ``job_id`` via
    ``pbs_pro_client.queue_status_full``, updates the six ``pbs_job_*`` gauges
    (labelled with ``job_id``), pushes ``registry`` to the Pushgateway and then
    sleeps for 10 seconds.
    """
    while True:
        pbs_pro_job_full = await pbs_pro_client.queue_status_full(job_id)
        ru = pbs_pro_job_full.resources_used

        cpu_percent_gauge.labels(job_id=job_id).set(ru.cpu_percent)
        cpu_time_gauge.labels(job_id=job_id).set(ru.cpu_time.total_seconds())
        num_cpus_gauge.labels(job_id=job_id).set(ru.num_cpus)
        mem_bytes_gauge.labels(job_id=job_id).set(ru.mem)
        vmem_bytes_gauge.labels(job_id=job_id).set(ru.vmem)
        wall_time_gauge.labels(job_id=job_id).set(ru.wall_time.total_seconds())

        # Push under PUSHGATEWAY_JOB (not the PBS job id): that is the job the
        # cleanup cell deletes via pushgateway_client.delete_job, so the pushed
        # group can actually be removed afterwards. The PBS job id is already
        # carried by the `job_id` label on every gauge.
        # push_to_gateway performs a blocking HTTP request, so run it in a worker
        # thread instead of stalling the kernel's event loop.
        await asyncio.to_thread(
            push_to_gateway, pushgateway_base_url, job=PUSHGATEWAY_JOB, registry=registry
        )
        await asyncio.sleep(10)

In [None]:
# Runs the resource push loop; it is a `while True` loop with no exit condition,
# so this cell keeps running until it is interrupted.
await push_resources()

In [12]:
import asyncio

from prometheus_client import CollectorRegistry, Gauge, push_to_gateway


# Dedicated registry for the GPU gauges. It deliberately does NOT reuse the global
# name `registry`: push_resources looks that name up at call time, so rebinding it
# here would make push_resources push the GPU registry instead of its own gauges.
gpu_registry = CollectorRegistry()
gpu_util_percent_gauge = Gauge(
    'pbs_job_gpu_util_percent',
    'GPU utilization percent of PBS job',
    ['job_id', 'node', 'gpu'],
    registry=gpu_registry,
)
gpu_mem_bytes_gauge = Gauge(
    'pbs_job_gpu_mem_bytes',
    'GPU memory used (bytes) of PBS job',
    ['job_id', 'node', 'gpu'],
    registry=gpu_registry,
)


async def push_gpu_stats():
    """Poll GPU statistics and push them to the Pushgateway every 10 seconds.

    Loops until the task is cancelled or the cell is interrupted. Each iteration
    fetches ``hpc_client.gpu_statistics()``, drops gauge children whose
    ``(job_id, node, gpu)`` label set is no longer reported, sets the current
    utilisation and memory values, and pushes ``gpu_registry`` under the
    ``gpu_stats`` job.
    """
    prev_labels: set[tuple[str, str, str]] = set()
    while True:
        stats = await hpc_client.gpu_statistics()

        # collect current label sets
        current_labels: set[tuple[str, str, str]] = set()
        for entry in stats.entries:
            jid = str(entry.job_id)
            for sub in entry.sub_entries:
                current_labels.add((jid, sub.node, sub.gpu))

        # remove metrics for labels that disappeared, so stale GPUs/jobs are not
        # pushed again with their last known values
        for jid, node, gpu in prev_labels - current_labels:
            gpu_util_percent_gauge.remove(job_id=jid, node=node, gpu=gpu)
            gpu_mem_bytes_gauge.remove(job_id=jid, node=node, gpu=gpu)

        # remember what was reported this round for the next comparison
        prev_labels = current_labels

        # set new metrics
        for entry in stats.entries:
            jid = str(entry.job_id)
            for sub in entry.sub_entries:
                gpu_util_percent_gauge.labels(job_id=jid, node=sub.node, gpu=sub.gpu).set(
                    sub.used_percent
                )
                gpu_mem_bytes_gauge.labels(job_id=jid, node=sub.node, gpu=sub.gpu).set(sub.mem_used)

        # push all at once; push_to_gateway performs a blocking HTTP request, so
        # run it in a worker thread instead of stalling the kernel's event loop
        await asyncio.to_thread(
            push_to_gateway, pushgateway_base_url, job='gpu_stats', registry=gpu_registry
        )

        await asyncio.sleep(10)

In [None]:
# Runs the GPU push loop; it is a `while True` loop with no exit condition,
# so this cell keeps running until it is interrupted.
await push_gpu_stats()

In [8]:
await pushgateway_client.delete_job(PUSHGATEWAY_JOB)

matchers = [f'pbs_job_cpu_percent{{job_id="{job_id}"}}', f'pbs_job_mem_bytes{{job_id="{job_id}"}}']
await prometheus_hpc_admin_client.delete_series(matchers)