In [None]:
import json
import pickle
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import prometheus_api_client
from prometheus_api_client import PrometheusConnect
from prometheus_api_client.metric_range_df import MetricRangeDataFrame
from prometheus_api_client.metric_snapshot_df import MetricSnapshotDataFrame
from prometheus_api_client.metrics_list import MetricsList
from prometheus_api_client.utils import parse_datetime

## We connect to the Prometheus Metrics server which logs the data for the EAF's Triton Server

In [None]:
# Client handle used by every query cell below; the endpoint is plain HTTP, so SSL verification is switched off
prom = PrometheusConnect(
    url="http://lsdataitb.fnal.gov:9009/prometheus",
    disable_ssl=True,
)

This lists how many metrics are available, and in particular the ones for the GPU MIG instances

In [None]:
# Fetch every metric name known to the server. A bare `len(xxx)` in the middle of a cell is never
# displayed (only the last expression is), so the count is printed explicitly.
xxx = prom.all_metrics()
print(f"{len(xxx)} metrics available")
# The last expression is the cell output: only the GPU / DCGM metric names
[xx for xx in xxx if "GPU" in xx or "DCGM" in xx]

## Main Query Function
This uses the PromQL language to get metrics recorded between the first and last timestamp. To prevent overloading the prometheus server,
the timestamps should be broken up into pairs that are on the order of hours to a day or two (so to get a week's worth of logs, use seven 1-day increments as a ```list[(t0, t1), (t1, t2), (t2, t3), (t3, t4), (t4, t5), (t5, t6), (t6, t7)]```).
These will be concatenated so that the returned results dictionary contains, for each query, a single pandas dataframe for (t0, t7).

An important caveat about Prometheus metrics is their disparate collection and timing. Each metric tends to log one figure of merit, with many labels for association to a particular task or resource type. As an example, the number of inferences computed to this point may be logged (with an associated timestamp), but the nearly-coincident inference request duration may be logged slightly sooner, later, or not at all. This point is important, as missing metrics are not rare. Many of these are logged at the EAF with a frequency of 15s or 30s. The Prometheus developers recommend aggregating metrics in a time-window 4x larger than the collection frequency. As such, 60s or 120s should be chosen as the ```step``` value in the function to avoid noisy and missing data.

In [None]:
def _run_range_query(query, timestamp_tuples, step):
    """Run ``query`` once per (start, end) window and concatenate the non-empty answers.

    Returns a single dataframe (windows stacked along the index axis), or ``None`` when
    no window returned any data.
    """
    frames = []
    for st, et in timestamp_tuples:
        raw = prom.custom_query_range(
            query=query,
            start_time=parse_datetime(st),
            end_time=parse_datetime(et),
            step=step,
        )
        # A window can legitimately be empty (e.g. a model that was idle during it)
        if len(raw) > 0:
            frames.append(MetricRangeDataFrame(raw))
    if not frames:
        return None
    # NOTE(review): adjacent windows that share an endpoint, e.g. (t0, t1), (t1, t2), may both
    # return a sample at t1 and leave duplicate index rows here -- confirm and de-duplicate if needed.
    return pd.concat(frames, axis=0)


def get_all_queries(timestamp_tuples, step):
    """Query Prometheus for Triton inference metrics and GPU utilisation metrics.

    Parameters
    ----------
    timestamp_tuples : list of (start, end) pairs
        Each element is passed to ``prometheus_api_client.utils.parse_datetime``,
        e.g. ``"2023-03-30 at 16:00:00 MDT"``. Every query is executed once per pair and
        the per-pair dataframes are concatenated.
    step : str
        PromQL duration such as ``"60s"``. It is used both as the range-vector window inside
        each query and as the resolution step of the range query. It should be ~4x the longest
        metric collection interval.

    Returns
    -------
    results : dict[str, pandas.DataFrame]
        One dataframe per query key. Base queries that returned no data map to an empty
        dataframe. Model- and GPU-specific keys are only present when data was returned.
    queries : list[tuple[str, str]]
        ``(key, fully resolved PromQL string)`` pairs, for debugging. GPU instances that were
        dropped for lack of data never receive an index and are therefore not listed.
    unique_model_versions : set[str]
        ``"model/version"`` strings for which both model-specific queries returned data.
    unique_gpu_instances : set[str]
        ``"device/GPU_I_ID/instance"`` strings for which both GPU queries returned data.
        ``enumerate(unique_gpu_instances)`` yields the index ``N`` used in the
        ``gpu_tensor_util_N`` / ``gpu_dram_util_N`` keys, as long as the set is not modified.
    """
    # A dictionary for our results
    results = {}
    # Tuples of the queries we'll make, for debugging and info
    queries = []

    # Shared building blocks of the PromQL strings
    success_delta = f"delta(nv_inference_request_success[{step}])"
    gpu_labels = "exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'"
    duration_metrics = {
        "req": "nv_inference_request_duration_us",
        "que": "nv_inference_queue_duration_us",
        "inp": "nv_inference_compute_input_duration_us",
        "inf": "nv_inference_compute_infer_duration_us",
        "out": "nv_inference_compute_output_duration_us",
    }
    gpu_metrics = {
        "tensor": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE",
        "dram": "DCGM_FI_PROF_DRAM_ACTIVE",
    }

    # Basic queries. Some of them double as proxies for discovering which model/versions and
    # which GPU instances were active ("inf_rate" and "gpu_tensor_util" respectively).
    base_queries = {
        "num_instances": f"count((sum by(pod) ({success_delta})) > 0)",
        "inf_rate_net": f"sum (rate(nv_inference_count[{step}]))",
        "inf_rate_bypod": f"sum by(pod) (rate(nv_inference_count[{step}]))",
        "inf_rate": f"sum by(model, version, pod) (rate(nv_inference_count[{step}]))",
        "inf_cache_hit_rate": f"sum by(model, version, pod) (rate(nv_cache_num_hits_per_model[{step}]))",
        "inf_reqs_net": f"sum(rate(nv_inference_request_success[{step}]))",
        "inf_reqs_bypod": f"sum by(pod) (rate(nv_inference_request_success[{step}]))",
        "inf_reqs": f"sum by(model, version, pod) (rate(nv_inference_request_success[{step}]))",
    }
    # Duration per successful request; the "1+" in the denominator avoids division by zero
    for suffix, aggregation in (("_net", "avg"), ("", "avg by(model, version, pod)")):
        for stage, metric in duration_metrics.items():
            base_queries[f"inf_{stage}_dur{suffix}"] = (
                f"{aggregation} (delta({metric}[{step}])/(1+1000000*{success_delta}))"
            )
    for name, metric in gpu_metrics.items():
        base_queries[f"gpu_{name}_util"] = (
            "sum by(device,GPU_I_ID,instance) (avg_over_time (" + metric + "{" + gpu_labels + "}[" + step + "]))"
        )

    unique_model_versions = None
    unique_gpu_instances = None
    for key, query in base_queries.items():
        # Log the queries, as they're easier to parse after being resolved fully
        queries.append((key, query))
        frame = _run_range_query(query, timestamp_tuples, step)
        if frame is None:
            # Keep the key so later lookups do not fail; the frame simply has no rows/columns
            frame = pd.DataFrame()
        results[key] = frame

        # The first result carrying model/version labels defines the set of models to query individually
        if unique_model_versions is None and {"model", "version"}.issubset(frame.columns):
            unique_model_versions = set((frame["model"] + "/" + frame["version"]).dropna().values)
        # At the EAF, the device ('nvidiaX'), GPU instance ID (enumeration) and the instance
        # (IP address of host machine) are sufficient to make a unique identifier
        if unique_gpu_instances is None and {"device", "GPU_I_ID", "instance"}.issubset(frame.columns):
            unique_gpu_instances = set(
                (frame["device"] + "/" + frame["GPU_I_ID"] + "/" + frame["instance"]).dropna().values
            )
    # Nothing discovered means nothing to query individually
    if unique_model_versions is None:
        unique_model_versions = set()
    if unique_gpu_instances is None:
        unique_gpu_instances = set()

    # Model-specific queries: the number of Triton instances that served requests for this model,
    # and the inference rate of that model across all Triton instances, per time step.
    # Each entry remembers its model/version so an empty answer can drop the right set element.
    model_queries = {}
    discovered_models = sorted(unique_model_versions)
    for model_version in discovered_models:
        parts = model_version.split("/")
        selector = "{model='" + parts[0] + "',version='" + parts[1] + "'}[" + step + "]"
        model_queries["num_instances_" + model_version] = (
            model_version,
            "count((sum by(pod) (delta(nv_inference_request_success" + selector + "))) > 0)",
        )
    for model_version in discovered_models:
        parts = model_version.split("/")
        selector = "{model='" + parts[0] + "',version='" + parts[1] + "'}[" + step + "]"
        model_queries["inf_rate_" + model_version] = (
            model_version,
            "sum (rate(nv_inference_count" + selector + "))",
        )
    for key, (model_version, query) in model_queries.items():
        queries.append((key, query))
        frame = _run_range_query(query, timestamp_tuples, step)
        if frame is None:
            # No data at all for this model query: forget the model so it is not iterated over later
            unique_model_versions.discard(model_version)
        else:
            results[key] = frame
    # A model is only usable downstream when both of its queries returned data
    for key, (model_version, _) in model_queries.items():
        if model_version not in unique_model_versions:
            results.pop(key, None)

    # GPU metrics, one pair of queries per GPU instance. The extra device/GPU_I_ID/instance labels
    # restrict each query to a single GPU (slice); avg_over_time reduces the samples within `step`
    # to one value per series, and the outer sum/avg collapses whatever label combinations remain.
    gpu_data = {}
    for gpu_inst in unique_gpu_instances:
        parts = gpu_inst.split("/")
        selector = (
            "{" + gpu_labels + ",device='" + parts[0] + "',GPU_I_ID='" + parts[1]
            + "',instance='" + parts[2] + "'}[" + step + "]"
        )
        tensor_query = "sum (avg_over_time(" + gpu_metrics["tensor"] + selector + "))"
        dram_query = "avg (avg_over_time(" + gpu_metrics["dram"] + selector + "))"
        tensor_frame = _run_range_query(tensor_query, timestamp_tuples, step)
        if tensor_frame is None:
            continue
        dram_frame = _run_range_query(dram_query, timestamp_tuples, step)
        if dram_frame is None:
            continue
        gpu_data[gpu_inst] = ((tensor_query, tensor_frame), (dram_query, dram_frame))

    # Number the surviving instances only after the empty ones are gone, so that
    # enumerate(unique_gpu_instances) in later cells matches the keys stored here.
    unique_gpu_instances = set(gpu_data)
    for position, prefix in enumerate(("gpu_tensor_util_", "gpu_dram_util_")):
        for mg, gpu_inst in enumerate(unique_gpu_instances):
            query, frame = gpu_data[gpu_inst][position]
            queries.append((prefix + str(mg), query))
            results[prefix + str(mg)] = frame
    return results, queries, unique_model_versions, unique_gpu_instances

In addition to the query dataframes, the function returns a list of the key:query pairs, the set of active model/versions, and the set of active GPU MIG slices.

In [None]:
# One three-hour window, queried with a 60s step
results, queries, unique_model_versions, unique_gpu_instances = get_all_queries(
    [("2023-03-30 at 16:00:00 MDT", "2023-03-30 at 19:00:00 MDT")],
    step="60s",
)

In [None]:
# Display the model/version strings and the GPU instance identifiers returned by get_all_queries
unique_model_versions, unique_gpu_instances

An example result, the average percent utilization of the 1st GPU tensor pipe

In [None]:
# Display the dataframe stored under the first per-GPU tensor-utilisation key
results["gpu_tensor_util_0"]

## Converting to a unified dataframe
This function takes the subset of results that can be concatenated into new columns,
including the inference rate (with breakdowns by model), the timing of the inference request (overall request time, as well as broken down into queue time, input time, compute time, and output time), and the GPU DRAM and tensor utilization. See the NVIDIA docs for more information on how these quantities are calculated and stored.

In [None]:
def convert_results_to_df(results):
    """Merge the compatible per-query dataframes of ``results`` into one wide dataframe.

    Parameters
    ----------
    results : dict[str, pandas.DataFrame]
        Must contain the net keys ``inf_rate_net``, ``num_instances``, ``inf_reqs_net`` and
        ``inf_{req,que,inp,inf,out}_dur_net``. Per-model pairs ``inf_rate_<model/version>`` /
        ``num_instances_<model/version>`` and per-GPU pairs ``gpu_tensor_util_<N>`` /
        ``gpu_dram_util_<N>`` are picked up from the keys themselves; a model or GPU is only
        included when both halves of its pair are present. The input frames are not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``results["inf_rate_net"]``, with NaN replaced by 0. Columns are ``rate``,
        ``num_instances``, the net request/duration columns, ``rate_<model>`` /
        ``num_instances_<model>``, ``gpu_tensor_util_<N>`` / ``gpu_dram_util_<N>``, and the
        aggregates ``summed_rate``, ``summed_instances``, ``summed_gpu_tensor_util`` and
        ``summed_gpu_dram_util``.
    """
    # Left joins broadcast onto the left frame's index and fill NaN where the right frame has no
    # row. 'inf_rate_net' is the base because it should have a value for every timestamp.
    # The rsuffix turns each incoming 'value' column into 'value_<key>', which is parseable later.
    i0 = results["inf_rate_net"]
    for key in (
        "num_instances",
        "inf_reqs_net",
        "inf_req_dur_net",
        "inf_que_dur_net",
        "inf_inp_dur_net",
        "inf_inf_dur_net",
        "inf_out_dur_net",
    ):
        i0 = i0.join(results[key], how="left", rsuffix="_" + key)

    # Add the model metrics as rate_X / num_instances_X, where X is the model name.
    # The model list comes from the keys of `results`, not from notebook globals.
    rate_prefix, inst_prefix = "inf_rate_", "num_instances_"
    model_versions = []
    for key in results:
        if key.startswith(rate_prefix) and inst_prefix + key[len(rate_prefix):] in results:
            model_versions.append(key[len(rate_prefix):])
    for model in model_versions:
        # NOTE(review): the column suffix uses only the model name, so two versions of the same
        # model would produce clashing column names in the join below.
        model_name = model.split("/")[0]
        itemp = results[rate_prefix + model].join(
            results[inst_prefix + model],
            how="left",
            rsuffix="_num_instances_" + model_name,
            lsuffix="_rate_" + model_name,
        )
        i0 = i0.join(itemp, how="left")

    # Add the GPU instance metrics. The index N is read from the keys, so it always matches what
    # is stored, even if the numbering is not contiguous.
    tensor_prefix, dram_prefix = "gpu_tensor_util_", "gpu_dram_util_"
    gpu_ids = []
    for key in results:
        if key.startswith(tensor_prefix) and dram_prefix + key[len(tensor_prefix):] in results:
            gpu_ids.append(key[len(tensor_prefix):])
    for gpu_id in gpu_ids:
        # fillna returns copies here so the caller's frames are left untouched
        tensor_util = results[tensor_prefix + gpu_id].fillna(0)
        dram_util = results[dram_prefix + gpu_id].fillna(0)
        itemp = tensor_util.join(
            dram_util,
            how="left",
            rsuffix="_gpu_dram_util_" + gpu_id,
            lsuffix="_gpu_tensor_util_" + gpu_id,
        )
        i0 = i0.join(itemp, how="left")

    # Get rid of the "value" in column names, and fill NaN values with 0 everywhere
    i0 = i0.rename(columns={"value": "rate"})
    i0 = i0.rename(columns={col: col[len("value_"):] for col in i0.columns if col.startswith("value_")})
    i0 = i0.fillna(0)

    # Aggregate some stats for models.
    # The summed rate and total inference rate should match, otherwise something was double-counted.
    # The summed instances may NOT match num_instances: if one model is active on 5 of 10 servers
    # in a timestep and another on 7 of 10, there are 12 summed instances. summed_instances divided
    # by num_instances is therefore a (lossy) measure of average model concurrency per server.
    valid_model_keys = [
        col for col in i0.columns
        if col.startswith("rate_") and "num_instances_" + col[len("rate_"):] in i0.columns
    ]
    i0["summed_rate"] = sum(i0[col] for col in valid_model_keys)
    i0["summed_instances"] = sum(i0["num_instances_" + col[len("rate_"):]] for col in valid_model_keys)

    # Aggregate some stats for GPU instances. Both sums iterate over the GPU columns.
    valid_gpu_keys = [
        col for col in i0.columns
        if col.startswith("gpu_tensor_util") and col.replace("tensor", "dram", 1) in i0.columns
    ]
    i0["summed_gpu_tensor_util"] = sum(i0[col] for col in valid_gpu_keys)
    i0["summed_gpu_dram_util"] = sum(i0[col.replace("tensor", "dram", 1)] for col in valid_gpu_keys)
    return i0

In [None]:
# Build the unified dataframe from the query results, then display it as the cell output
i0 = convert_results_to_df(results)
i0

In [None]:
# Save results in a pickle file for later.
# `pickle` is imported with the other imports at the top of the notebook; the file name is a plain
# string because it has no placeholders to format.
with open("triton_metrics_test.pickle", "wb") as output_file:
    pickle.dump(i0, output_file)

## A few simple plots

In [None]:
# Overlay the net inference rate with the summed GPU tensor utilisation, rescaled to the same peak
plt.plot(i0.index.values, i0.rate.values, label="net inference rate")
peak_tensor_util = max(i0.summed_gpu_tensor_util)
# Guard against an all-zero utilisation column, which would make the scale factor inf/NaN
scale_value = max(i0.rate.values)/peak_tensor_util if peak_tensor_util > 0 else 1.0
plt.plot(i0.index.values, i0.summed_gpu_tensor_util.values*scale_value, color="tab:red",
         label="summed GPU tensor utilisation (rescaled)")
plt.xlabel("time")
plt.ylabel("net inference rate")
plt.legend();

In [None]:
# Plot the rate versus number of instances, where at least 1 active instance is serving results
plt.scatter("num_instances", "rate", data=i0[i0.num_instances > 0], color="tab:red")
# Label the axes so the figure can be read on its own
plt.xlabel("num_instances")
plt.ylabel("rate");

In [None]:
# Plot the same thing, but specific to the pn_demo model
# NOTE(review): these columns only exist if pn_demo was active in the queried time range
plt.scatter("num_instances_pn_demo", "rate_pn_demo", data=i0[i0.num_instances > 0], color="tab:blue")
# Label the axes so the figure can be read on its own
plt.xlabel("num_instances_pn_demo")
plt.ylabel("rate_pn_demo");

In [None]:
# Rate versus number of serving instances for the svj_tch_gnn model
# NOTE(review): these columns only exist if svj_tch_gnn was active in the queried time range
plt.scatter("num_instances_svj_tch_gnn", "rate_svj_tch_gnn", data=i0[i0.num_instances > 0], color="tab:green")
# Label the axes so the figure can be read on its own
plt.xlabel("num_instances_svj_tch_gnn")
plt.ylabel("rate_svj_tch_gnn");

## Concurrency
How can we measure how many models are active per Triton server? The ```num_instances``` column is how many active servers there are.
The variable ```summed_instances``` is the sum of each model's active ```num_instances```. If the values are equal, then concurrency is low
(when defined as the number of ML models being run on an individual server). If ```summed_instances >> num_instances```, that indicates that each Triton server is tending to actively serve requests from multiple models in a given timespan.

In [None]:
# Concurrency: when every model sticks to its own servers, summed_instances ~ num_instances;
# at maximal concurrency, summed_instances ~ avg_num_models * num_instances.
# Only timesteps with at least one active instance are considered.
ii = i0.loc[i0.num_instances > 0, "summed_instances"] / i0.loc[i0.num_instances > 0, "num_instances"]
print(ii.mean(), ii.max(), ii.min())
# Population standard deviation (ddof=0), i.e. the square root of the variance
print(ii.std(ddof=0))

# Consistency check: the per-model rates should always add up to the net rate, so this ratio should be 1
kk = i0.loc[i0.num_instances > 0, "summed_rate"] / i0.loc[i0.num_instances > 0, "rate"]
print(kk.mean(), kk.max(), kk.min())
print(kk.std(ddof=0))