In [None]:
from datetime import timedelta, datetime
import time
import json
import prometheus_api_client
import hist
import copy
from prometheus_api_client import PrometheusConnect
from prometheus_api_client.metric_range_df import MetricRangeDataFrame
from prometheus_api_client.metric_snapshot_df import MetricSnapshotDataFrame
from prometheus_api_client.metrics_list import MetricsList
from prometheus_api_client.utils import parse_datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rich.progress import track

## We connect to the Prometheus Metrics server which logs the data for the EAF's Triton Server

In [None]:
# Notebook-wide client for the Prometheus HTTP API.
# Note that the URL is plain http and `disable_ssl=True` is passed to the client.
prom = PrometheusConnect(url="http://lsdataitb.fnal.gov:9009/prometheus", disable_ssl=True)

This lists how many metrics are available, and in particular the ones for the GPU MIG instances

In [None]:
# Ask the server for everything `all_metrics()` reports (presumably metric names; the filter below treats each entry as a string).
xxx = prom.all_metrics()
# NOTE: a notebook cell only displays its last expression, so this count is evaluated but never shown.
len(xxx)
# Keep the entries whose name mentions "GPU" or "DCGM"; this list is what the cell displays.
[xx for xx in xxx if "GPU" in xx or "DCGM" in xx]

## Main Query Function
This uses the PromQL language to get metrics recorded between the first and last timestamp. To prevent overloading the Prometheus server,
the timestamps should be broken up into pairs that each span on the order of hours to a day or two (so to get a week's worth of logs, use seven 1-day increments as a ```list[(t0, t1), (t1, t2), (t2, t3), (t3, t4), (t4, t5), (t5, t6), (t6, t7)]```).
These will be concatenated so that the returned results dictionary contains, for each query, a single pandas dataframe for (t0, t7).

An important caveat about Prometheus metrics is their disparate collection and timing. Each metric tends to log one figure of merit, with many labels for association to a particular task or resource type. As an example, the number of inferences computed to this point may be logged (with associated timestamp), but the nearly-coincident inference request duration may be logged slightly sooner, later, or not at all. This point is important, as missing metrics are not rare. Many of these are logged at the EAF with a frequency of 15s or 30s. The Prometheus developers recommend aggregating metrics in a time window 4x larger than the collection frequency. As such, 60s or 120s should be chosen as the ```step``` value in the function to avoid noisy and missing data.

In [None]:
# Placeholders for notebook-level state.
# `x` is overwritten with the unpickled benchmark data in the next cell.
x = None
# NOTE: import placed mid-notebook; `pickle` is needed by the next cell.
import pickle
# `first` / `last` are only ever assigned in commented-out code further down, so they stay None.
first = None
last = None

In [None]:
# Load one benchmark pickle and flatten the 'worklogs' of every entry into `all_logs`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only load pickles from a trusted source.
with open("UnitaryClient_88parcels_trial1_benchmark00.pickle", "rb") as pf:
    x = pickle.load(pf)
    # Accumulators across all entries of `x` (`all_z` is only filled by commented-out code below).
    all_logs = []
    all_z = []
    for xx in x:
        all_logs += xx['worklogs']
        # One "<hostname><pid>" string per work log of this entry.
        z = [xxx.hostname + str(xxx.pid) for xxx in xx['worklogs']]
        # Earliest start and latest end among this entry's work logs.
        # Both are overwritten on every iteration, so after the loop they describe the last entry only.
        dts_s = min([xxx.start_time for xxx in xx['worklogs']])
        dts_e = max([xxx.end_time for xxx in xx['worklogs']])
        #first = dts_s
        #last = dts_e
        #print(dts_s, dts_e)
        #print(parse_datetime(dts_s), parse_datetime(dts_e))
        #print(set(z))
        #all_z += z
    #print(len(set(all_z)))

In [None]:
# Sanity check of timezone handling: 21:12:07 MDT and 22:12:07 CDT are the same instant,
# so this displays True only if parse_datetime honours the timezone abbreviations.
parse_datetime("2023-04-03 at 21:12:07 MDT") == parse_datetime("2023-04-03 at 22:12:07 CDT")

# datetimeff = get_all_queries([(datetime.fromtimestamp(time.mktime(first)), datetime.fromtimestamp(time.mktime(last)))], "120s")
'dst',
 'fold',
 'fromisocalendar',
 'fromisoformat',
 'fromordinal',
 'fromtimestamp',
 'hour',
 'isocalendar',
 'isoformat',
 'isoweekday',
 'max',
 'microsecond',
 'min',
 'minute',
 'month',
 'now',
 'replace',
 'resolution',
 'second',
 'strftime',
 'strptime',
 'time',
 'timestamp',
 'timetuple',
 'timetz',
 'today',
 'toordinal',
 'tzinfo',
 'tzname',
 'utcfromtimestamp',
 'utcnow',
 'utcoffset',
 'utctimetuple',
 'weekday',
 'year']
# time_struct 
 'count',
 'index',
 'n_fields',
 'n_sequence_fields',
 'n_unnamed_fields',
 'tm_gmtoff',
 'tm_hour',
 'tm_isdst',
 'tm_mday',
 'tm_min',
 'tm_mon',
 'tm_sec',
 'tm_wday',
 'tm_yday',
 'tm_year',
 'tm_zone'

# namespace
'triton-nick'

In [None]:
def run_single_query(timestamp_tuples, 
                     query, 
                     step="120s", 
                     namespace='triton',
                     deduplicate=False,
                     prom=None):
    """Run one PromQL range query over several time chunks and concatenate the results.

    Parameters
    ----------
    timestamp_tuples : iterable of (start, end)
        Each pair is passed through ``parse_datetime`` and queried separately.
    query : str
        The PromQL expression.
    step : str
        Passed as the ``step`` of ``custom_query_range``.
    namespace : str
        Accepted for signature compatibility; not used by this function.
    deduplicate : bool
        When True, rows with a repeated index value are dropped (first kept).
    prom : PrometheusConnect or None
        Client to use. When None a client for the EAF server is created on
        call (not at definition time, so defining the function needs no
        connection object).

    Returns
    -------
    (DataFrame, list) when at least one chunk converted to a dataframe; the
    list holds raw responses that could not be converted.
    (None, None) when no chunk converted and nothing failed.
    (None, list) when no chunk converted but some responses failed to convert,
    so those failures are no longer silently discarded.
    """
    if prom is None:
        prom = PrometheusConnect(url="http://lsdataitb.fnal.gov:9009/prometheus", disable_ssl=True)

    frames = []
    errors = []
    print(f"Running Query: {query}")
    for st, et in track(timestamp_tuples, description="Retrieving - "):
        raw = prom.custom_query_range(
            query=query,
            start_time=parse_datetime(st),
            end_time=parse_datetime(et),
            step=step
        )
        # Queries are converted to a pandas dataframe
        try:
            frames.append(MetricRangeDataFrame(raw))
        except Exception:
            # An empty response simply means "no data in this chunk"; anything
            # else that fails to convert is kept for inspection.
            if not (isinstance(raw, list) and len(raw) == 0):
                errors.append(raw)

    if not frames:
        return None, (errors or None)

    # Dataframes are concatenated together along the time (index value) axis
    combined = pd.concat(frames, axis=0)
    if deduplicate:
        combined = combined[~combined.index.duplicated(keep='first')]
    return combined, errors
    
def get_all_queries_v2(timestamp_tuples, step="120s", namespace='triton', 
                       prom=None):
    """Run the standard Triton and GPU PromQL queries through ``run_single_query``.

    Parameters
    ----------
    timestamp_tuples : list of (start, end)
        Time chunks; every query is run once per chunk.
    step : str
        Used both as the query resolution step and as the range-vector window
        (``[step]``) inside every query.
    namespace : str or other
        When a str, Triton (``nv_*``) metrics are filtered on that ``namespace``
        label. Any other value disables the filter. The DCGM GPU selectors are
        hard-coded to ``exported_namespace='triton'`` regardless.
    prom : PrometheusConnect or None
        Client to use; created on call when None.

    Returns
    -------
    (results, errors, queries, unique_model_versions, unique_gpu_instances)
        ``results`` / ``errors`` map query key -> what ``run_single_query``
        returned. Per-model and per-GPU keys whose result was None are removed
        from ``results``. ``queries`` is the list of (key, PromQL) pairs in
        execution order. ``unique_gpu_instances`` is a set of
        "device/GPU_I_ID/instance" strings, or None when no basic query
        exposed a ``GPU_I_ID`` attribute.
    """
    if prom is None:
        prom = PrometheusConnect(url="http://lsdataitb.fnal.gov:9009/prometheus", disable_ssl=True)

    rs = ""
    rsm = ""
    if isinstance(namespace, str):
        rs = "{namespace='"+namespace+"'}"
        rsm = ",namespace='"+namespace+"'"
    window = "["+step+"]"
    gpu_labels = "exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'"

    # A dictionary for our results
    results = {}
    errors = {}
    # Tuples of the queries we'll make, for debugging and info
    queries = []

    def run(key, query):
        """Record the query, execute it, store result and errors under `key`."""
        queries.append((key, query))
        results[key], errors[key] = run_single_query(timestamp_tuples, query, step=step, prom=prom)
        return results[key]

    def per_request_avg(metric):
        """Average of delta(metric) per successful request (0.001 avoids dividing by zero)."""
        return ("avg (delta("+metric+rs+window+")/"
                + "(0.001+delta(nv_inference_request_success"+rs+window+")))")

    # Model/version pairs that actually served inferences drive the per-model queries below.
    unique_model_versions, _ = find_active_models(timestamp_tuples, namespace=namespace, prom=prom)
    unique_gpu_instances = None

    # Basic queries. "gpu_tensor_util" / "gpu_dram_util" also serve as the proxy from which
    # the set of GPU instances is discovered.
    base_queries = {
        "num_instances": "count((sum by(pod) (delta(nv_inference_request_success"+rs+window+"))) > 0)",
        "inf_rate_net": "sum (rate(nv_inference_count"+rs+window+"))",
        "inf_rate_bypod": "sum by(pod) (rate(nv_inference_count"+rs+window+"))",
        "inf_reqs_net": "sum(rate(nv_inference_request_success"+rs+window+"))",
        "inf_reqs_bypod": "sum by(pod) (rate(nv_inference_request_success"+rs+window+"))",
        "inf_reqs": "sum by(model, version, pod) (rate(nv_inference_request_success"+rs+window+"))",
        "inf_req_dur_net": per_request_avg("nv_inference_request_duration_us"),
        "inf_que_dur_net": per_request_avg("nv_inference_queue_duration_us"),
        "inf_inp_dur_net": per_request_avg("nv_inference_compute_input_duration_us"),
        "inf_inf_dur_net": per_request_avg("nv_inference_compute_infer_duration_us"),
        "inf_out_dur_net": per_request_avg("nv_inference_compute_output_duration_us"),
        "gpu_tensor_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{"+gpu_labels+"}"+window+"))",
        "gpu_dram_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_DRAM_ACTIVE{"+gpu_labels+"}"+window+"))",
    }
    for key, query in base_queries.items():
        frame = run(key, query)
        # At the EAF, the device ('nvidiaX'), the GPU instance ID and the instance (host address)
        # together make a unique identifier for a GPU (slice).
        if unique_gpu_instances is None and not isinstance(frame, list) and hasattr(frame, "GPU_I_ID"):
            unique_gpu_instances = set((frame.device+"/"+frame.GPU_I_ID+"/"+frame.instance).values)

    # Model-specific queries: number of Triton instances that served the model, and the
    # model's inference rate summed over all instances, per time step.
    if unique_model_versions is not None:
        model_queries = {}
        model_of_key = {}
        templates = (
            ("num_instances_", "count((sum by(pod) (delta(nv_inference_request_success", "))) > 0)"),
            ("inf_rate_", "sum (rate(nv_inference_count", "))"),
        )
        for prefix, head, tail in templates:
            for model_version in unique_model_versions:
                parts = model_version.split("/")
                selector = "{model='"+parts[0]+"',version='"+parts[1]+"'"+rsm+"}"
                model_queries[prefix+model_version] = head+selector+window+tail
                model_of_key[prefix+model_version] = model_version
        for key, query in model_queries.items():
            if run(key, query) is None:
                # No data for this model query: drop the empty entry and stop advertising the model.
                results.pop(key, None)
                if model_of_key[key] in unique_model_versions:
                    unique_model_versions.remove(model_of_key[key])

    # GPU metrics: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE (tensor-core activity) and
    # DCGM_FI_PROF_DRAM_ACTIVE, one query per discovered GPU instance.
    # NOTE(review): the tensor query aggregates with `sum` while the DRAM query uses `avg`;
    # kept as in the original - confirm this asymmetry is intended.
    if unique_gpu_instances is not None:
        gpu_list = list(unique_gpu_instances)
        gpu_queries = {}
        for prefix, aggregate, metric in (
            ("gpu_tensor_util_", "sum", "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"),
            ("gpu_dram_util_", "avg", "DCGM_FI_PROF_DRAM_ACTIVE"),
        ):
            for mg, gpu_inst in enumerate(gpu_list):
                parts = gpu_inst.split("/")
                gpu_queries[prefix+str(mg)] = (
                    aggregate+" (avg_over_time("+metric+"{"+gpu_labels+","
                    + "device='"+parts[0]+"',GPU_I_ID='"+parts[1]+"',instance='"+parts[2]+"'}"+window+"))"
                )
        for key, query in gpu_queries.items():
            if run(key, query) is None:
                # Only the empty result is dropped. The GPU stays in `unique_gpu_instances`
                # because result keys are numbered by enumeration order; removing an element
                # would shift the numbering of every later GPU. (The previous code attempted a
                # removal by index string, which always failed inside a bare `except`.)
                results.pop(key, None)
    return results, errors, queries, unique_model_versions, unique_gpu_instances

In [None]:
def prom_query_hash(query_result):
    """Build a string key identifying a query result by its Prometheus labels.

    Every (label, value) pair of ``query_result['metric']`` contributes a
    "($)label::value" fragment, in the dictionary's iteration order.
    """
    labels = query_result['metric']
    return "".join("($)" + name + "::" + value for name, value in labels.items())

def prom_query_add(query_A, query_B):
    """Merge two query results that carry the same label set.

    Returns a new dict with a deep copy of ``query_A['metric']`` and the deep-copied
    'values' of A followed by those of B. The inputs are not modified. An
    AssertionError is raised when the label hashes of the two results differ.
    """
    labels = copy.deepcopy(query_A['metric'])
    key_a, key_b = prom_query_hash(query_A), prom_query_hash(query_B)
    assert key_a == key_b, f"Incompatible metrics are being added: {query_A['metric']} |INCOMPATIBLE WITH| {query_B['metric']}"
    merged_values = copy.deepcopy(query_A['values'])
    merged_values += copy.deepcopy(query_B['values'])
    return {'metric': labels, 'values': merged_values}

def single_query_splt(timestamp_tuples, 
                      query, 
                      step="120s", 
                      namespace='triton',
                      deduplicate=False,
                      dataframe_mode="individual", # "individual", "bypass"; any other value -> one unified dataframe
                      prom=None):
    """Run one PromQL range query over several time chunks, merging per label set.

    Series returned for different chunks are merged when their labels hash to
    the same key (see ``prom_query_hash``), so each distinct label set ends up
    as one series covering all chunks.

    Parameters
    ----------
    timestamp_tuples : iterable of (start, end)
        Each pair is passed through ``parse_datetime`` and queried separately.
    query : str
        The PromQL expression.
    step : str
        Passed as the ``step`` of ``custom_query_range``.
    namespace : str
        Accepted for signature compatibility; not used by this function.
    deduplicate : bool
        When True, dataframe rows with a repeated index value are dropped.
    dataframe_mode : str (case-insensitive)
        "individual": one dataframe per series.
        "bypass": the merged raw series dicts, no dataframe conversion.
        anything else: a single dataframe built from all series.
    prom : PrometheusConnect or None
        Client to use; created on call when None.

    Returns
    -------
    (list, list): the results and the conversion errors, when there is at
    least one result. (None, None) when there is no result and no error.
    (None, list) when nothing converted but conversion errors were recorded.
    """
    if prom is None:
        prom = PrometheusConnect(url="http://lsdataitb.fnal.gov:9009/prometheus", disable_ssl=True)
    results_dict = {}
    errors = []
    print(f"Running Query: {query}")
    for st, et in track(timestamp_tuples, description="Retrieving"):
        chunk = prom.custom_query_range(
            query=query,
            start_time=parse_datetime(st),
            end_time=parse_datetime(et),
            step=step
        )
        for query_result in chunk:
            key = prom_query_hash(query_result)
            if key in results_dict:
                results_dict[key] = prom_query_add(results_dict[key], query_result)
            else:
                results_dict[key] = query_result

    # `results` is always bound, so a failed conversion can no longer surface as an
    # UnboundLocalError at the length check below.
    results = []
    mode = dataframe_mode.lower()
    if mode == "individual":
        # Queries are converted to a pandas dataframe, one per series
        for key, series in results_dict.items():
            try:
                df = MetricRangeDataFrame(series)
                if deduplicate:
                    df = df[~df.index.duplicated(keep='first')]
                results.append(df)
            except Exception:
                errors.append({key: series})
    elif mode == "bypass":
        results = list(results_dict.values())
    elif results_dict:
        # Unified mode; skipped when there are no series, as there is nothing to convert.
        try:
            df = MetricRangeDataFrame(list(results_dict.values()))
            if deduplicate:
                df = df[~df.index.duplicated(keep='first')]
            results = [df]
        except Exception:
            errors.append(results_dict)

    if len(results) > 0:
        return results, errors
    return None, (errors or None)

def get_all_queries_v3(timestamp_tuples, step="120s", granular_step=None, namespace='triton', deduplicate=False,
                       prom=None):
    """Run the standard Triton and GPU PromQL queries through ``single_query_splt``.

    Parameters
    ----------
    timestamp_tuples : list of (start, end)
        Time chunks; every query is run once per chunk.
    step : str
        Used both as the query resolution step and as the range-vector window
        (``[step]``) inside every query.
    granular_step : str or None
        Step used only for discovering active models and GPUs; falls back to
        ``step`` when None.
    namespace : str or other
        When a str, Triton (``nv_*``) metrics are filtered on that ``namespace``
        label. Any other value disables the filter. The DCGM GPU selectors are
        hard-coded to ``exported_namespace='triton'`` regardless.
    deduplicate : bool
        Forwarded to ``single_query_splt``.
    prom : PrometheusConnect or None
        Client to use; created on call when None.

    Returns
    -------
    (results, errors, queries, unique_model_versions, unique_gpu_instances)
        ``results`` / ``errors`` map query key -> what ``single_query_splt``
        returned (dataframe_mode="individual"). Per-model and per-GPU keys
        whose result was None are removed from ``results``, and a model with
        any empty query is removed from ``unique_model_versions``. ``queries``
        is the list of (key, PromQL) pairs in execution order.
        ``unique_gpu_instances`` is the list of "device/GPU_I_ID/instance"
        strings from ``find_all_gpus``; key ``gpu_*_util_<i>`` refers to
        element ``i`` of that list.
    """
    if prom is None:
        prom = PrometheusConnect(url="http://lsdataitb.fnal.gov:9009/prometheus", disable_ssl=True)

    rs = ""
    rsm = ""
    if isinstance(namespace, str):
        rs = "{namespace='"+namespace+"'}"
        rsm = ",namespace='"+namespace+"'"
    window = "["+step+"]"
    gpu_labels = "exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'"
    # (result-key stem, Triton duration counter) for the per-request timing breakdown
    duration_metrics = (
        ("inf_req_dur", "nv_inference_request_duration_us"),
        ("inf_que_dur", "nv_inference_queue_duration_us"),
        ("inf_inp_dur", "nv_inference_compute_input_duration_us"),
        ("inf_inf_dur", "nv_inference_compute_infer_duration_us"),
        ("inf_out_dur", "nv_inference_compute_output_duration_us"),
    )

    # A dictionary for our results
    results = {}
    errors = {}
    # Tuples of the queries we'll make, for debugging and info
    queries = []

    def run(key, query):
        """Record the query, execute it, store result and errors under `key`."""
        queries.append((key, query))
        results[key], errors[key] = single_query_splt(timestamp_tuples,
                                                      query,
                                                      step=step,
                                                      namespace=namespace,
                                                      deduplicate=deduplicate,
                                                      dataframe_mode="individual",
                                                      prom=prom)
        return results[key]

    def per_request_avg(metric, selector):
        """Average of delta(metric) per successful request (0.001 avoids dividing by zero)."""
        return ("avg (delta("+metric+selector+window+")/"
                + "(0.001+delta(nv_inference_request_success"+selector+window+")))")

    def model_selector(model_version):
        """Label selector for one "model/version" string, including the namespace filter."""
        parts = model_version.split("/")
        return "{model='"+parts[0]+"',version='"+parts[1]+"'"+rsm+"}"

    # Discover which model/version pairs were active and which GPU instances exist;
    # these drive the model-specific and GPU-specific queries below.
    columns_step = granular_step if granular_step is not None else step
    unique_model_versions, _ = find_active_models(timestamp_tuples, step=columns_step, namespace=namespace, prom=prom)
    unique_gpu_instances = find_all_gpus(timestamp_tuples, step=columns_step, namespace=None, prom=prom) #different namespace entirely

    # Basic, namespace-wide queries
    base_queries = {
        "num_instances": "count((sum by(pod) (delta(nv_inference_request_success"+rs+window+"))) > 0)",
        "inf_rate_net": "sum (rate(nv_inference_count"+rs+window+"))",
        "inf_reqs_net": "sum(rate(nv_inference_request_success"+rs+window+"))",
    }
    for stem, metric in duration_metrics:
        base_queries[stem+"_net"] = per_request_avg(metric, rs)
    for key, query in base_queries.items():
        run(key, query)

    # Model-specific queries: number of Triton instances that served the model, the model's
    # inference rate over all instances, and the per-request timing breakdown.
    if unique_model_versions is not None:
        model_queries = {}
        model_of_key = {}
        builders = [
            ("num_instances_", lambda sel: "count((sum by(pod) (delta(nv_inference_request_success"+sel+window+"))) > 0)"),
            ("inf_rate_", lambda sel: "sum (rate(nv_inference_count"+sel+window+"))"),
        ]
        for stem, metric in duration_metrics:
            # `metric=metric` binds the current loop value into the lambda
            builders.append((stem+"_", lambda sel, metric=metric: per_request_avg(metric, sel)))
        for prefix, build in builders:
            for model_version in unique_model_versions:
                model_queries[prefix+model_version] = build(model_selector(model_version))
                model_of_key[prefix+model_version] = model_version
        for key, query in model_queries.items():
            if run(key, query) is None:
                # No data for this model query: drop the empty entry and stop advertising the
                # model. The key -> model mapping makes this work for every key prefix (the
                # previous string-replace only recognised two of the seven prefixes).
                results.pop(key, None)
                if model_of_key[key] in unique_model_versions:
                    unique_model_versions.remove(model_of_key[key])

    # GPU metrics: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE (tensor-core activity) and
    # DCGM_FI_PROF_DRAM_ACTIVE, one query per GPU instance. Adding the device / GPU_I_ID /
    # instance labels restricts each query to exactly that GPU (slice).
    # NOTE(review): the tensor query aggregates with `sum` while the DRAM query uses `avg`;
    # kept as in the original - confirm this asymmetry is intended.
    if unique_gpu_instances is not None:
        gpu_queries = {}
        for prefix, aggregate, metric in (
            ("gpu_tensor_util_", "sum", "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"),
            ("gpu_dram_util_", "avg", "DCGM_FI_PROF_DRAM_ACTIVE"),
        ):
            for mg, gpu_inst in enumerate(unique_gpu_instances):
                parts = gpu_inst.split("/")
                gpu_queries[prefix+str(mg)] = (
                    aggregate+" (avg_over_time("+metric+"{"+gpu_labels+","
                    + "device='"+parts[0]+"',GPU_I_ID='"+parts[1]+"',instance='"+parts[2]+"'}"+window+"))"
                )
        for key, query in gpu_queries.items():
            if run(key, query) is None:
                # Only the empty result is dropped. The GPU stays in `unique_gpu_instances`
                # because result keys are numbered by list position; removing an element would
                # shift the numbering of every later GPU. (The previous code attempted a removal
                # by index string, which always failed inside a bare `except`.)
                results.pop(key, None)
    return results, errors, queries, unique_model_versions, unique_gpu_instances


def find_active_models(timestamp_tuples, step="120s", namespace=None, prom=None):
    """Split the model/version pairs seen in a time range into active and inactive ones.

    A single range query is issued through ``single_query_splt`` covering the
    start of the first window up to the end of the last window. Each returned
    series is expected to be a dict with a ``'metric'`` entry (holding the
    ``model`` and ``version`` labels) and a ``'values'`` entry (a sequence of
    ``(timestamp, value)`` samples).

    Parameters
    ----------
    timestamp_tuples : sequence of (start, end) pairs
        Only the first start and the last end are used.
    step : str
        Used both as the ``rate()`` look-back window and as the query step.
    namespace, prom
        Forwarded unchanged to ``single_query_splt``.

    Returns
    -------
    (active_models, inactive_models) : tuple of lists of str
        ``"model/version"`` identifiers. A pair is active when the sum of its
        inference-rate samples over the whole range is greater than zero.
        Both lists are empty when the query yields no results.
    """
    full_range = [(timestamp_tuples[0][0], timestamp_tuples[-1][1])]
    query = "sum by(model, version) (rate(nv_inference_count[" + step + "]))"
    series_list = single_query_splt(
        full_range,
        query,
        step=step,
        namespace=namespace,
        dataframe_mode="bypass",
        deduplicate=False,
        prom=prom
    )[0]  # Only the results are needed; the second element is ignored

    active_models = []
    inactive_models = []
    # single_query_splt may hand back None when nothing matched; treat that as "no series"
    for series in series_list or []:
        total = sum(float(sample[1]) for sample in series['values'])
        label = series['metric']['model'] + "/" + series['metric']['version']
        if total > 0:
            active_models.append(label)
        else:
            inactive_models.append(label)
    return active_models, inactive_models

def find_all_gpus(timestamp_tuples, step="120s", namespace=None, prom=None):
    """List the GPU instances that reported tensor-activity metrics in a time range.

    A single range query is issued through ``single_query_splt`` covering the
    start of the first window up to the end of the last window, grouping the
    ``DCGM_FI_PROF_PIPE_TENSOR_ACTIVE`` metric by ``device``, ``GPU_I_ID`` and
    ``instance``. The label selector is hard-coded to the ``triton``
    container/namespace.

    Parameters
    ----------
    timestamp_tuples : sequence of (start, end) pairs
        Only the first start and the last end are used.
    step : str
        Used both as the ``avg_over_time()`` window and as the query step.
    namespace, prom
        Forwarded unchanged to ``single_query_splt``.

    Returns
    -------
    list of str
        One ``"device/GPU_I_ID/instance"`` identifier per returned series, in
        the order the series were returned. Empty when nothing matched.
    """
    full_range = [(timestamp_tuples[0][0], timestamp_tuples[-1][1])]
    query = (
        "sum by(device, GPU_I_ID, instance) (avg_over_time (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"
        "{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'}["
        + step + "]))"
    )
    series_list = single_query_splt(
        full_range,
        query,
        step=step,
        namespace=namespace,
        dataframe_mode="bypass",
        deduplicate=False,
        prom=prom
    )[0]  # Only the results are needed; the second element is ignored

    devices = []
    # Only the labels matter here; the sample values are never inspected.
    # single_query_splt may hand back None when nothing matched.
    for series in series_list or []:
        labels = series['metric']
        devices.append(labels['device'] + "/" + labels['GPU_I_ID'] + "/" + labels['instance'])
    return devices

In [None]:
# Run the v3 query collection over five consecutive windows (roughly 12 days each)
# spanning 2023-02-28 12:00 through 2023-04-30 16:00. Each window's end is the next
# window's start, so the ranges are contiguous.
# The call returns the per-key results and errors, the list of (key, query) pairs,
# and the model/version and GPU-instance identifiers that were discovered.
# NOTE(review): the meaning of granular_step="1d" is defined by get_all_queries_v3,
# which is not shown in this part of the notebook - presumably the size of the
# sub-windows each range is split into; confirm against its definition.
results_v3, errors_v3, queries_v3, unique_model_versions_v3, unique_gpu_instances_v3 = get_all_queries_v3(
    [("2023-02-28 12:00:00", "2023-03-12 12:00:00"),
     ("2023-03-12 12:00:00", "2023-03-24 12:00:00"),
     ("2023-03-24 12:00:00", "2023-04-06 12:00:00"),
     ("2023-04-06 12:00:00", "2023-04-18 12:00:00"),
     ("2023-04-18 12:00:00", "2023-04-30 16:00:00"),
    ],
    namespace=None,
    deduplicate=True,
    step="120s",
    granular_step="1d"
)

In [None]:
unique_gpu_instances_v3

In [None]:
results_v3.keys()

In [None]:
def convert_results_to_df_v2(results, step, unique_model_versions=None, unique_gpu_instances=None, add_model_stats=True, add_gpu_stats=False):
    """Merge single-series query results into one dataframe on a regular time grid.

    Every dataframe in ``results`` is reindexed onto a common
    ``pd.date_range`` that runs from the earliest to the latest timestamp
    found in any of them, with frequency ``step``. Timestamps missing from a
    dataframe are filled with 0. The ``value`` column of each reindexed
    dataframe then becomes a column named after its key.

    Parameters
    ----------
    results : dict[str, list[pandas.DataFrame]]
        Maps a result key to a list of dataframes indexed by timestamp, each
        with a ``value`` column. Only keys whose list holds exactly one
        dataframe are merged: keys with several dataframes are reported and
        skipped, and keys with an empty (or ``None``) list are skipped.
    step : str
        Frequency of the common index, e.g. ``"120s"``.
    unique_model_versions, unique_gpu_instances, add_model_stats, add_gpu_stats
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame or None
        The merged dataframe, or ``None`` when no key could be merged
        (including when ``results`` contains no dataframes at all).
        If reindexing one of the dataframes fails, a message is printed and
        the tuple ``(new_index, offending_dataframe)`` is returned instead so
        the failure can be inspected interactively.
    """
    # Establish the overall time span covered by any of the dataframes
    min_dates = []
    max_dates = []
    for vlist in results.values():
        for v in vlist or []:
            min_dates.append(min(v.index.values))
            max_dates.append(max(v.index.values))
    if not min_dates:
        # Nothing to merge: avoid the opaque "min() arg is an empty sequence" error
        return None
    new_index = pd.date_range(min(min_dates), max(max_dates), freq=step)

    ret = None
    for k, vlist in results.items():
        if vlist is None or len(vlist) == 0:
            # No data for this key. Skipping it prevents the previous key's
            # reindexed frame from being silently stored under this name.
            continue
        if len(vlist) > 1:
            print(f"Unable to add results column {k} due to multiple un-keyed results")
            continue
        v = vlist[0]
        try:
            tmp = v.reindex(new_index, fill_value=0)
        except Exception:
            # Debugging aid: hand the pieces back to the caller for inspection
            print(f"failure to reindex {k} 1 --> {v.columns}")
            return new_index, v
        if ret is None:
            # The first merged frame becomes the base; its 'value' column takes the key's name
            ret = tmp.rename(columns={"value": k})
        else:
            assert np.all(ret.index.values == tmp.index.values), "Mismatched Time Indices Detected"
            ret.loc[:, k] = tmp["value"]
    return ret
        
checkcheck = convert_results_to_df_v2({k: v for k, v in results_v3.items()}, step="120s")

In [None]:
cc = checkcheck
cc

In [None]:
cc.to_csv("test_pn_demos.csv", sep='\t')

In [None]:
queues = cc.loc[:, ['inf_que_dur_emj_gnn_aligned/1',
             'inf_que_dur_pn_demo/1',
             'inf_que_dur_pn_demo_bkg1/1',
             'inf_que_dur_pn_demo_bkg2/1',
             'inf_que_dur_pn_demo_bkg3/1',
             'inf_que_dur_reconstruction_bdt_xgb/1',
             'inf_que_dur_svj_tch_gnn/1']]

In [None]:
max_ms = queues.max(axis=1)/1000

In [None]:
# Per-timestep net inference rate divided by the number of instances (x) against
# the largest per-model queue duration in that timestep (y, see max_ms).
plt.scatter(cc['inf_rate_net']/cc['num_instances'], max_ms.values)
# Label the axes so the figure can be read on its own
plt.xlabel("inf_rate_net / num_instances")
plt.ylabel("max per-model queue duration (max_ms)")
plt.ylim(0, 1500)

In [None]:
    
def the_rest():
    """Join the per-query dataframes into one frame and add summed model/GPU columns.

    NOTE(review): this function takes no arguments but reads ``results``,
    ``min_date``, ``max_date``, ``add_model_stats``, ``unique_model_versions``,
    ``add_gpu_stats`` and ``unique_gpu_instances`` as free names, so it only
    runs if all of them exist as notebook globals. It also returns nothing:
    the unified frame ``i0`` is built and then dropped. It looks like the tail
    of a dataframe-conversion function that was split off - confirm whether it
    is still needed.

    Steps performed:
      1. Join the net-level frames onto ``results["inf_rate_net"]`` (left joins).
      2. Optionally join per-model rate / instance-count frames.
      3. Optionally join per-GPU tensor / DRAM utilization frames.
      4. Normalize column names, fill NaN with 0, and add summed columns.
    """
    new_index = pd.period_range(min_date, max_date)
    # NOTE(review): reindex() returns a new frame and the result is discarded here,
    # so this loop leaves ``results`` unchanged.
    for key in ["inf_rate_net", "inf_reqs_net", "inf_req_dur_net", "inf_que_dur_net", "inf_inp_dur_net", "inf_inf_dur_net", "inf_out_dur_net"]:
        results[key].reindex(new_index, fill_value=0)

    # Use 'inf_rate_net' as the base and left-join the other net-level frames onto it.
    # rsuffix turns each joined 'value' column into 'value_<key>', which is renamed further below.
    i0 = results["inf_rate_net"]
    for key in ["num_instances", "inf_reqs_net", "inf_req_dur_net", "inf_que_dur_net",
                "inf_inp_dur_net", "inf_inf_dur_net", "inf_out_dur_net"]:
        i0 = i0.join(results[key], how="left", rsuffix="_" + key)

    # Add the model metrics, using suffixes so the columns become num_instances_X / rate_X
    # where X is the model name (the part of "model/version" before the slash)
    if add_model_stats:
        for model in track(unique_model_versions, description="Adding Model Stats"):
            model_name = model.split("/")[0]
            itemp = results["inf_rate_" + model].join(results["num_instances_" + model],
                                                      how="left",
                                                      rsuffix="_num_instances_" + model_name,
                                                      lsuffix="_rate_" + model_name,
                                                     )
            i0 = i0.join(itemp, how="left")

    # Add the GPU instance metrics, keyed by the enumeration index of the GPU instance
    if add_gpu_stats:
        for mg, gpu in track(enumerate(unique_gpu_instances), description="Adding GPU Stats"):
            tensor_key = "gpu_tensor_util_" + str(mg)
            dram_key = "gpu_dram_util_" + str(mg)
            # These fill the stored frames in place, i.e. ``results`` itself is modified
            results[tensor_key].fillna(0, inplace=True)
            results[dram_key].fillna(0, inplace=True)
            itemp = results[tensor_key].join(results[dram_key],
                                             how="left",
                                             rsuffix="_" + dram_key,
                                             lsuffix="_" + tensor_key,
                                            )
            i0 = i0.join(itemp, how="left")

    # Get rid of the "value" in column names, and fill NaN values with 0 everywhere
    i0.rename(columns={"value": "rate"}, inplace=True)
    i0.rename(columns={col: col[6:] for col in i0.columns if col.startswith("value_")}, inplace=True)
    i0.fillna(0, inplace=True)

    # Aggregate some stats for models.
    # The summed rate and the total inference rate should match, otherwise something was double-counted.
    # The summed instances may NOT match: if one model is active on 5 of 10 servers in a timestep and
    # another on 7 of 10, there are 12 summed instances in that timestep. Dividing by the net instance
    # count gives a measure of the average model concurrency per timestep (a lossy one: 6 models could
    # each serve a single request while one model serves thousands).
    valid_model_keys = [col for col in i0.columns if col.startswith("rate_") and col.replace("rate_", "num_instances_") in i0.columns]
    i0["summed_rate"] = sum([i0[col] for col in valid_model_keys])
    i0["summed_instances"] = sum([i0[col.replace("rate_", "num_instances_")] for col in valid_model_keys])

    # Aggregate some stats for GPU instances
    valid_gpu_keys = [col for col in i0.columns if col.startswith("gpu_tensor_util") and col.replace("tensor", "dram") in i0.columns]
    i0["summed_gpu_tensor_util"] = sum([i0[col] for col in valid_gpu_keys])
    # Sum the DRAM partner of every tensor column; this must iterate the GPU keys,
    # not the model keys, or it would add up model-rate columns instead.
    i0["summed_gpu_dram_util"] = sum([i0[col.replace("tensor", "dram")] for col in valid_gpu_keys])
 

In [None]:
start = time.time()
test2 = single_query_splt([     
     ("2023-02-17 at 00:00:00 MDT", "2023-02-20 at 00:00:00 MDT"),
     ("2023-02-20 at 00:00:00 MDT", "2023-02-23 at 00:00:00 MDT"),
     ("2023-02-23 at 00:00:00 MDT", "2023-02-26 at 00:00:00 MDT"),
     ("2023-02-26 at 00:00:00 MDT", "2023-03-01 at 00:00:00 MDT"),
     ("2023-03-01 at 00:00:00 MDT", "2023-03-04 at 00:00:00 MDT"),
     ("2023-03-04 at 00:00:00 MDT", "2023-03-07 at 00:00:00 MDT"),
     ("2023-03-07 at 00:00:00 MDT", "2023-03-10 at 00:00:00 MDT"),
     ("2023-03-10 at 00:00:00 MDT", "2023-03-13 at 00:00:00 MDT"),
     ("2023-03-13 at 00:00:00 MDT", "2023-03-16 at 00:00:00 MDT"),
     ("2023-03-16 at 00:00:00 MDT", "2023-03-19 at 00:00:00 MDT"),
     ("2023-03-19 at 00:00:00 MDT", "2023-03-22 at 00:00:00 MDT"),
     ("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
     ("2023-03-31 at 00:00:00 MDT", "2023-04-03 at 00:00:00 MDT"),
     ("2023-04-03 at 00:00:00 MDT", "2023-04-06 at 00:00:00 MDT"),  
     ("2023-04-06 at 00:00:00 MDT", "2023-04-09 at 00:00:00 MDT"),
     ("2023-04-09 at 00:00:00 MDT", "2023-04-12 at 00:00:00 MDT"),
     ("2023-04-12 at 00:00:00 MDT", "2023-04-15 at 00:00:00 MDT"),
     ("2023-04-15 at 00:00:00 MDT", "2023-04-18 at 00:00:00 MDT"),
    ], 
    "sum by(model, version) (rate(nv_inference_count[120s]))",
    step="120s",
    namespace=None,
    dataframe_mode="individual",
    deduplicate=False
)
print(time.time() - start)

In [None]:
test2[0][0]

In [None]:
for x in test2[0]:
    print(type(x))

In [None]:
start = time.time()
test4 = run_single_query([     
     ("2023-02-17 at 00:00:00 MDT", "2023-02-20 at 00:00:00 MDT"),
     ("2023-02-20 at 00:00:00 MDT", "2023-02-23 at 00:00:00 MDT"),
     ("2023-02-23 at 00:00:00 MDT", "2023-02-26 at 00:00:00 MDT"),
     ("2023-02-26 at 00:00:00 MDT", "2023-03-01 at 00:00:00 MDT"),
     ("2023-03-01 at 00:00:00 MDT", "2023-03-04 at 00:00:00 MDT"),
     ("2023-03-04 at 00:00:00 MDT", "2023-03-07 at 00:00:00 MDT"),
     ("2023-03-07 at 00:00:00 MDT", "2023-03-10 at 00:00:00 MDT"),
     ("2023-03-10 at 00:00:00 MDT", "2023-03-13 at 00:00:00 MDT"),
     ("2023-03-13 at 00:00:00 MDT", "2023-03-16 at 00:00:00 MDT"),
     ("2023-03-16 at 00:00:00 MDT", "2023-03-19 at 00:00:00 MDT"),
     ("2023-03-19 at 00:00:00 MDT", "2023-03-22 at 00:00:00 MDT"),
     ("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
     ("2023-03-31 at 00:00:00 MDT", "2023-04-03 at 00:00:00 MDT"),
     ("2023-04-03 at 00:00:00 MDT", "2023-04-06 at 00:00:00 MDT"),
     ("2023-04-06 at 00:00:00 MDT", "2023-04-09 at 00:00:00 MDT"),
     ("2023-04-09 at 00:00:00 MDT", "2023-04-12 at 00:00:00 MDT"),
     ("2023-04-12 at 00:00:00 MDT", "2023-04-15 at 00:00:00 MDT"),
     ("2023-04-15 at 00:00:00 MDT", "2023-04-18 at 00:00:00 MDT"),
    ], 
    "sum by(model, version) (rate(nv_inference_count[120s]))",
    step="120s",
    namespace=None,
    deduplicate=False
)
print(time.time() - start)

In [None]:
for x in test4[0]:
    print(type(x))

In [None]:
# Count every individual series across all time slices and their result groups
subresults = sum(1 for time_slice in test for group in time_slice for _ in group)
print(subresults)

In [None]:
# Merge the per-window series into one entry per unique series hash:
# the first occurrence is stored as-is, later ones are combined with prom_query_add.
new_results = {}
for window_result in test:
    print("========")
    for series_group in window_result:
        print(type(series_group))
        for series in series_group:
            series_key = prom_query_hash(series)
            if series_key in new_results:
                new_results[series_key] = prom_query_add(new_results[series_key], series)
            else:
                new_results[series_key] = series

In [None]:
for k in new_results.keys():
    tmp = k.split("($)")
    if len(tmp) != 4:
        print(k, len(tmp))

In [None]:
for x in new_results['($)model::deeptau_ensemble($)version::1']['values']:
    if x[1] != '0':
        print(x)

In [None]:
# Count how many merged series share the same first and third "($)"-separated key fields
count_models = {}
for series_key in new_results:
    fields = series_key.split("($)")
    reduced_key = fields[0] + fields[2]
    count_models[reduced_key] = count_models.get(reduced_key, 0) + 1
print(count_models.values())
print(list(count_models)[:5])

In [None]:
type(test[0]), len(test[0])

In [None]:
for x in test[0][0]:
    #print(type(x), x.keys(), x['metric'])
    print(type(x), x.keys(), type(x['values']), x['values'][0] )

In [None]:
results_v2, errors_v2, queries_v2, unique_model_versions_v2, unique_gpu_instances_v2 = get_all_queries_v2(
    [("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
     ("2023-03-31 at 00:00:00 MDT", "2023-04-02 at 00:00:00 MDT"),
    ],
    namespace=None,
    step="120s")

In [None]:
len(results_v2['inf_rate_net'].index), len(set(results_v2['inf_rate_net'].index))

In [None]:
results_v2, errors_v2, queries_v2, unique_model_versions_v2, unique_gpu_instances_v2 = get_all_queries_v2(
    [#("2023-01-30 at 00:00:00 MDT", "2023-02-02 at 00:00:00 MDT"),
     #("2023-02-02 at 00:00:00 MDT", "2023-02-05 at 00:00:00 MDT"),
     #("2023-02-05 at 00:00:00 MDT", "2023-02-08 at 00:00:00 MDT"),
     #("2023-02-08 at 00:00:00 MDT", "2023-02-11 at 00:00:00 MDT"),
     #("2023-02-11 at 00:00:00 MDT", "2023-02-14 at 00:00:00 MDT"),
     #("2023-02-14 at 00:00:00 MDT", "2023-02-17 at 00:00:00 MDT"),
        
     ("2023-02-17 at 00:00:00 MDT", "2023-02-20 at 00:00:00 MDT"),
     ("2023-02-20 at 00:00:00 MDT", "2023-02-23 at 00:00:00 MDT"),
     ("2023-02-23 at 00:00:00 MDT", "2023-02-26 at 00:00:00 MDT"),
     ("2023-02-26 at 00:00:00 MDT", "2023-03-01 at 00:00:00 MDT"),
     ("2023-03-01 at 00:00:00 MDT", "2023-03-04 at 00:00:00 MDT"),
     ("2023-03-04 at 00:00:00 MDT", "2023-03-07 at 00:00:00 MDT"),
     ("2023-03-07 at 00:00:00 MDT", "2023-03-10 at 00:00:00 MDT"),
     ("2023-03-10 at 00:00:00 MDT", "2023-03-13 at 00:00:00 MDT"),
     ("2023-03-13 at 00:00:00 MDT", "2023-03-16 at 00:00:00 MDT"),
     ("2023-03-16 at 00:00:00 MDT", "2023-03-19 at 00:00:00 MDT"),
     ("2023-03-19 at 00:00:00 MDT", "2023-03-22 at 00:00:00 MDT"),
     ("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
     ("2023-03-31 at 00:00:00 MDT", "2023-04-03 at 00:00:00 MDT"),
     ("2023-04-03 at 00:00:00 MDT", "2023-04-06 at 00:00:00 MDT"),
        
     ("2023-04-06 at 00:00:00 MDT", "2023-04-09 at 00:00:00 MDT"),
     ("2023-04-09 at 00:00:00 MDT", "2023-04-12 at 00:00:00 MDT"),
     ("2023-04-12 at 00:00:00 MDT", "2023-04-15 at 00:00:00 MDT"),
     ("2023-04-15 at 00:00:00 MDT", "2023-04-18 at 00:00:00 MDT"),
    ],
    namespace=None,
    step="120s")

In [None]:
test = run_single_query(
    [     
        ("2023-02-17 at 00:00:00 MDT", "2023-02-20 at 00:00:00 MDT"),
     ("2023-02-20 at 00:00:00 MDT", "2023-02-23 at 00:00:00 MDT"),
     ("2023-02-23 at 00:00:00 MDT", "2023-02-26 at 00:00:00 MDT"),
     ("2023-02-26 at 00:00:00 MDT", "2023-03-01 at 00:00:00 MDT"),
     ("2023-03-01 at 00:00:00 MDT", "2023-03-04 at 00:00:00 MDT"),
     ("2023-03-04 at 00:00:00 MDT", "2023-03-07 at 00:00:00 MDT"),
     ("2023-03-07 at 00:00:00 MDT", "2023-03-10 at 00:00:00 MDT"),
     ("2023-03-10 at 00:00:00 MDT", "2023-03-13 at 00:00:00 MDT"),
     ("2023-03-13 at 00:00:00 MDT", "2023-03-16 at 00:00:00 MDT"),
     ("2023-03-16 at 00:00:00 MDT", "2023-03-19 at 00:00:00 MDT"),
     ("2023-03-19 at 00:00:00 MDT", "2023-03-22 at 00:00:00 MDT"),
     ("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
     ("2023-03-31 at 00:00:00 MDT", "2023-04-03 at 00:00:00 MDT"),
     ("2023-04-03 at 00:00:00 MDT", "2023-04-06 at 00:00:00 MDT"),
        
     ("2023-04-06 at 00:00:00 MDT", "2023-04-09 at 00:00:00 MDT"),
     ("2023-04-09 at 00:00:00 MDT", "2023-04-12 at 00:00:00 MDT"),
     ("2023-04-12 at 00:00:00 MDT", "2023-04-15 at 00:00:00 MDT"),
     ("2023-04-15 at 00:00:00 MDT", "2023-04-18 at 00:00:00 MDT"),
    ], 
    "sum by(model, version, pod) (rate(nv_inference_count[120s]))",
    step="120s",
    namespace=None,
    deduplicate=False
)

In [None]:
len(test[0])

In [None]:
len(results_v2["num_instances"])

In [None]:
results_v2.keys()

In [None]:
results_v2['inf_rate'][-5:]

In [None]:
for k, v in results_v2.items():
    if not isinstance(v, pd.core.frame.DataFrame):
        print(k, type(v))

In [None]:
unique_model_versions_v2

In [None]:
# A function that runs queries for many GPU and Triton server metrics. The input is a list of (start, end) timestamp tuples,
# each of which must be parseable by the prometheus_api_client.utils.parse_datetime function. It understands timestamps formatted like
# "2023-03-30 at 16:00:00 MDT"
# The step is the time window over which each query is aggregated. It should be ~4x as long as the longest metric collection interval
def _get_all_queries_run_windows(query, timestamp_tuples, step):
    """Run ``query`` over every (start, end) window and concatenate the non-empty results.

    Uses the notebook-level ``prom`` connection. Returns a single dataframe
    concatenated along the time (index) axis, or ``None`` when every window
    came back empty.
    """
    frames = []
    for st, et in timestamp_tuples:
        raw = prom.custom_query_range(
            query=query,
            start_time=parse_datetime(st),
            end_time=parse_datetime(et),
            step=step
        )
        # A window can legitimately be empty (e.g. a model that was only served
        # during part of the total range); skip it rather than building a frame.
        if len(raw) > 0:
            frames.append(MetricRangeDataFrame(raw))
    if not frames:
        return None
    return pd.concat(frames, axis=0)


def get_all_queries(timestamp_tuples, step, namespace='triton'):
    """Query many Triton server and GPU metrics and return them as dataframes.

    Three groups of queries are run, each over every window in
    ``timestamp_tuples``, with the per-window results concatenated in time:

    1. A fixed set of base queries (inference rates, request counts, durations,
       GPU utilization grouped by device).
    2. Per-model queries (``num_instances_<model/version>`` and
       ``inf_rate_<model/version>``) for every model/version pair found in the
       first base result that carries ``model`` and ``version`` labels.
    3. Per-GPU-instance queries (``gpu_tensor_util_<i>`` and
       ``gpu_dram_util_<i>``) for every device/GPU_I_ID/instance triple found
       in the first base result that carries a ``GPU_I_ID`` label. ``<i>`` is
       the enumeration index of the instance within the returned set.

    Parameters
    ----------
    timestamp_tuples : list of (start, end)
        Timestamps parseable by ``parse_datetime``.
    step : str
        Aggregation window and query step, e.g. ``"120s"``.
    namespace : str or other
        When a string, the Triton metrics are filtered by
        ``namespace='<namespace>'``; any non-string value disables the filter.
        The GPU queries always use the hard-coded ``triton`` selector.

    Returns
    -------
    (results, queries, unique_model_versions, unique_gpu_instances)
        ``results`` maps key -> dataframe and omits keys whose query returned
        no data in any window. ``queries`` lists every (key, query) pair that
        was issued. The last two items are sets (empty when nothing was found).
        A model/version is dropped from its set if either of its two queries
        returned no data.
    """
    # FIXME: Refactor further into model-specific and GPU-specific query functions
    rs = ""
    rsm = ""
    if isinstance(namespace, str):
        rs = "{namespace='"+namespace+"'}"
        rsm = ",namespace='"+namespace+"'"
    # A dictionary for our results
    results = {}
    # Tuples of the queries we'll make, for debugging and info
    queries = []

    # Some queries are best created after learning which unique model/version pairs ran in the
    # Triton servers and which GPU instances were active. These are discovered from the base queries.
    unique_model_versions = None
    unique_gpu_instances = None

    # Basic queries. Some of them double as proxies for discovering the model/version pairs
    # ("inf_rate") and the GPU instances ("gpu_tensor_util") used for the later queries.
    base_queries = {
        "num_instances": "count((sum by(pod) (delta(nv_inference_request_success"+rs+"["+step+"]))) > 0)",
        "inf_rate_net":"sum (rate(nv_inference_count"+rs+"["+step+"]))",
        "inf_rate_bypod":"sum by(pod) (rate(nv_inference_count"+rs+"["+step+"]))",
        "inf_rate":"sum by(model, version, pod) (rate(nv_inference_count"+rs+"["+step+"]))",
        "inf_cache_hit_rate":"sum by(model, version, pod) (rate(nv_cache_num_hits_per_model"+rs+"["+step+"]))",
        "inf_reqs_net":"sum(rate(nv_inference_request_success"+rs+"["+step+"]))",
        "inf_reqs_bypod":"sum by(pod) (rate(nv_inference_request_success"+rs+"["+step+"]))",
        "inf_reqs":"sum by(model, version, pod) (rate(nv_inference_request_success"+rs+"["+step+"]))",
        "inf_req_dur_net": "avg (delta(nv_inference_request_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_que_dur_net": "avg (delta(nv_inference_queue_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_inp_dur_net": "avg (delta(nv_inference_compute_input_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_inf_dur_net": "avg (delta(nv_inference_compute_infer_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_out_dur_net": "avg (delta(nv_inference_compute_output_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_req_dur": "avg by(model, version, pod) (delta(nv_inference_request_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_que_dur": "avg by(model, version, pod) (delta(nv_inference_queue_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_inp_dur": "avg by(model, version, pod) (delta(nv_inference_compute_input_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_inf_dur": "avg by(model, version, pod) (delta(nv_inference_compute_infer_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "inf_out_dur": "avg by(model, version, pod) (delta(nv_inference_compute_output_duration_us"+rs+"["+step+"])/(0.001+delta(nv_inference_request_success"+rs+"["+step+"])))",
        "gpu_tensor_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'}["+step+"]))",
        "gpu_dram_util": "sum by(device,GPU_I_ID,instance) (avg_over_time (DCGM_FI_PROF_DRAM_ACTIVE{exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0'}["+step+"]))",
        #"inf_cache_hits": "avg by(model, version, pod) (delta(nv_cache_num_hits_per_model["+step+"])/(1+1000000*delta(nv_inference_request_success["+step+"])))",
    }
    for key, query in track(base_queries.items(), description="Processing Queries..."):
        # Log the queries, as they're easier to parse after being resolved fully
        queries.append((key, query))
        df = _get_all_queries_run_windows(query, timestamp_tuples, step)
        if df is None:
            # No data in any window: leave the key out instead of failing on an empty result
            continue
        results[key] = df

        # The first result carrying model/version labels defines the model/version pairs
        if unique_model_versions is None and "model" in df.columns and "version" in df.columns:
            unique_model_versions = set((df["model"]+"/"+df["version"]).dropna().values)
        # At the EAF, the device ('nvidiaX' where X is 0...4 for example), GPU instance ID (enumeration)
        # and the instance (IP address of host machine) are sufficient to make a unique identifier
        if unique_gpu_instances is None and "GPU_I_ID" in df.columns:
            unique_gpu_instances = set((df["device"]+"/"+df["GPU_I_ID"]+"/"+df["instance"]).dropna().values)

    # If nothing was discovered, fall back to empty sets so the loops below simply do nothing
    if unique_model_versions is None:
        unique_model_versions = set()
    if unique_gpu_instances is None:
        unique_gpu_instances = set()

    # Model-specific queries: the number of unique Triton instances that served inference requests
    # for this model, as well as the inference rate of that model across all Triton instances,
    # per time step. All num_instances_* queries are issued first, then all inf_rate_* queries.
    model_list = list(unique_model_versions)
    model_jobs = [
        ("num_instances_"+model_version, model_version,
         "count((sum by(pod) (delta(nv_inference_request_success{model='"+
         model_version.split("/")[0]+"',version='"+model_version.split("/")[1]+"'"+rsm+"}["+step+"]))) > 0)")
        for model_version in model_list
    ]
    model_jobs += [
        ("inf_rate_"+model_version, model_version,
         "sum (rate(nv_inference_count{model='"+
         model_version.split("/")[0]+"',version='"+model_version.split("/")[1]+"'"+rsm+"}["+step+"]))")
        for model_version in model_list
    ]
    for key, model_version, query in model_jobs:
        queries.append((key, query))
        df = _get_all_queries_run_windows(query, timestamp_tuples, step)
        if df is None:
            # No results for this model query: leave it out of the results and drop the
            # model/version from the set so later code does not iterate over it
            unique_model_versions.discard(model_version)
        else:
            results[key] = df

    # Now we gather the GPU metrics. The two most interesting ones for us are DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
    # and DCGM_FI_PROF_DRAM_ACTIVE. The former measures how much of the compute resources (the Tensor Cores)
    # are active, on average, in a time period. A utilization of 50% could mean the tensor cores of this GPU
    # (slice) are 100% active for 50% of the time, 50% active for 100% of the time, or any combination of
    # activity_percent * time_active_percent that gives that product.
    #
    # The extra labels restrict each query to one device / GPU instance / host. For every timestep the
    # range selector collects the samples within ``step``, avg_over_time averages them per series, and the
    # outer aggregation (sum for tensor, avg for DRAM) combines the series that still differ in other labels.
    gpu_jobs = []
    for key_prefix, aggregate, metric in (
        ("gpu_tensor_util_", "sum", "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"),
        ("gpu_dram_util_", "avg", "DCGM_FI_PROF_DRAM_ACTIVE"),
    ):
        for mg, gpu_inst in enumerate(unique_gpu_instances):
            parts = gpu_inst.split("/")
            gpu_jobs.append((
                key_prefix+str(mg),
                aggregate+" (avg_over_time("+metric+"{"+
                "exported_container='triton',exported_namespace='triton',prometheus_replica='prometheus-k8s-0',"+
                "device='"+parts[0]+"',GPU_I_ID='"+parts[1]+"',instance='"+parts[2]+"'}["+step+"]))",
            ))
    for key, query in gpu_jobs:
        queries.append((key, query))
        df = _get_all_queries_run_windows(query, timestamp_tuples, step)
        if df is not None:
            results[key] = df
        # NOTE: unique_gpu_instances is deliberately left untouched when a query is empty.
        # The keys are built from enumeration indices, so removing an instance would shift
        # the indices of the remaining ones away from their gpu_*_util_<i> keys.
    return results, queries, unique_model_versions, unique_gpu_instances

In addition to the query dataframes, the function returns a list of the key:query pairs, the set of active model/version pairs, and the set of active GPU MIG slices.

In [None]:
#results, queries, unique_model_versions, unique_gpu_instances = get_all_queries(
#    [#("2023-01-30 at 00:00:00 MDT", "2023-02-02 at 00:00:00 MDT"),
     #("2023-02-02 at 00:00:00 MDT", "2023-02-05 at 00:00:00 MDT"),
     #("2023-02-05 at 00:00:00 MDT", "2023-02-08 at 00:00:00 MDT"),
     #("2023-02-08 at 00:00:00 MDT", "2023-02-11 at 00:00:00 MDT"),
     #("2023-02-11 at 00:00:00 MDT", "2023-02-14 at 00:00:00 MDT"),
     #("2023-02-14 at 00:00:00 MDT", "2023-02-17 at 00:00:00 MDT"),
#     ("2023-02-17 at 00:00:00 MDT", "2023-02-20 at 00:00:00 MDT"),
#     ("2023-02-20 at 00:00:00 MDT", "2023-02-23 at 00:00:00 MDT"),
#     ("2023-02-23 at 00:00:00 MDT", "2023-02-26 at 00:00:00 MDT"),
#     ("2023-02-26 at 00:00:00 MDT", "2023-03-01 at 00:00:00 MDT"),
#     ("2023-03-01 at 00:00:00 MDT", "2023-03-04 at 00:00:00 MDT"),
#     ("2023-03-04 at 00:00:00 MDT", "2023-03-07 at 00:00:00 MDT"),
#     ("2023-03-07 at 00:00:00 MDT", "2023-03-10 at 00:00:00 MDT"),
#     ("2023-03-10 at 00:00:00 MDT", "2023-03-13 at 00:00:00 MDT"),
#     ("2023-03-13 at 00:00:00 MDT", "2023-03-16 at 00:00:00 MDT"),
#     ("2023-03-16 at 00:00:00 MDT", "2023-03-19 at 00:00:00 MDT"),
#     ("2023-03-19 at 00:00:00 MDT", "2023-03-22 at 00:00:00 MDT"),
#     ("2023-03-22 at 00:00:00 MDT", "2023-03-25 at 00:00:00 MDT"),
#     ("2023-03-25 at 00:00:00 MDT", "2023-03-28 at 00:00:00 MDT"),
#     ("2023-03-28 at 00:00:00 MDT", "2023-03-31 at 00:00:00 MDT"),
#     ("2023-03-31 at 00:00:00 MDT", "2023-04-03 at 00:00:00 MDT"),
#     ("2023-04-03 at 00:00:00 MDT", "2023-04-05 at 12:30:00 MDT"),
#    ],
#    namespace=None,
#    step="60s")

In [None]:
for k, v in results_v2.items():
    print(k, len(v), len(set(v.index)))

## Converting to a unified dataframe
This function takes the subset of results that can be concatenated into new columns,
including the inference rate (with breakdowns by model), the timing of the inference request (overall request time, as well as its breakdown into queue time, input time, compute time, and output time), and the GPU DRAM and tensor utilization. See the NVIDIA documentation for more information on how these quantities are calculated and stored.

In [None]:
results_v2['inf_rate_net'][-15:], results_v2['num_instances'][-15:]

In [None]:
# Work on an independent deep copy so the experiments in the following cells
# operate on `results` rather than on results_v2 itself.
# `copy` is already imported in the notebook's first cell; the re-import is redundant but harmless.
import copy
results = copy.deepcopy(results_v2)

In [None]:
# Build the union of every frame's index labels.
# The union is sorted before being stored: a plain list(set) has arbitrary
# order, which would make any frame reindexed onto it non-chronological.
# Starting from an empty set also keeps this cell working when `results` is empty.
uniform_index = set()
for k, v in results.items():
    uniform_index.update(v.index.values)
uniform_index = sorted(uniform_index)
print(len(uniform_index))

In [None]:
# Number of labels in the combined index.
len(uniform_index)

In [None]:
# Trial run: put one GPU frame onto the combined index; labels the frame does not have are filled with 0.
test = results['gpu_dram_util_10'].reindex(uniform_index, fill_value=0)

In [None]:
# Show the reindexed rows at the labels the original frame had.
# The frame's index holds labels, not integer positions, so the lookup must be
# label-based (.loc); .iloc only accepts positional integers.
test.loc[results['gpu_dram_util_10'].index]

In [None]:
# Row count of the reindexed trial frame.
len(test)

In [None]:

# Experiment: reindex every result frame onto a period range spanning min_date..max_date.
# NOTE(review): min_date and max_date are not assigned at notebook level in any cell
# visible before this one — confirm they exist when this cell runs.
new_index = pd.period_range(min_date, max_date)
for key in results.keys():
    print(key)
    old_len = len(results[key])
    #print(len(new_index), old_len)
    # NOTE(review): reindex returns a new object and the result is not stored, so
    # results[key] is unchanged and the two lengths printed below are always equal.
    results[key].reindex(new_index, fill_value=0)
    new_len = len(results[key])
    print(old_len, new_len)

In [None]:
# List the query names available in the working copy.
print(results.keys())

In [None]:
# Scratch inspection: take the frame for whatever `key` an earlier loop left behind
# and open the IPython help (`?`) for its reindex method.
# NOTE(review): this rebinds `xxx`, which the metrics-listing cell near the top of the notebook also uses.
xxx = results[key]
xxx.reindex?

In [None]:
# Number of distinct index labels in the net inference-rate frame.
len(set(results["inf_rate_net"].index))

In [None]:
def convert_results_to_df(results, unique_model_versions=None, unique_gpu_instances=None, add_model_stats=True, add_gpu_stats=False):
    """Merge the compatible per-query result frames into one wide dataframe.

    The 'inf_rate_net' frame, which should have a valid value for every
    timestamp, is the base. Every other frame is left-joined onto it, so rows
    missing from a joined frame show up as NaN and are replaced by 0 at the end.

    Parameters
    ----------
    results : dict
        Maps query names to dataframes whose data column is expected to be
        named 'value'. Required keys: 'inf_rate_net', 'num_instances',
        'inf_reqs_net', 'inf_req_dur_net', 'inf_que_dur_net',
        'inf_inp_dur_net', 'inf_inf_dur_net', 'inf_out_dur_net'.
        With add_model_stats: 'inf_rate_<model>' and 'num_instances_<model>'
        for every entry of unique_model_versions. With add_gpu_stats:
        'gpu_tensor_util_<i>' and 'gpu_dram_util_<i>' for every position i in
        unique_gpu_instances. The input frames are not modified.
    unique_model_versions : iterable of str, optional
        Model identifiers; the part before the first '/' is used in the column
        names. None is treated as "no models".
    unique_gpu_instances : iterable, optional
        GPU instances; only their position in the iteration is used to pick
        the result keys. None is treated as "no GPU instances".
    add_model_stats : bool
        Add per-model 'rate_<name>' and 'num_instances_<name>' columns.
    add_gpu_stats : bool
        Add per-GPU 'gpu_tensor_util_<i>' and 'gpu_dram_util_<i>' columns.

    Returns
    -------
    pandas.DataFrame
        Base columns 'rate', 'num_instances', 'inf_reqs_net' and the five
        duration columns, the optional per-model / per-GPU columns, and the
        aggregates 'summed_rate', 'summed_instances',
        'summed_gpu_tensor_util' and 'summed_gpu_dram_util'.
    """
    # An earlier version built a pd.period_range over all timestamps and called
    # reindex on the net frames without storing the result. That was a no-op and
    # has been dropped; storing it would have replaced timestamp-indexed data by
    # the fill value, because a period index shares no labels with it.

    # rsuffix turns the clashing 'value' column of each joined frame into
    # 'value_<key>', which is stripped down to '<key>' further below.
    i0 = results["inf_rate_net"].join(results["num_instances"],
                                      how="left",
                                      rsuffix="_num_instances",
                                     )
    for key in ("inf_reqs_net", "inf_req_dur_net", "inf_que_dur_net",
                "inf_inp_dur_net", "inf_inf_dur_net", "inf_out_dur_net"):
        i0 = i0.join(results[key], how="left", rsuffix="_" + key)

    # Per-model metrics: columns become rate_X / num_instances_X, X being the model name.
    if add_model_stats and unique_model_versions is not None:
        for model in track(unique_model_versions, description="Adding Model Stats"):
            model_name = model.split("/")[0]
            itemp = results["inf_rate_" + model].join(results["num_instances_" + model],
                                                      how="left",
                                                      rsuffix="_num_instances_" + model_name,
                                                      lsuffix="_rate_" + model_name,
                                                     )
            i0 = i0.join(itemp, how="left")

    # Per-GPU-instance metrics, keyed by the position of the instance.
    if add_gpu_stats and unique_gpu_instances is not None:
        for mg, _gpu in track(enumerate(unique_gpu_instances), description="Adding GPU Stats"):
            # fillna returns new frames here, so the caller's `results` stays untouched.
            tensor_util = results["gpu_tensor_util_" + str(mg)].fillna(0)
            dram_util = results["gpu_dram_util_" + str(mg)].fillna(0)
            itemp = tensor_util.join(dram_util,
                                     how="left",
                                     rsuffix="_gpu_dram_util_" + str(mg),
                                     lsuffix="_gpu_tensor_util_" + str(mg),
                                    )
            i0 = i0.join(itemp, how="left")

    # Get rid of the "value" in column names, and fill NaN values with 0 everywhere.
    i0 = i0.rename(columns={"value": "rate"})
    i0 = i0.rename(columns={col: col[len("value_"):] for col in i0.columns if col.startswith("value_")})
    i0 = i0.fillna(0)

    # Aggregate some stats for models.
    # The summed rate and the total inference rate should match, otherwise something was double-counted.
    # The summed instances may NOT match: if one model is active on 5 of 10 servers in a timestep and
    # another on 7 of 10, there are 12 summed instances in that timestep. Dividing by num_instances
    # gives a measure of the average model concurrency per server. This is lossy: 10 servers and 70
    # summed instances mean each server served 7 models at some point in the timestep, but six of
    # those models may have made a single request each while the seventh made thousands.
    model_pairs = [(col, "num_instances_" + col[len("rate_"):])
                   for col in i0.columns if col.startswith("rate_")]
    valid_model_keys = [(rate_col, inst_col) for rate_col, inst_col in model_pairs if inst_col in i0.columns]
    i0["summed_rate"] = sum([i0[rate_col] for rate_col, _ in valid_model_keys])
    i0["summed_instances"] = sum([i0[inst_col] for _, inst_col in valid_model_keys])

    # Aggregate some stats for GPU instances. Both sums iterate over the GPU columns;
    # the DRAM sum used to iterate over the model columns by mistake.
    gpu_pairs = [(col, "gpu_dram_util" + col[len("gpu_tensor_util"):])
                 for col in i0.columns if col.startswith("gpu_tensor_util")]
    valid_gpu_keys = [(tensor_col, dram_col) for tensor_col, dram_col in gpu_pairs if dram_col in i0.columns]
    i0["summed_gpu_tensor_util"] = sum([i0[tensor_col] for tensor_col, _ in valid_gpu_keys])
    i0["summed_gpu_dram_util"] = sum([i0[dram_col] for _, dram_col in valid_gpu_keys])
    return i0

In [None]:
# Build the unified dataframe from the v2 results: GPU statistics on, per-model statistics off.
i0 = convert_results_to_df(
    results_v2,
    unique_model_versions_v2,
    unique_gpu_instances_v2,
    add_model_stats=False,
    add_gpu_stats=True,
)
i0

In [None]:
# Column names of the unified dataframe.
i0.columns

In [None]:
# num_instances rounded to the nearest integer.
np.rint(i0.num_instances)

In [None]:
# Inference rate against total inference requests: one translucent series per
# model (only rows where that model's rate is positive) plus the net rate.
cc = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
      "tab:brown", "tab:gray", "tab:olive", "tab:cyan", "tab:pink"]
for nm, mod in enumerate(unique_model_versions):
    model = mod.split("/")[0]
    rate_col = "rate_" + model
    ii = i0[i0[rate_col] > 0]
    plt.scatter(ii["inf_reqs_net"], ii[rate_col], c=cc[nm], label=model, alpha=0.2)
plt.scatter(i0["inf_reqs_net"], i0["rate"], c=cc[-1], label="All", alpha=0.2)
plt.ylabel("Inference Rate")
plt.xlabel("Inference Requests [All]")
plt.legend()
plt.title("Rate vs Inference Requests")
plt.savefig("rate_vs_requests.pdf")

In [None]:
# Inference rate against the number of active servers: one translucent series
# per model (only rows where that model's rate is positive) plus the net rate.
cc = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
      "tab:brown", "tab:gray", "tab:olive", "tab:cyan", "tab:pink"]
for nm, mod in enumerate(unique_model_versions):
    model = mod.split("/")[0]
    rate_col = "rate_" + model
    inst_col = "num_instances_" + model
    ii = i0[i0[rate_col] > 0]
    plt.scatter(ii[inst_col], ii[rate_col], c=cc[nm], label=model, alpha=0.2)
plt.scatter(i0["num_instances"], i0["rate"], c=cc[-1], label="All", alpha=0.2)
plt.ylabel("Inference Rate")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("rate_vs_servers_scatter.pdf")

In [None]:
# Collect, for every server count from 1 to 10, the rate samples of each model
# (and of the net "All" pseudo-model), then draw one violin set per model.
cc = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
      "tab:brown", "tab:gray", "tab:olive", "tab:cyan", "tab:pink"]
iter_models = unique_model_versions.union({"All/1"})
print({entry: entry for entry in iter_models})
model_names = [entry.split("/")[0] for entry in iter_models]
data = {name: [] for name in model_names}
points = {name: [] for name in model_names}
colors = {name: [] for name in model_names}
print(data.keys())
labels = []
for act_srv in range(1, 11):
    for nm, mod in enumerate(unique_model_versions):
        model = mod.split("/")[0]
        if not model:
            continue
        # Rows where this model is active on exactly act_srv servers.
        ii = i0[(i0["rate_" + model] > 0) & (i0["num_instances_" + model] == act_srv)]
        if len(ii) > 0:
            data[model].append(ii["rate_" + model])
            colors[model].append(cc[nm])
            points[model].append(act_srv)
    model = "All"
    ii = i0[(i0["rate"] > 0) & (i0["num_instances"] == act_srv)]
    if len(ii) > 0:
        data[model].append(ii["rate"])
        colors[model].append(cc[-1])
        points[model].append(act_srv)

for mod in unique_model_versions:
    model = mod.split("/")[0]
    plt.violinplot(data[model], points[model])
#plt.scatter(i0.num_instances, i0.rate, c=cc[-1], label="All")
plt.ylabel("Inference Rate")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("rate_vs_servers_violin_models.pdf")

In [None]:
# Violin plot of the net ("All") rate samples gathered by the previous cell.
mod = "All/1"
model = mod.split("/")[0]
print(model)
#print([len(xx) for xx in data[model]])
print(points[model])
plt.violinplot(data[model], points[model])
#plt.scatter(i0.num_instances, i0.rate, c=cc[-1], label="All")
plt.ylabel("Inference Rate")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("rate_vs_servers_violin_net.pdf")

In [None]:
# For every server count, compute the mean and spread (square root of the variance)
# of each duration component as a fraction of the total request duration, plus the
# same statistics for the total request duration itself.
isort = i0.sort_values("num_instances", axis=0, ascending=True)
cc = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
      "tab:brown", "tab:gray", "tab:olive", "tab:cyan", "tab:pink"]
fraction_cols = ["inf_que_dur_net", "inf_inp_dur_net", "inf_inf_dur_net", "inf_out_dur_net"]
val = {stat_col: [] for stat_col in fraction_cols + ["inf_req_dur_net"]}
err = {stat_col: [] for stat_col in fraction_cols + ["inf_req_dur_net"]}
ninst = list(range(int(np.max(isort.num_instances))))
for ats in ninst:
    icut = isort[isort.num_instances == ats]
    total_dur = icut.inf_req_dur_net
    for dur_col in fraction_cols:
        dur_frac = icut[dur_col]/total_dur
        val[dur_col].append(np.mean(dur_frac))
        err[dur_col].append(np.sqrt(np.var(dur_frac)))
    val["inf_req_dur_net"].append(np.mean(total_dur))
    err["inf_req_dur_net"].append(np.sqrt(np.var(total_dur)))
# Arrays make the +/- band arithmetic below element-wise.
for k in val:
    val[k] = np.array(val[k])
for k in err:
    err[k] = np.array(err[k])

#for nm, mod in enumerate(unique_model_versions):
#    model = mod.split("/")[0]
#    ii = isort[getattr(isort, "rate_"+model) > 0]
#    plt.plot(getattr(isort, "num_instances_"+model), getattr(isort, "rate_"+model), c=cc[nm], label=model)
#plt.plot(isort.num_instances, isort.inf_req_dur_net, c=cc[0], label="Req. Dur.")
# One line plus shaded band per component; colours cc[1]..cc[4] in component order.
fraction_labels = ["Que. Dur.", "Inp. Dur.", "Inf. Dur.", "Out. Dur."]
for color_idx, (dur_col, dur_label) in enumerate(zip(fraction_cols, fraction_labels), start=1):
    plt.plot(ninst, val[dur_col], c=cc[color_idx], label=dur_label)
    plt.fill_between(ninst, val[dur_col]-err[dur_col], val[dur_col]+err[dur_col], color=cc[color_idx], alpha=0.3)
plt.ylabel("Fraction of Request Duration")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("duration_ratio_vs_servers.pdf")

In [None]:
# Mean total request duration per server count, with a band of +/- the spread
# computed in the previous cell.
plt.plot(ninst, val["inf_req_dur_net"], c=cc[5], label="Req. Dur.")
plt.fill_between(ninst, val["inf_req_dur_net"]-err["inf_req_dur_net"], val["inf_req_dur_net"]+err["inf_req_dur_net"], color=cc[5], alpha=0.3)
# Raw string: "\m" is not a valid escape sequence in a normal string literal and
# triggers a warning on current Python versions. The label text itself is unchanged.
plt.ylabel(r"Request Duration [$\mu s$]")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("request_duration_vs_servers.pdf")

In [None]:
# Rows with a positive net rate at the server count held in `act_srv`.
# NOTE(review): act_srv is not set in this cell; it is whatever value an earlier
# `for act_srv in range(1, 11)` loop left behind (10 once such a loop has finished).
i0[(getattr(i0, "rate") > 0) & (getattr(i0, "num_instances") == act_srv)]

In [None]:
# Request duration as function of the number of active servers, one violin set
# per duration type (req = whole request, que = queue, inp = input,
# inf = inference/compute, out = output).
#
# NOTE(review): the previous version of this cell was a copy of the per-model rate
# cell. It read `model` before assigning it, never used `dur_type`, plotted rates
# rather than durations, and saved over "rate_vs_servers_violin_models.pdf".
# It now plots the inf_<type>_dur_net columns and writes its own file; confirm this
# matches the intended figure.
cc = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple",
      "tab:brown", "tab:gray", "tab:olive", "tab:cyan", "tab:pink"]
dur_types = ["req", "que", "inp", "inf", "out"]
data   = {dur_type: [] for dur_type in dur_types}
points = {dur_type: [] for dur_type in dur_types}
colors = {dur_type: [] for dur_type in dur_types}
for act_srv in range(1, 11):
    ii = i0[(i0["rate"] > 0) & (i0["num_instances"] == act_srv)]
    if len(ii) == 0:
        # No active rows at this server count: nothing to plot for any duration type.
        continue
    for nm, dur_type in enumerate(dur_types):
        data[dur_type].append(ii["inf_" + dur_type + "_dur_net"])
        colors[dur_type].append(cc[nm])
        points[dur_type].append(act_srv)

for nm, dur_type in enumerate(dur_types):
    if not data[dur_type]:
        # violinplot cannot draw an empty dataset list.
        continue
    violin_parts = plt.violinplot(data[dur_type], points[dur_type])
    for body in violin_parts["bodies"]:
        body.set_facecolor(cc[nm])
    # Label one body per duration type so plt.legend() has something to show.
    violin_parts["bodies"][0].set_label(dur_type)
plt.ylabel(r"Duration [$\mu s$]")
plt.xlabel("Active Triton Servers")
plt.legend()
plt.savefig("durations_vs_servers_violin.pdf")

In [None]:
# Distribution of the active-server count, restricted to rows with more than one
# server (is the distribution bimodal?). The commented-out lines are the matching
# plot for rows with fewer than two servers.
#violin_parts_1 = plt.violinplot(i0.num_instances[i0.num_instances < 2])
#for pc in violin_parts_1['bodies']:
#    pc.set_facecolor('red')
#    pc.set_edgecolor('blue')
violin_parts_2 = plt.violinplot(i0.num_instances[i0.num_instances > 1])
for pc in violin_parts_2['bodies']:
    pc.set(facecolor='green', edgecolor='blue')
plt.title("Active Triton Server Distribution")
plt.xlabel("Relative Frequency")
plt.ylabel("Number of Active Triton Servers")
plt.savefig("active_servers_except_one.pdf")

In [None]:
# Histogram a selection of columns over the rows with a positive net rate:
# ten regular bins between each column's minimum and maximum, log-scaled counts,
# one PDF per column.
from IPython.display import display, clear_output
hist_columns = [
    "rate", "num_instances", "inf_reqs_net",
    "inf_req_dur_net", "inf_que_dur_net", "inf_inp_dur_net", "inf_inf_dur_net", "inf_out_dur_net",
    "summed_rate", "summed_instances", "summed_gpu_tensor_util", "summed_gpu_dram_util",
]
hists = {}
# The row selection does not depend on the column, so it is done once up front.
ii = i0[i0.rate > 0]
for col in hist_columns:
    print(col)
    col_values = ii[col].values
    x1, x2 = np.min(col_values), np.max(col_values)
    figure, ax = plt.subplots(figsize=(4,5))
    hists[col] = hist.Hist(hist.axis.Regular(10, x1, x2, name=col, label=col),
                           hist.storage.Double())
    hists[col].fill(col_values)
    hists[col].plot(ax=ax)
    ax.set_ylabel("Frequency [120s subsampling]")
    ax.set_yscale("log")
    #display(figure)
    figure.savefig(col+"_hist.pdf")
    #clear_output(wait=True); plt.pause(0.1)


In [None]:
# One panel per server count: per-server inference rate against per-server
# inference requests.
cc = ["tab:blue", "tab:green", "tab:red", "tab:orange", "tab:purple", "tab:gray", "tab:pink"]
ninst = [x for x in range(1, int(np.max(isort.num_instances)))]
fig, ax = plt.subplots(
    nrows=1,
    ncols=len(ninst),
    sharex=False,
    sharey=True,
    squeeze=True,
    #width_ratios=[1 for x in ninst],
    height_ratios=None,
    #subplot_kw=None,
    gridspec_kw={"width_ratios":[1 for x in ninst],
                 "wspace": 0,
                },
    figsize=(30, 15)
    #**fig_kw,
)
for nax, ats in enumerate(ninst):
    # num_instances is compared with a tolerance rather than exact equality.
    icut = isort[np.isclose(isort.num_instances,ats)]
    # Both axes are normalised by the server count. The x values used to be the raw
    # request count, which contradicted the "per Triton Server" axis label, the
    # "avgreqsnet" file name and the companion durations figure.
    ax[nax].scatter(icut.inf_reqs_net/ats, icut.rate/ats, color = "tab:blue", alpha=0.2, label="All")
    #imodel = isort[np.isclose(
    if nax == 0:
        ax[nax].set_ylabel(r"$\frac{<Inference Rate>}{Triton Server}$")
        ax[nax].set_xlim(0, 50)
    if ats == ninst[-1]:
        ax[nax].set_xlabel(r"$\frac{Inf. Req.}{Triton Server}$")
    ax[nax].legend()
fig.savefig("rates_vs_avgreqsnet_byinstances.pdf")

In [None]:
# One panel per server count: request, queue and inference durations against the
# number of inference requests per server.
cc = ["tab:blue", "tab:green", "tab:red", "tab:orange", "tab:purple", "tab:gray", "tab:pink"]
ninst = list(range(1, int(np.max(isort.num_instances))))
fig, ax = plt.subplots(
    nrows=1,
    ncols=len(ninst),
    sharex=False,
    sharey=True,
    squeeze=True,
    height_ratios=None,
    gridspec_kw={"width_ratios": [1] * len(ninst), "wspace": 0},
    figsize=(24, 8),
)
# (column, colour, opacity, legend label) for each series drawn in a panel.
duration_series = [
    ("inf_req_dur_net", "tab:blue", 0.2, "Req."),
    ("inf_que_dur_net", "tab:red", 0.05, "Que."),
    ("inf_inf_dur_net", "tab:green", 0.2, "Inf."),
]
for nax, ats in enumerate(ninst):
    icut = isort[np.isclose(isort.num_instances,ats)]
    reqs_per_server = icut.inf_reqs_net/ats
    for dur_col, dur_color, dur_alpha, dur_label in duration_series:
        ax[nax].scatter(reqs_per_server, icut[dur_col], color=dur_color, alpha=dur_alpha, label=dur_label)
    #imodel = isort[np.isclose(
    if nax == 0:
        ax[nax].set_ylabel(r"$\frac{<Inf. Req. Dur.>}{Triton Server}$")
        # Keep only the lower 30% of the automatic y-range.
        ax[nax].set_ylim(0, 0.3*ax[nax].get_ylim()[1])
        ax[nax].set_xlim(0, 50)
    if ats == ninst[-1]:
        ax[nax].set_xlabel(r"$\frac{Inf. Req.}{Triton Server}$")
        ax[nax].legend()
fig.savefig("durations_vs_avgreqsnet_byinstances.pdf")

In [None]:
# Net inference rate against the summed GPU tensor utilisation.
plt.scatter(i0.summed_gpu_tensor_util, i0.rate) #, 'summed_gpu_dram_util'

In [None]:
#plt.plot(i0.index, i0.summed_gpu_tensor_util)
#plt.plot(i0.index, i0.rate)

In [None]:
# Net inference rate against the summed GPU DRAM utilisation.
plt.scatter(i0.summed_gpu_dram_util, i0.rate)

In [None]:
# 2D histogram of the net rate (24 regular bins from 0 to 120000) against the
# server count (integer axis from 1 to 12).
# `hist` is already imported in the notebook's first cell; the re-import is redundant but harmless.
import hist
h = hist.Hist(hist.axis.Regular(24, 0, 120000, name="rate"),
              hist.axis.Integer(1, 12, name="instances"),
              hist.storage.Double()
             )
h.fill(rate=i0.rate, instances=i0.num_instances)

In [None]:
# Bin contents of the 2D histogram filled in the previous cell.
h.values()

In [None]:
# Request-count and duration column names of the unified dataframe, kept for reference.
# Parenthesised so the continuation lines parse: without the parentheses the first
# line is a complete statement and the indented lines after it raise IndentationError.
('inf_reqs_net', 'inf_req_dur_net',
 'inf_que_dur_net', 'inf_inp_dur_net', 'inf_inf_dur_net',
 'inf_out_dur_net')

In [None]:
# Persist the unified dataframe to a pickle file for later sessions.
import pickle
with open("triton_metrics_test.pickle", "wb") as output_file:
    pickle.dump(i0, output_file)

## A few simple plots

In [None]:
# Overlay the net rate and the summed GPU tensor utilisation against the dataframe index.
plt.plot(i0.index.values, i0.rate.values)
# Rescale the utilisation so that its maximum lines up with the maximum rate.
# NOTE(review): divides by zero if the summed tensor utilisation is 0 everywhere.
scale_value = max(i0.rate.values)/max(i0.summed_gpu_tensor_util)
plt.plot(i0.index.values, i0.summed_gpu_tensor_util.values*scale_value, color="tab:red")

In [None]:
# Plot the net rate versus the number of instances, keeping only rows where
# num_instances is greater than 0 (at least one active instance serving results).
plt.scatter("num_instances", "rate", data=i0[i0.num_instances > 0], color="tab:red")

In [None]:
# Plot the same thing, but specific to the pn_demo model.
# NOTE(review): convert_results_to_df only creates the rate_<model> / num_instances_<model>
# columns when add_model_stats is true, and the call that builds i0 in this notebook
# passes False — confirm i0 actually has these columns when this cell runs.
plt.scatter("num_instances_pn_demo", "rate_pn_demo", data=i0[i0.num_instances > 0], color="tab:blue")

In [None]:
# Rate versus number of instances for the svj_tch_gnn model, on rows with num_instances > 0.
# NOTE(review): needs per-model columns, which convert_results_to_df only adds when
# add_model_stats is true — confirm i0 was built that way.
plt.scatter("num_instances_svj_tch_gnn", "rate_svj_tch_gnn", data=i0[i0.num_instances > 0], color="tab:green")

## Concurrency
How can we measure how many models are active per Triton server? The variable ```num_instances``` is the number of active servers.
The variable ```summed_instances``` is the sum of each model's active ```num_instances```. If the two values are equal, then concurrency is low
(when defined as the number of ML models being run on an individual server). If ```summed_instances >> num_instances```, that indicates that each Triton server tends to actively serve requests from multiple models in a given timespan.

In [None]:
# Only rows with at least one active server take part in either ratio.
active = i0[i0.num_instances > 0]

# Concurrency question: if models tend to gravitate to their own instances,
# summed_instances ~ num_instances. If concurrency is as high as possible,
# summed_instances ~ avg_num_models * num_instances.
ii = active.summed_instances/active.num_instances
print(np.mean(ii), np.max(ii), np.min(ii))
print(np.sqrt(np.var(ii)))

# Consistency check: the summed per-model rate should always add up to the net rate.
kk = active.summed_rate/active.rate
print(np.mean(kk), np.max(kk), np.min(kk))
print(np.sqrt(np.var(kk)))