LLNL · slabasan · Jan 17, 2024 · Jun 20, 2023 · Jun 22, 2023 · Jun 22, 2023
diff --git a/src/docs/sphinx/VariorumAPI.rst b/src/docs/sphinx/VariorumAPI.rst
@@ -72,7 +72,7 @@ with the following keys:
 The "*" here refers to Socket ID. While more than one socket is supported, our
 test systems had only 2 sockets. Note that on the IBM Power9 platform, only the
 first socket (Chip-0) has the PWRSYS sensor, which directly reports total node
-power. Addtionally, both sockets here report CPU, Memory and GPU power.
+power. Additionally, both sockets here report CPU, Memory and GPU power.
 
 On Intel microarchitectures, total node power is not reported by hardware. As a
 result, total node power is estimated by adding CPU and DRAM power on both
@@ -115,6 +115,59 @@ string by reference and includes the following vendor-neutral keys:
 -  control_units (comma-separated string value)
 -  control_range (comma-separated string value)
 
+Obtaining Node Utilization
+==========================
+
+The API to obtain node utilization has the following format. It takes a string
+(``char**``) by reference as input, and populates this string with a JSON object
+with total CPU, system CPU, user CPU, total memory, and GPU (when available)
+utilizations. It reports the utilization of each available GPU. GPU utilization
+is obtained using the ``int variorum_get_gpu_utilization_json(char
+**get_gpu_util_obj_str)`` function. The total memory utilization is computed
+using ``/proc/meminfo``, and CPU utilization is computed using ``/proc/stat``.
+
+The ``variorum_get_node_utilization_json(char **get_util_obj_str)`` function
+returns a string type nested JSON object. An example is provided below:
+
+.. code::
+
+   {
+       "hostname": {
+           "CPU": {
+               "total_util%": (Real),
+               "user_util%": (Real),
+               "system_util%": (Real),
+           },
+           "memory_util%": (Real),
+           "timestamp": (Integer),
+           "GPU": {
+               "Socket_*": {
+                   "GPUn*#_util%": (Integer)
+               }
+           }
+       }
+   }
+
+The ``*`` here refers to socket ID, and the ``#`` refers to GPU ID.
+
+The ``variorum_get_gpu_utilization_json(char **get_gpu_util_obj_str)`` function
+returns a string type nested JSON object. An example is provided below:
+
+.. code::
+
+   {
+       "hostname": {
+           "timestamp": (Integer),
+           "GPU": {
+               "Socket_*": {
+                   "GPUn*#_util%": (Integer)
+               }
+           }
+       }
+   }
+
+The ``*`` here refers to socket ID, and the ``#`` refers to GPU ID.
+
 ***************************
  Best Effort Power Capping
 ***************************

diff --git a/src/docs/sphinx/api/json_support_functions.rst b/src/docs/sphinx/api/json_support_functions.rst
@@ -19,3 +19,7 @@ Defined in ``variorum/variorum.h``.
 .. doxygenfunction:: variorum_get_thermals_json
 
 .. doxygenfunction:: variorum_get_node_frequency_json
+
+.. doxygenfunction:: variorum_get_node_utilization_json
+
+.. doxygenfunction:: variorum_get_gpu_utilization_json 
diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt
@@ -16,10 +16,12 @@ set(BASIC_EXAMPLES
     variorum-cap-socket-power-limit-example
     variorum-disable-turbo-example
     variorum-enable-turbo-example
+    variorum-get-gpu-utilization-json
     variorum-get-node-frequency-json-example
     variorum-get-node-power-domain-info-json-example
     variorum-get-node-power-json-example
     variorum-get-node-thermal-json-example
+    variorum-get-node-utilization-json
     variorum-get-topology-info-example
     variorum-integration-using-json-example
     variorum-monitoring-to-file-example

diff --git a/src/examples/variorum-get-gpu-utilization-json.c b/src/examples/variorum-get-gpu-utilization-json.c
@@ -0,0 +1,88 @@
+// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other
+// Variorum Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <variorum.h>
+#include <variorum_topology.h>
+
+#ifdef SECOND_RUN
+/* Synthetic arithmetic workload: starting from `input`, add i * value to the
+ * running value for i = 0 .. 99999 and return the final value. */
+static inline double do_work(int input)
+{
+    double acc = (double)input;
+    int step = 0;
+
+    while (step < 100000)
+    {
+        acc = acc + step * acc;
+        step++;
+    }
+
+    return acc;
+}
+#endif
+
+/*
+ * Example: print the GPU utilization JSON reported by Variorum.
+ *
+ * Options:
+ *   -h  print usage and exit with 0
+ *   -v  print the Variorum version and exit with 0
+ *
+ * When built with SECOND_RUN defined, some CPU-side arithmetic is performed
+ * and the utilization is queried and printed a second time.
+ *
+ * Returns the status of the last query (0 on success); returns/exits with -1
+ * on an unknown option or a failed query.
+ */
+int main(int argc, char **argv)
+{
+    int ret;
+    char *s = NULL;
+#ifdef SECOND_RUN
+    int i;
+    int size = 1E4;
+    volatile double x = 0.0;
+#endif
+
+    const char *usage = "Usage: %s [-h] [-v]\n";
+    int opt;
+    while ((opt = getopt(argc, argv, "hv")) != -1)
+    {
+        switch (opt)
+        {
+            case 'h':
+                printf(usage, argv[0]);
+                return 0;
+            case 'v':
+                printf("%s\n", variorum_get_current_version());
+                return 0;
+            default:
+                fprintf(stderr, usage, argv[0]);
+                return -1;
+        }
+    }
+    ret = variorum_get_gpu_utilization_json(&s);
+    if (ret != 0)
+    {
+        printf("First run: JSON get GPU utilization failed!\n");
+        free(s);
+        exit(-1);
+    }
+
+    /* Print the entire JSON object */
+    puts(s);
+
+#ifdef SECOND_RUN
+    for (i = 0; i < size; i++)
+    {
+        x += do_work(i);
+    }
+    printf("Final result: %f\n", x);
+
+    /* The string from the first query is owned by this program (it is
+     * released with free() below). Release it before the pointer is handed
+     * to the API again, otherwise the first string is leaked. */
+    free(s);
+    s = NULL;
+
+    ret = variorum_get_gpu_utilization_json(&s);
+    if (ret != 0)
+    {
+        printf("Second run: JSON get GPU utilization failed!\n");
+        free(s);
+        exit(-1);
+    }
+
+    /* Print the entire JSON object */
+    puts(s);
+#endif
+
+    /* Deallocate the string */
+    free(s);
+
+    return ret;
+}
diff --git a/src/examples/variorum-get-node-utilization-json.c b/src/examples/variorum-get-node-utilization-json.c
@@ -0,0 +1,88 @@
+// Copyright 2019-2023 Lawrence Livermore National Security, LLC and other
+// Variorum Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <variorum.h>
+#include <variorum_topology.h>
+
+#ifdef SECOND_RUN
+/* Synthetic arithmetic workload: starting from `input`, add i * value to the
+ * running value for i = 0 .. 99999 and return the final value. */
+static inline double do_work(int input)
+{
+    double acc = (double)input;
+    int step = 0;
+
+    while (step < 100000)
+    {
+        acc = acc + step * acc;
+        step++;
+    }
+
+    return acc;
+}
+#endif
+
+/*
+ * Example: print the node utilization JSON reported by Variorum.
+ *
+ * Options:
+ *   -h  print usage and exit with 0
+ *   -v  print the Variorum version and exit with 0
+ *
+ * When built with SECOND_RUN defined, some CPU-side arithmetic is performed
+ * and the utilization is queried and printed a second time.
+ *
+ * Returns the status of the last query (0 on success); returns/exits with -1
+ * on an unknown option or a failed query.
+ */
+int main(int argc, char **argv)
+{
+    int ret;
+    char *s = NULL;
+#ifdef SECOND_RUN
+    int i;
+    int size = 1E4;
+    volatile double x = 0.0;
+#endif
+
+    const char *usage = "Usage: %s [-h] [-v]\n";
+    int opt;
+    while ((opt = getopt(argc, argv, "hv")) != -1)
+    {
+        switch (opt)
+        {
+            case 'h':
+                printf(usage, argv[0]);
+                return 0;
+            case 'v':
+                printf("%s\n", variorum_get_current_version());
+                return 0;
+            default:
+                fprintf(stderr, usage, argv[0]);
+                return -1;
+        }
+    }
+    ret = variorum_get_node_utilization_json(&s);
+    if (ret != 0)
+    {
+        printf("First run: JSON get node utilization failed!\n");
+        free(s);
+        exit(-1);
+    }
+
+    /* Print the entire JSON object */
+    puts(s);
+
+#ifdef SECOND_RUN
+    for (i = 0; i < size; i++)
+    {
+        x += do_work(i);
+    }
+    printf("Final result: %f\n", x);
+
+    /* The string from the first query is owned by this program (it is
+     * released with free() below). Release it before the pointer is handed
+     * to the API again, otherwise the first string is leaked. */
+    free(s);
+    s = NULL;
+
+    ret = variorum_get_node_utilization_json(&s);
+    if (ret != 0)
+    {
+        printf("Second run: JSON get node utilization failed!\n");
+        free(s);
+        exit(-1);
+    }
+
+    /* Print the entire JSON object */
+    puts(s);
+#endif
+
+    /* Deallocate the string */
+    free(s);
+
+    return ret;
+}
diff --git a/src/variorum/AMD_GPU/amd_gpu_power_features.c b/src/variorum/AMD_GPU/amd_gpu_power_features.c
@@ -772,6 +772,106 @@ void get_gpu_utilization_data(int chipid, int total_sockets, int verbose,
     }
 }
 
+/*
+ * Add the utilization of every GPU assigned to socket `chipid` to a JSON
+ * object.
+ *
+ * Layout written into get_gpu_util_obj (intermediate objects and the
+ * timestamp are created only if they do not exist yet, so calls for several
+ * sockets accumulate into the same tree):
+ *
+ *   "<hostname>" -> "timestamp"                  (microseconds, integer)
+ *   "<hostname>" -> "GPU" -> "Socket_<chipid>"
+ *                         -> "GPU<device>_util%" (busy percentage, integer)
+ *
+ * chipid            Socket index; selects RSMI devices
+ *                   [chipid * gpus_per_socket, (chipid + 1) * gpus_per_socket).
+ * total_sockets     Number of sockets the RSMI devices are divided across.
+ * get_gpu_util_obj  JSON object that receives the data.
+ *
+ * RSMI failures are reported through variorum_error_handler(); the function
+ * itself returns nothing.
+ */
+void get_gpu_utilization_data_json(int chipid, int total_sockets,
+                                   json_t *get_gpu_util_obj)
+{
+    rsmi_status_t ret;
+    uint32_t num_devices = 0; /* stays 0 (empty loop) if the query fails */
+    int gpus_per_socket;
+    char socket_id[12];
+    char hostname[1024] = {0};
+    char device_id[12];
+    struct timeval tv;
+    uint64_t ts;
+
+    /* Leave the last byte untouched so the name is always NUL-terminated,
+     * even if gethostname() truncates it. */
+    gethostname(hostname, sizeof(hostname) - 1);
+    gettimeofday(&tv, NULL);
+    ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec;
+
+    json_t *get_host_util_obj = json_object_get(get_gpu_util_obj, hostname);
+    if (get_host_util_obj == NULL)
+    {
+        get_host_util_obj = json_object();
+        json_object_set_new(get_gpu_util_obj, hostname, get_host_util_obj);
+    }
+
+    /* Keep an existing timestamp so all sockets share the first one. */
+    json_t *timestamp_obj = json_object_get(get_host_util_obj, "timestamp");
+    if (timestamp_obj == NULL)
+    {
+        json_object_set_new(get_host_util_obj, "timestamp", json_integer(ts));
+    }
+
+    json_t *gpu_obj = json_object_get(get_host_util_obj, "GPU");
+    if (gpu_obj == NULL)
+    {
+        gpu_obj = json_object();
+        json_object_set_new(get_host_util_obj, "GPU", gpu_obj);
+    }
+    snprintf(socket_id, sizeof(socket_id), "Socket_%d", chipid);
+
+    json_t *socket_obj = json_object_get(gpu_obj, socket_id);
+    if (socket_obj == NULL)
+    {
+        socket_obj = json_object();
+        json_object_set_new(gpu_obj, socket_id, socket_obj);
+    }
+
+    ret = rsmi_init(0);
+    if (ret != RSMI_STATUS_SUCCESS)
+    {
+        variorum_error_handler("Could not initialize RSMI",
+                               VARIORUM_ERROR_PLATFORM_ENV,
+                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
+                               __LINE__);
+    }
+
+    ret = rsmi_num_monitor_devices(&num_devices);
+    if (ret != RSMI_STATUS_SUCCESS)
+    {
+        variorum_error_handler("Could not get number of GPU devices",
+                               VARIORUM_ERROR_PLATFORM_ENV,
+                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
+                               __LINE__);
+    }
+
+    gpus_per_socket = num_devices / total_sockets;
+
+    int i;
+    for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++)
+    {
+        uint32_t utilpercent = 0; // Percentage of time the GPU was busy
+
+        /* Query the device; without this call the value would always be 0
+         * and the status check below would test a stale result. */
+        ret = rsmi_dev_busy_percent_get(i, &utilpercent);
+        if (ret != RSMI_STATUS_SUCCESS)
+        {
+            variorum_error_handler("RSMI API was not successful",
+                                   VARIORUM_ERROR_PLATFORM_ENV,
+                                   getenv("HOSTNAME"), __FILE__, __FUNCTION__,
+                                   __LINE__);
+        }
+        /* Key each entry by its device index so GPUs on the same socket do
+         * not overwrite one another. */
+        snprintf(device_id, sizeof(device_id), "GPU%d_util%%", i);
+        json_object_set_new(socket_obj, device_id, json_integer(utilpercent));
+    }
+
+    ret = rsmi_shut_down();
+    if (ret != RSMI_STATUS_SUCCESS)
+    {
+        variorum_error_handler("Could not shutdown RSMI",
+                               VARIORUM_ERROR_PLATFORM_ENV,
+                               getenv("HOSTNAME"), __FILE__, __FUNCTION__,
+                               __LINE__);
+    }
+}
+
 void cap_each_gpu_power_limit(int chipid, int total_sockets,
                               unsigned int powerlimit)
 {
@@ -815,8 +915,8 @@ void cap_each_gpu_power_limit(int chipid, int total_sockets,
 
     gettimeofday(&now, NULL);
 
-    for (int i = chipid * gpus_per_socket;
-         i < (chipid + 1) * gpus_per_socket; i++)
+    int i;
+    for (i = chipid * gpus_per_socket; i < (chipid + 1) * gpus_per_socket; i++)
     {
         ret = rsmi_dev_power_cap_set(i, 0, powerlimit_uwatts);
         if (ret != RSMI_STATUS_SUCCESS)

diff --git a/src/variorum/AMD_GPU/amd_gpu_power_features.h b/src/variorum/AMD_GPU/amd_gpu_power_features.h
@@ -65,4 +65,10 @@ void get_clocks_json(
     json_t *output
 );
 
+/// @brief Record GPU utilization entries for one socket in a JSON object.
+///
+/// Entries are stored under "<hostname>" -> "GPU" -> "Socket_<chipid>"; a
+/// "timestamp" entry is added under "<hostname>" if none exists yet.
+///
+/// @param [in] chipid Socket index whose GPUs are reported.
+/// @param [in] total_sockets Number of sockets the GPUs are divided across.
+/// @param [out] get_gpu_util_obj JSON object that receives the data.
+void get_gpu_utilization_data_json(
+    int chipid,
+    int total_sockets,
+    json_t *get_gpu_util_obj
+);
+
 #endif