From 22b4331bc79b52deefbe2c80296e8f10214ad637 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 27 Jun 2024 21:49:58 +0000 Subject: [PATCH 1/3] Add Intel GPU Energy APIs --- src/variorum/Intel_GPU/GPU.c | 40 +++++ src/variorum/Intel_GPU/GPU.h | 11 ++ src/variorum/Intel_GPU/config_intel_gpu.c | 2 + .../Intel_GPU/intel_gpu_power_features.c | 97 ++++++++++++ .../Intel_GPU/intel_gpu_power_features.h | 13 ++ src/variorum/variorum.c | 147 ++++-------------- 6 files changed, 197 insertions(+), 113 deletions(-) diff --git a/src/variorum/Intel_GPU/GPU.c b/src/variorum/Intel_GPU/GPU.c index 5e5604f97..a371152e9 100644 --- a/src/variorum/Intel_GPU/GPU.c +++ b/src/variorum/Intel_GPU/GPU.c @@ -99,3 +99,43 @@ int intel_gpu_get_power_limit(int long_ver) } return 0; } + +int intel_gpu_get_energy(int long_ver) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + unsigned iter = 0; + unsigned nsockets = 0; +#ifdef VARIORUM_WITH_INTEL_GPU + variorum_get_topology(&nsockets, NULL, NULL, P_INTEL_GPU_IDX); +#endif + for (iter = 0; iter < nsockets; iter++) + { + get_energy_data(iter, long_ver, stdout); + } + return 0; +} + +int intel_gpu_get_energy_json(json_t *get_energy_obj) +{ + char *val = getenv("VARIORUM_LOG"); + if (val != NULL && atoi(val) == 1) + { + printf("Running %s\n", __FUNCTION__); + } + + unsigned iter = 0; + unsigned nsockets; + variorum_get_topology(&nsockets, NULL, NULL, P_INTEL_GPU_IDX); + + for (iter = 0; iter < nsockets; iter++) + { + get_energy_json(iter, get_energy_obj); + } + + return 0; +} diff --git a/src/variorum/Intel_GPU/GPU.h b/src/variorum/Intel_GPU/GPU.h index b36559d21..57fc01a8e 100644 --- a/src/variorum/Intel_GPU/GPU.h +++ b/src/variorum/Intel_GPU/GPU.h @@ -6,6 +6,8 @@ #ifndef INTEL_GPU_H_INCLUDE #define INTEL_GPU_H_INCLUDE +#include <jansson.h> + extern int intel_gpu_get_power( int long_ver ); @@ -26,4 +28,13 @@ extern int intel_gpu_get_power_limit( int long_ver ); + +extern int 
intel_gpu_get_energy( + int long_ver +); + +extern int intel_gpu_get_energy_json( + json_t *get_energy_obj_str +); + #endif diff --git a/src/variorum/Intel_GPU/config_intel_gpu.c b/src/variorum/Intel_GPU/config_intel_gpu.c index fb1ae94c5..09c54d816 100644 --- a/src/variorum/Intel_GPU/config_intel_gpu.c +++ b/src/variorum/Intel_GPU/config_intel_gpu.c @@ -29,6 +29,8 @@ int set_intel_gpu_func_ptrs(int idx) g_platform[idx].variorum_cap_each_gpu_power_limit = intel_gpu_cap_each_gpu_power_limit; g_platform[idx].variorum_print_power_limit = intel_gpu_get_power_limit; + g_platform[idx].variorum_print_energy = intel_gpu_get_energy; + g_platform[idx].variorum_get_energy_json = intel_gpu_get_energy_json; } else { diff --git a/src/variorum/Intel_GPU/intel_gpu_power_features.c b/src/variorum/Intel_GPU/intel_gpu_power_features.c index bdbcedd65..36b20ac43 100644 --- a/src/variorum/Intel_GPU/intel_gpu_power_features.c +++ b/src/variorum/Intel_GPU/intel_gpu_power_features.c @@ -288,3 +288,100 @@ void get_power_limit_data(int chipid, int verbose, FILE *output) cflush(); #endif } + +void get_energy_data(int chipid, int verbose, FILE *output) +{ + uint64_t energy_uj; + double value = 0.0; + int d; + static int init_output = 0; + + //Iterate over all GPU device handles for this socket and print energy + for (d = chipid * (int)m_gpus_per_socket; + d < (chipid + 1) * (int)m_gpus_per_socket; ++d) + { + int pi = 0; // only report the global power domain + apmidg_readenergy(d, pi, &energy_uj, NULL); + value = (double)energy_uj * 1.e-6; + + if (verbose) + { + fprintf(output, "%s: %s, %s: %d, %s: %d, %s: %lf J\n", + "_INTEL_GPU_ENERGY_USAGE Host", m_hostname, + "Socket", chipid, + "DeviceID", d, "Energy", value); + } + else + { + if (!init_output) + { +#ifdef LIBJUSTIFY_FOUND + cfprintf(output, "%s %s %s %s %s\n", + "_INTEL_GPU_ENERGY_USAGE", "Host", + "Socket", "DeviceID", "Energy"); +#else + fprintf(output, "%s %s %s %s %s\n", + "_INTEL_GPU_ENERGY_USAGE", "Host", + "Socket", "DeviceID", 
"Energy"); +#endif + init_output = 1; + } +#ifdef LIBJUSTIFY_FOUND + cfprintf(output, "%s %s %d %d %lf\n", + "_INTEL_GPU_ENERGY_USAGE", m_hostname, chipid, d, value); +#else + fprintf(output, "%s %s %d %d %lf\n", + "_INTEL_GPU_ENERGY_USAGE", m_hostname, chipid, d, value); + +#endif + } + } +} + +void get_energy_json(int chipid, json_t *get_energy_obj) +{ + uint64_t energy_uj; + double value = 0.0; + double total_energy_gpu = 0.0; + int d; + static size_t devIDlen = 24; // Long enough to avoid format truncation. + char devID[devIDlen]; + char socket_id[12]; + snprintf(socket_id, 12, "socket_%d", chipid); + + json_object_set_new(get_energy_obj, "num_gpus_per_socket", + json_integer(m_gpus_per_socket)); + + //try to find socket object in node object, set new object if not found + json_t *socket_obj = json_object_get(get_energy_obj, socket_id); + if (socket_obj == NULL) + { + socket_obj = json_object(); + json_object_set_new(get_energy_obj, socket_id, socket_obj); + } + + //create new json object for GPU + json_t *gpu_obj = json_object(); + json_object_set_new(socket_obj, "energy_gpu_joules", gpu_obj); + + for (d = chipid * (int)m_gpus_per_socket; + d < (chipid + 1) * (int)m_gpus_per_socket; ++d) + { + int pi = 0; // only report the global power domain + apmidg_readenergy(d, pi, &energy_uj, NULL); + value = (double)energy_uj * 1.e-6; + snprintf(devID, devIDlen, "GPU_%d", d); + json_object_set_new(gpu_obj, devID, json_real(value)); + total_energy_gpu += value; + } + + // If we have an existing CPU object with power_node_watts, update its value. 
+ if (json_object_get(get_energy_obj, "energy_node_joules") != NULL) + { + double energy_node; + energy_node = json_real_value(json_object_get(get_energy_obj, + "energy_node_joules")); + json_object_set(get_energy_obj, "energy_node_joules", + json_real(energy_node + total_energy_gpu)); + } +} diff --git a/src/variorum/Intel_GPU/intel_gpu_power_features.h b/src/variorum/Intel_GPU/intel_gpu_power_features.h index 5f354c046..0725351c6 100644 --- a/src/variorum/Intel_GPU/intel_gpu_power_features.h +++ b/src/variorum/Intel_GPU/intel_gpu_power_features.h @@ -9,6 +9,8 @@ #include #include +#include + #include void initAPMIDG( @@ -48,4 +50,15 @@ void get_power_limit_data( FILE *output ); +void get_energy_data( + int chipid, + int verbose, + FILE *output +); + +void get_energy_json( + int chipid, + json_t *output +); + #endif diff --git a/src/variorum/variorum.c b/src/variorum/variorum.c index 92edea46a..81e13d4af 100644 --- a/src/variorum/variorum.c +++ b/src/variorum/variorum.c @@ -1573,52 +1573,26 @@ int variorum_print_energy(void) { int err = 0; int i; - int has_cpu = 0; - int has_gpu = 0; err = variorum_enter(__FILE__, __FUNCTION__, __LINE__); if (err) { return -1; } - // If we have a GPU-only build, we should exit with a helpful message. - // If we have a CPU-only or CPU+GPU multi-platform build, we should print - // the node-level energy. 
- // First check if we have a CPU platform, then check for a GPU platform - -#if defined(VARIORUM_WITH_INTEL_CPU) || defined(VARIORUM_WITH_AMD_CPU) || defined(VARIORUM_WITH_IBM_CPU) - has_cpu = 1; -#endif -#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU) - has_gpu = 1; -#endif - - // CPU-only or multi-platform build - if ((has_cpu && has_gpu) || (has_cpu)) + for (i = 0; i < P_NUM_PLATFORMS; i++) { - for (i = 0; i < P_NUM_PLATFORMS; i++) + if (g_platform[i].variorum_print_energy == NULL) { - if (g_platform[i].variorum_print_energy == NULL) - { - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return 0; - } - err = g_platform[i].variorum_print_energy(0); - if (err) - { - return -1; - } + variorum_error_handler("Feature not yet implemented or is not supported", + VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, + __FUNCTION__, __LINE__); + return 0; + } + err = g_platform[i].variorum_print_energy(0); + if (err) + { + return -1; } - } - else - { - // We have a GPU-only build, currently doesn't support get_energy - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return 0; } err = variorum_exit(__FILE__, __FUNCTION__, __LINE__); @@ -1633,52 +1607,26 @@ int variorum_print_verbose_energy(void) { int err = 0; int i; - int has_cpu = 0; - int has_gpu = 0; err = variorum_enter(__FILE__, __FUNCTION__, __LINE__); if (err) { return -1; } - // If we have a GPU-only build, we should exit with a helpful message. - // If we have a CPU-only or CPU+GPU multi-platform build, we should print - // the node-level energy. 
- // First check if we have a CPU platform, then check for a GPU platform - -#if defined(VARIORUM_WITH_INTEL_CPU) || defined(VARIORUM_WITH_AMD_CPU) || defined(VARIORUM_WITH_IBM_CPU) - has_cpu = 1; -#endif -#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU) - has_gpu = 1; -#endif - - // CPU-only or multi-platform build - if ((has_cpu && has_gpu) || (has_cpu)) + for (i = 0; i < P_NUM_PLATFORMS; i++) { - for (i = 0; i < P_NUM_PLATFORMS; i++) + if (g_platform[i].variorum_print_energy == NULL) { - if (g_platform[i].variorum_print_energy == NULL) - { - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return 0; - } - err = g_platform[i].variorum_print_energy(1); - if (err) - { - return -1; - } + variorum_error_handler("Feature not yet implemented or is not supported", + VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, + __FUNCTION__, __LINE__); + return 0; + } + err = g_platform[i].variorum_print_energy(1); + if (err) + { + return -1; } - } - else - { - // We have a GPU-only build, currently doesn't support get_energy - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return 0; } err = variorum_exit(__FILE__, __FUNCTION__, __LINE__); if (err) @@ -1692,8 +1640,6 @@ int variorum_get_energy_json(char **get_energy_obj_str) { int err = 0; int i; - int has_cpu = 0; - int has_gpu = 0; char hostname[1024]; uint64_t ts; struct timeval tv; @@ -1713,47 +1659,22 @@ int variorum_get_energy_json(char **get_energy_obj_str) ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; json_object_set_new(node_obj, "timestamp", json_integer(ts)); - // If we have a GPU-only build, we should exit with a helpful message. 
- // If we have a CPU-only or CPU+GPU multi-platform build, we should print - // the node-level energy. - // First check if we have a CPU platform, then check for a GPU platform - -#if defined(VARIORUM_WITH_INTEL_CPU) || defined(VARIORUM_WITH_AMD_CPU) || defined(VARIORUM_WITH_IBM_CPU) - has_cpu = 1; -#endif -#if defined(VARIORUM_WITH_NVIDIA_GPU) || defined(VARIORUM_WITH_AMD_GPU) || defined(VARIORUM_WITH_INTEL_GPU) - has_gpu = 1; -#endif - - // CPU-only or multi-platform build - if ((has_cpu && has_gpu) || (has_cpu)) + for (i = 0; i < P_NUM_PLATFORMS; i++) { - for (i = 0; i < P_NUM_PLATFORMS; i++) + if (g_platform[i].variorum_get_energy_json == NULL) { - if (g_platform[i].variorum_get_energy_json == NULL) - { - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, - getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); - return 0; - } - err = g_platform[i].variorum_get_energy_json(node_obj); - if (err) - { - printf("Error with variorum get frequency json platform %d\n", i); - } - *get_energy_obj_str = json_dumps(get_energy_obj, JSON_INDENT(4)); + variorum_error_handler("Feature not yet implemented or is not supported", + VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, + getenv("HOSTNAME"), __FILE__, + __FUNCTION__, __LINE__); + return 0; + } + err = g_platform[i].variorum_get_energy_json(node_obj); + if (err) + { + printf("Error with variorum get frequency json platform %d\n", i); } - } - else - { - // We have a GPU-only build, currently doesn't support get_energy - variorum_error_handler("Feature not yet implemented or is not supported", - VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED, getenv("HOSTNAME"), __FILE__, - __FUNCTION__, __LINE__); *get_energy_obj_str = json_dumps(get_energy_obj, JSON_INDENT(4)); - return 0; } json_decref(get_energy_obj); From a29ea1fbe9a116eea1b1b28702128fa99c438606 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Mon, 8 Jul 2024 21:22:47 +0000 Subject: [PATCH 2/3] Only report energy 
consumption since first reading --- .../Intel_GPU/intel_gpu_power_features.c | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/variorum/Intel_GPU/intel_gpu_power_features.c b/src/variorum/Intel_GPU/intel_gpu_power_features.c index 36b20ac43..dd566c516 100644 --- a/src/variorum/Intel_GPU/intel_gpu_power_features.c +++ b/src/variorum/Intel_GPU/intel_gpu_power_features.c @@ -20,6 +20,8 @@ static unsigned m_total_unit_devices; static unsigned m_gpus_per_socket; static char m_hostname[1024]; +static double *m_initial_energy_for_gpu; +static int *m_init_energy; void initAPMIDG(void) { @@ -35,6 +37,16 @@ void initAPMIDG(void) #endif m_gpus_per_socket = m_total_unit_devices / m_num_package; + static int init = 0; + if (!init) + { + m_initial_energy_for_gpu = (double *) malloc(sizeof(double) * + m_total_unit_devices); + m_init_energy = (int *) calloc(m_num_package, sizeof(int)); + + init = 1; + } + /* Save hostname */ gethostname(m_hostname, sizeof(m_hostname)); } @@ -302,7 +314,16 @@ void get_energy_data(int chipid, int verbose, FILE *output) { int pi = 0; // only report the global power domain apmidg_readenergy(d, pi, &energy_uj, NULL); - value = (double)energy_uj * 1.e-6; + if (!m_init_energy[chipid]) + { + m_initial_energy_for_gpu[d] = (double)energy_uj * 1.e-6; + value = 0; + } + else + { + value = (double)energy_uj * 1.e-6; + value -= m_initial_energy_for_gpu[d]; + } if (verbose) { @@ -336,6 +357,7 @@ void get_energy_data(int chipid, int verbose, FILE *output) #endif } } + m_init_energy[chipid] = 1; } void get_energy_json(int chipid, json_t *get_energy_obj) @@ -369,12 +391,23 @@ void get_energy_json(int chipid, json_t *get_energy_obj) { int pi = 0; // only report the global power domain apmidg_readenergy(d, pi, &energy_uj, NULL); - value = (double)energy_uj * 1.e-6; + if (!m_init_energy[chipid]) + { + m_initial_energy_for_gpu[d] = (double)energy_uj * 1.e-6; + value = 0; + } + else + { + value = (double)energy_uj * 1.e-6; + 
value -= m_initial_energy_for_gpu[d]; + } snprintf(devID, devIDlen, "GPU_%d", d); json_object_set_new(gpu_obj, devID, json_real(value)); total_energy_gpu += value; } + m_init_energy[chipid] = 1; + // If we have an existing CPU object with power_node_watts, update its value. if (json_object_get(get_energy_obj, "energy_node_joules") != NULL) { From 47702b44ed59decbe152c942760cb4a42bbb4b3c Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 11 Jul 2024 13:48:19 +0000 Subject: [PATCH 3/3] Use atexit hook to avoid leaking memory --- src/variorum/Intel_GPU/intel_gpu_power_features.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/variorum/Intel_GPU/intel_gpu_power_features.c b/src/variorum/Intel_GPU/intel_gpu_power_features.c index dd566c516..ed365d6f9 100644 --- a/src/variorum/Intel_GPU/intel_gpu_power_features.c +++ b/src/variorum/Intel_GPU/intel_gpu_power_features.c @@ -23,6 +23,12 @@ static char m_hostname[1024]; static double *m_initial_energy_for_gpu; static int *m_init_energy; +void releaseInitialEnergyForGPU() +{ + free(m_initial_energy_for_gpu); + free(m_init_energy); +} + void initAPMIDG(void) { int verbose = 0; @@ -43,7 +49,7 @@ void initAPMIDG(void) m_initial_energy_for_gpu = (double *) malloc(sizeof(double) * m_total_unit_devices); m_init_energy = (int *) calloc(m_num_package, sizeof(int)); - + atexit(releaseInitialEnergyForGPU); init = 1; }