Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU and Node Utilization JSON API (CPU, GPU, Memory) #431

Merged
merged 49 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
a3142a9
initial mem util API
Jun 20, 2023
72538f0
moved cpu util API
Jun 22, 2023
8ef64f8
deleted unused variables
Jun 22, 2023
e39e32b
cpu util by /proc/stat, mem util by /proc/meminfo
Jul 2, 2023
5509a57
user, sys, and total CPU util
Jul 10, 2023
2981a17
fixed compile check errors
Jul 10, 2023
2fde5a5
fixed build check errors
Jul 10, 2023
740e32f
astyle format
Jul 10, 2023
f971c26
added iowait to idle
Jul 20, 2023
a21e4d2
gpu utilizations
Jul 22, 2023
5863eb1
indent json object
Jul 23, 2023
1b7766d
return -1 when arch not supported
Jul 26, 2023
65e1224
fixed typo
Jul 31, 2023
17743ee
nested json object
Aug 3, 2023
92f59e8
formatted
Aug 3, 2023
70c9443
gpu utilization keys with socket id and device id
Aug 8, 2023
da8c7c4
fixed typos
Aug 8, 2023
b96e218
added example
Oct 3, 2023
c6c0b68
edited docs
Oct 3, 2023
618a488
Merge branch 'dev' into add-util-API
altahat2003 Oct 10, 2023
dc90911
resolving conflicts
Oct 10, 2023
cf56fce
fixed astyle errors
Oct 10, 2023
78e3c7c
made gpu_utilization general
Oct 13, 2023
69195b6
fixed typo
Oct 13, 2023
b703618
fixed AMD gpu_util errors
Oct 13, 2023
e3cc09e
removed unused variable
Oct 13, 2023
2f56f68
fixed typos
Oct 13, 2023
7468865
check for errors
Oct 14, 2023
0ab29ca
formatted
Oct 14, 2023
30b0f3b
formatted
Oct 14, 2023
b9ba01d
added gpu_util docs and example
Oct 26, 2023
18d037b
formatted
Oct 26, 2023
17b76fd
formatted
Oct 26, 2023
1ddf673
changed gpu util json key
Oct 31, 2023
f01a7fc
formatted
Oct 31, 2023
111c5e5
Merge branch 'dev' into add-util-API
slabasan Jan 7, 2024
9333003
Merge branch 'dev' into add-util-API
slabasan Jan 15, 2024
05f7084
changes
slabasan Jan 15, 2024
e5d3977
astyle format
slabasan Jan 15, 2024
57a51e5
Merge remote-tracking branch 'origin/dev' into mohammadaltahat-add-ut…
slabasan Jan 17, 2024
c9d0998
fix warnings
slabasan Jan 17, 2024
c47b910
formatting
slabasan Jan 17, 2024
83de18c
Merge remote-tracking branch 'origin/dev' into mohammadaltahat-add-ut…
slabasan Jan 17, 2024
d56a88b
fix docs
slabasan Jan 17, 2024
d68aa28
cleanup
slabasan Jan 17, 2024
bd182f1
format docs
slabasan Jan 17, 2024
f6a7722
formatting
slabasan Jan 17, 2024
b36098e
Merge remote-tracking branch 'origin/dev' into mohammadaltahat-add-ut…
slabasan Jan 17, 2024
41e2cb6
astyle
slabasan Jan 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/variorum/IBM/Power9.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <config_architecture.h>
#include <Power9.h>
Expand Down Expand Up @@ -440,6 +441,42 @@ int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str)
return 0;
}

/// @brief Build a JSON string with utilization data on IBM Power9.
///
/// The resulting object has three keys: "host" (from gethostname()),
/// "timestamp" (microseconds, from gettimeofday()), and "memory util"
/// (ru_maxrss as reported by getrusage(RUSAGE_SELF)).
///
/// @param [out] get_power_obj_str Receives the serialized JSON string produced
/// by json_dumps(). The caller owns the string and must free() it. It is only
/// written on the success path.
///
/// @return 0 on success; -1 if getrusage() fails or the JSON string could not
/// be produced.
int ibm_cpu_p9_get_node_util_json(char **get_power_obj_str)
{
    // Read the environment variable; assigning the literal name would make
    // atoi() always yield 0 and the log line below unreachable.
    char *val = getenv("VARIORUM_LOG");
    if (val != NULL && atoi(val) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    char hostname[1024];
    struct timeval tv;
    struct rusage rusge;
    uint64_t ts;
    long mem;
    json_t *get_util_obj = NULL;

    // Query resource usage before allocating the JSON object so that the
    // failure path has nothing to release.
    if (getrusage(RUSAGE_SELF, &rusge) < 0)
    {
        printf("Failed to get utilizations\n");
        return -1;
    }

    // NOTE(review): ru_maxrss is the peak resident set size of the calling
    // process, not a node-wide memory figure -- confirm this is the intended
    // metric for "memory util".
    mem = rusge.ru_maxrss;

    gethostname(hostname, sizeof(hostname));
    // gethostname() does not guarantee NUL-termination when it truncates.
    hostname[sizeof(hostname) - 1] = '\0';

    gettimeofday(&tv, NULL);
    ts = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec;

    get_util_obj = json_object();
    json_object_set_new(get_util_obj, "host", json_string(hostname));
    json_object_set_new(get_util_obj, "timestamp", json_integer(ts));
    json_object_set_new(get_util_obj, "memory util", json_integer(mem));

    *get_power_obj_str = json_dumps(get_util_obj, 0);
    json_decref(get_util_obj);

    // json_dumps() returns NULL on failure (including a NULL object); report
    // that as an error so callers do not have to check for NULL strings.
    if (*get_power_obj_str == NULL)
    {
        return -1;
    }
    return 0;
}

int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str)
{
char *val = ("VARIORUM_LOG");
Expand Down
2 changes: 2 additions & 0 deletions src/variorum/IBM/Power9.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ int ibm_cpu_p9_monitoring(FILE *output);

int ibm_cpu_p9_get_node_power_json(char **get_power_obj_str);

int ibm_cpu_p9_get_node_util_json(char **get_util_obj_str);

int ibm_cpu_p9_get_node_power_domain_info_json(char **get_domain_obj_str);

#endif
1 change: 1 addition & 0 deletions src/variorum/IBM/config_ibm.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ int set_ibm_func_ptrs(int idx)
g_platform[idx].variorum_cap_gpu_power_ratio = ibm_cpu_p9_cap_gpu_power_ratio;
g_platform[idx].variorum_monitoring = ibm_cpu_p9_monitoring;
g_platform[idx].variorum_get_node_power_json = ibm_cpu_p9_get_node_power_json;
g_platform[idx].variorum_get_node_util_json = ibm_cpu_p9_get_node_util_json;
g_platform[idx].variorum_get_node_power_domain_info_json =
ibm_cpu_p9_get_node_power_domain_info_json;
}
Expand Down
1 change: 0 additions & 1 deletion src/variorum/IBM/ibm_power_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ struct occ_sensor_counter
uint8_t pad[5];
} __attribute__((__packed__));


void print_power_sensors(int chipid,
int long_ver,
FILE *output,
Expand Down
3 changes: 1 addition & 2 deletions src/variorum/config_architecture.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ int variorum_enter(const char *filename, const char *func_name, int line_num)
printf("Number of registered platforms: %d\n", P_NUM_PLATFORMS);
}


variorum_init_func_ptrs();

//Triggers initialization on first call. Errors assert.
Expand Down Expand Up @@ -200,7 +199,6 @@ int variorum_detect_arch(void)
return 0;
}


void variorum_get_topology(unsigned *nsockets, unsigned *ncores,
unsigned *nthreads, int idx)
{
Expand Down Expand Up @@ -358,6 +356,7 @@ void variorum_init_func_ptrs()
g_platform[i].variorum_print_gpu_utilization = NULL;
g_platform[i].variorum_monitoring = NULL;
g_platform[i].variorum_get_node_power_json = NULL;
g_platform[i].variorum_get_node_util_json = NULL;
g_platform[i].variorum_get_node_power_domain_info_json = NULL;
g_platform[i].variorum_print_energy = NULL;
}
Expand Down
5 changes: 5 additions & 0 deletions src/variorum/config_architecture.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,11 @@ struct platform
/// @return Error code.
int (*variorum_get_node_power_json)(char **get_power_obj_str);

/// @brief Function pointer to get JSON object for node utilization data.
///
/// @return Error code.
int (*variorum_get_node_util_json)(char **get_power_obj_str);

/// @brief Function pointer to get JSON object for power domain information.
///
/// @return Error code.
Expand Down
55 changes: 54 additions & 1 deletion src/variorum/variorum.c
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,6 @@ int variorum_disable_turbo(void)
return err;
}


// The variorum_get_node_power_json is a node-level API, and cannot be implemented
// at a per-component (eg CPU, GPU) level. This can only be captured by what we
// define as the 'primary' platform, e.g. IBM Power9 CPU or Intel and AMD CPUs,
Expand Down Expand Up @@ -1011,6 +1010,60 @@ int variorum_get_node_power_json(char **get_power_obj_str)
return err;
}

// The variorum_get_node_util_json is dispatched through the 'primary' (CPU)
// platform only: the first of Intel, IBM, AMD, or ARM that was enabled at
// build time.
//
// On success, *get_util_obj_str is set by the platform implementation.
// Returns 0 on success and -1 on any failure (initialization error, feature
// not implemented for the primary platform, platform call failure, or exit
// error).
int variorum_get_node_util_json(char **get_util_obj_str)
{
    int err = 0;
    int i;

    err = variorum_enter(__FILE__, __FUNCTION__, __LINE__);
    if (err)
    {
        return -1;
    }

    // Obtain the index corresponding to the primary platform. The order of
    // the checks defines the priority when several CPU back ends are enabled.
#if defined(VARIORUM_WITH_INTEL_CPU)
    i = P_INTEL_CPU_IDX;
#elif defined(VARIORUM_WITH_IBM_CPU)
    i = P_IBM_CPU_IDX;
#elif defined(VARIORUM_WITH_AMD_CPU)
    i = P_AMD_CPU_IDX;
#elif defined(VARIORUM_WITH_ARM_CPU)
    i = P_ARM_CPU_IDX;
#else
    // No primary CPU platform was compiled in; use an out-of-range index so
    // the check below reports the feature as unsupported instead of reading
    // past the end of g_platform.
    i = P_NUM_PLATFORMS;
#endif

    if (i >= P_NUM_PLATFORMS ||
        g_platform[i].variorum_get_node_util_json == NULL)
    {
        variorum_error_handler("Feature not yet implemented or is not supported",
                               VARIORUM_ERROR_FEATURE_NOT_IMPLEMENTED,
                               getenv("HOSTNAME"), __FILE__,
                               __FUNCTION__, __LINE__);
        // For the JSON functions, we return a -1 here, so users don't need
        // to explicitly check for NULL strings.
        return -1;
    }

    err = g_platform[i].variorum_get_node_util_json(get_util_obj_str);
    if (err)
    {
        return -1;
    }

    err = variorum_exit(__FILE__, __FUNCTION__, __LINE__);
    if (err)
    {
        return -1;
    }
    return err;
}
// The variorum_get_node_power_domain_info_json is a node-level API, and cannot
// be implemented at a per-component (eg CPU, GPU) level. This can only be available
// on what we define as the 'primary' platform, e.g. IBM Power9 CPU or Intel and AMD CPUs,
Expand Down
9 changes: 9 additions & 0 deletions src/variorum/variorum.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,15 @@ int variorum_disable_turbo(void);
/// check for NULL strings.
int variorum_get_node_power_json(char **get_power_obj_str);

/// @brief Populate a string in JSON format with total node utilization.
///
/// @supparch
/// - IBM
///
/// @param [out] get_util_obj_str String (passed by reference) that contains
/// node-level utilization information.
///
/// @return 0 if successful, otherwise -1.
int variorum_get_node_util_json(char **get_util_obj_str);

/// @brief Populate a string in JSON format with measurable and controllable
/// power domains, along with the ranges.
///
Expand Down