Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPU Energy APIs #559

Draft
wants to merge 5 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/variorum/Nvidia_GPU/Volta.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,43 @@ int volta_get_power_json(json_t *get_power_obj)
return 0;
}

/// @brief Print the energy readings of all NVIDIA GPUs to stdout.
///
/// Calls nvidia_gpu_get_energy_data() once for every socket index in
/// [0, nsockets), where the socket count is obtained from
/// variorum_get_topology() when built with VARIORUM_WITH_NVIDIA_GPU.
/// Without that macro the count stays 0 and nothing is printed.
///
/// @param [in] long_ver Forwarded unchanged as the verbosity flag of
///                      nvidia_gpu_get_energy_data().
///
/// @return Always 0.
int volta_get_energy(int long_ver)
{
    unsigned num_sockets = 0;
    unsigned socket_idx = 0;
    char *log_env = getenv("VARIORUM_LOG");

    // Optional call tracing, enabled with VARIORUM_LOG=1.
    if (log_env != NULL && atoi(log_env) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

#ifdef VARIORUM_WITH_NVIDIA_GPU
    variorum_get_topology(&num_sockets, NULL, NULL, P_NVIDIA_GPU_IDX);
#endif

    while (socket_idx < num_sockets)
    {
        nvidia_gpu_get_energy_data(socket_idx, long_ver, stdout);
        socket_idx++;
    }

    return 0;
}

/// @brief Populate a JSON object with the energy readings of all NVIDIA GPUs.
///
/// Calls nvidia_gpu_get_energy_json() once for every socket index in
/// [0, nsockets), where the socket count is obtained from
/// variorum_get_topology().
///
/// @param [in,out] get_energy_obj JSON object handed to
///                                nvidia_gpu_get_energy_json() for each socket.
///
/// @return Always 0.
int volta_get_energy_json(json_t *get_energy_obj)
{
    char *val = getenv("VARIORUM_LOG");
    if (val != NULL && atoi(val) == 1)
    {
        printf("Running %s\n", __FUNCTION__);
    }

    unsigned iter = 0;
    // Start from 0 so the loop below is a no-op (rather than reading an
    // indeterminate value) if the topology query does not fill in the count.
    unsigned nsockets = 0;
    variorum_get_topology(&nsockets, NULL, NULL, P_NVIDIA_GPU_IDX);

    for (iter = 0; iter < nsockets; iter++)
    {
        nvidia_gpu_get_energy_json(iter, get_energy_obj);
    }

    return 0;
}

7 changes: 7 additions & 0 deletions src/variorum/Nvidia_GPU/Volta.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,11 @@ int volta_get_gpu_utilization_json(
char **get_gpu_util_obj_str
);

/// @brief Print the energy readings of all NVIDIA GPUs to stdout.
///
/// @param [in] long_ver Verbosity flag passed through to the per-socket
///                      print routine.
///
/// @return 0.
int volta_get_energy(
int long_ver
);

/// @brief Populate a JSON object with the energy readings of all NVIDIA GPUs.
///
/// @param [in,out] get_energy_obj JSON object that receives the per-socket
///                                GPU energy entries.
///
/// @return 0.
int volta_get_energy_json(
    json_t *get_energy_obj
);
#endif
2 changes: 2 additions & 0 deletions src/variorum/Nvidia_GPU/config_nvidia.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ int set_nvidia_func_ptrs(int idx)
g_platform[idx].variorum_cap_each_gpu_power_limit =
volta_cap_each_gpu_power_limit;
g_platform[idx].variorum_get_power_json = volta_get_power_json;
g_platform[idx].variorum_print_energy = volta_get_energy;
g_platform[idx].variorum_get_energy_json = volta_get_energy_json;
}
else
{
Expand Down
102 changes: 102 additions & 0 deletions src/variorum/Nvidia_GPU/nvidia_gpu_power_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,105 @@ void nvidia_gpu_get_power_json(int chipid, json_t *get_power_obj)

}

/// @brief Print the total energy consumption of every GPU on one socket.
///
/// Reports the devices with indices
/// [chipid * m_gpus_per_socket, (chipid + 1) * m_gpus_per_socket).
/// Each reading is obtained from nvmlDeviceGetTotalEnergyConsumption()
/// and converted from millijoules to joules before printing.
///
/// @param [in] chipid  Socket index whose GPUs are reported.
/// @param [in] verbose Non-zero: one labelled line per GPU. Zero: columnar
///                     output, with the header row emitted only on the first
///                     non-verbose call.
/// @param [in] output  Stream the report is written to.
void nvidia_gpu_get_energy_data(int chipid, int verbose, FILE *output)
{
    unsigned long long energy;
    double value = 0.0;
    int d;
    static int init_output = 0;
    const int first_dev = chipid * (int)m_gpus_per_socket;
    const int end_dev = (chipid + 1) * (int)m_gpus_per_socket;

    // Iterate over all GPU device handles for this socket and print energy.
    for (d = first_dev; d < end_dev; ++d)
    {
        // Reset before each query so that a failed NVML call reports 0
        // instead of an indeterminate (or the previous device's) value.
        energy = 0;
        nvmlDeviceGetTotalEnergyConsumption(m_unit_devices_file_desc[d], &energy);

        // Convert from milliJoules to Joules. Divide in double precision;
        // a float scale factor would inject float rounding error.
        value = (double)energy / 1000.0;

        if (verbose)
        {
            // Energy is reported in Joules.
            fprintf(output, "%s: %s, %s: %d, %s: %d, %s: %lf J\n",
                    "_NVIDIA_GPU_ENERGY_USAGE Host", m_hostname,
                    "Socket", chipid,
                    "DeviceID", d, "Energy", value);
        }
        else
        {
            // Emit the column header once, ahead of the first data row.
            if (!init_output)
            {
#ifdef LIBJUSTIFY_FOUND
                cfprintf(output, "%s %s %s %s %s\n",
                         "_NVIDIA_GPU_ENERGY_USAGE", "Host",
                         "Socket", "DeviceID", "Energy");
#else
                fprintf(output, "%s %s %s %s %s\n",
                        "_NVIDIA_GPU_ENERGY_USAGE", "Host",
                        "Socket", "DeviceID", "Energy");
#endif
                init_output = 1;
            }
#ifdef LIBJUSTIFY_FOUND
            cfprintf(output, "%s %s %d %d %lf\n",
                     "_NVIDIA_GPU_ENERGY_USAGE", m_hostname, chipid, d, value);
#else
            fprintf(output, "%s %s %d %d %lf\n",
                    "_NVIDIA_GPU_ENERGY_USAGE", m_hostname, chipid, d, value);
#endif
        }
    }
}

/// @brief Add the energy readings of one socket's GPUs to a JSON object.
///
/// Writes the following into @p get_energy_obj:
///  - "num_gpus_per_socket": m_gpus_per_socket.
///  - "socket_<chipid>": created if absent; receives an
///    "energy_gpu_joules" object holding one "GPU_<d>" real per device.
///  - "energy_node_joules": if already present (and not built with
///    VARIORUM_WITH_IBM_CPU), increased by the sum of this socket's GPU
///    energies.
///
/// Each reading is obtained from nvmlDeviceGetTotalEnergyConsumption()
/// and converted from millijoules to joules.
///
/// @param [in]     chipid         Socket index whose GPUs are reported.
/// @param [in,out] get_energy_obj JSON object to populate.
void nvidia_gpu_get_energy_json(int chipid, json_t *get_energy_obj)
{
    unsigned long long gpu_energy;
    double value = 0.0;
    double total_gpu_energy = 0.0;
    int d;
    // Both buffers are long enough for their prefix plus any int value,
    // so the keys are never truncated.
    char devID[24];
    char socket_id[24];
    snprintf(socket_id, sizeof(socket_id), "socket_%d", chipid);

    json_object_set_new(get_energy_obj, "num_gpus_per_socket",
                        json_integer(m_gpus_per_socket));

    //try to find socket object in node object, set new object if not found
    json_t *socket_obj = json_object_get(get_energy_obj, socket_id);
    if (socket_obj == NULL)
    {
        socket_obj = json_object();
        json_object_set_new(get_energy_obj, socket_id, socket_obj);
    }

    //create new json object for GPU
    json_t *gpu_obj = json_object();
    json_object_set_new(socket_obj, "energy_gpu_joules", gpu_obj);

    for (d = chipid * (int)m_gpus_per_socket;
         d < (chipid + 1) * (int)m_gpus_per_socket; ++d)
    {
        // Reset before each query so that a failed NVML call contributes 0
        // instead of an indeterminate (or the previous device's) value.
        gpu_energy = 0;
        nvmlDeviceGetTotalEnergyConsumption(m_unit_devices_file_desc[d], &gpu_energy);

        // Convert from milliJoules to Joules in double precision.
        value = (double)gpu_energy / 1000.0;
        snprintf(devID, sizeof(devID), "GPU_%d", d);
        json_object_set_new(gpu_obj, devID, json_real(value));
        total_gpu_energy += value;
    }

    // If the object already carries energy_node_joules, add the GPU energy
    // of this socket to it. This is skipped when built with
    // VARIORUM_WITH_IBM_CPU, where the GPU values must not be added in
    // separately.

#ifndef VARIORUM_WITH_IBM_CPU
    json_t *node_energy_obj = json_object_get(get_energy_obj,
                              "energy_node_joules");
    if (node_energy_obj != NULL)
    {
        double energy_node = json_real_value(node_energy_obj);
        // Use the reference-stealing setter: the freshly created real has
        // no other owner, so json_object_set() would leak it.
        json_object_set_new(get_energy_obj, "energy_node_joules",
                            json_real(energy_node + total_gpu_energy));
    }
#endif

}
11 changes: 11 additions & 0 deletions src/variorum/Nvidia_GPU/nvidia_gpu_power_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,15 @@ void nvidia_gpu_get_power_json(
json_t *output
);

/// @brief Print the total energy consumption (in joules) of every GPU on
///        one socket.
///
/// @param [in] chipid  Socket index whose GPUs are reported.
/// @param [in] verbose Non-zero selects labelled one-line-per-GPU output;
///                     zero selects columnar output with a one-time header.
/// @param [in] output  Stream the report is written to.
void nvidia_gpu_get_energy_data(
int chipid,
int verbose,
FILE *output
);

/// @brief Add the energy readings (in joules) of one socket's GPUs to a
///        JSON object.
///
/// @param [in]     chipid Socket index whose GPUs are reported.
/// @param [in,out] output JSON object that receives the "socket_<chipid>"
///                        entry with its per-GPU "energy_gpu_joules" values.
void nvidia_gpu_get_energy_json(
int chipid,
json_t *output
);

#endif
Loading
Loading