Permalink
Fetching contributors…
Cannot retrieve contributors at this time
645 lines (575 sloc) 16.3 KB
/*
* Copyright (C) 2006, Intel Corporation
* Copyright (C) 2012, Neil Horman <nhorman@tuxdriver.com>
*
* This file is part of irqbalance
*
* This program file is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program in a file named COPYING; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301 USA
*/
#include "config.h"
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/time.h>
#include <syslog.h>
#include <unistd.h>
#include <signal.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <fcntl.h>
#ifdef HAVE_GETOPT_LONG
#include <getopt.h>
#endif
#ifdef HAVE_LIBCAP_NG
#include <cap-ng.h>
#endif
#include "irqbalance.h"
volatile int keep_going = 1;
int socket_fd;
char socket_name[64];
int one_shot_mode;
int debug_mode;
int foreground_mode;
int numa_avail;
int journal_logging = 0;
int need_rescan;
unsigned int log_mask = TO_ALL;
const char *log_indent;
unsigned long power_thresh = ULONG_MAX;
unsigned long deepest_cache = 2;
unsigned long long cycle_count = 0;
char *pidfile = NULL;
char *polscript = NULL;
long HZ;
int sleep_interval = SLEEP_INTERVAL;
GMainLoop *main_loop;
char *banned_cpumask_from_ui = NULL;
static void sleep_approx(int seconds)
{
struct timespec ts;
struct timeval tv;
gettimeofday(&tv, NULL);
ts.tv_sec = seconds;
ts.tv_nsec = -tv.tv_usec*1000;
while (ts.tv_nsec < 0) {
ts.tv_sec--;
ts.tv_nsec += 1000000000;
}
nanosleep(&ts, NULL);
}
#ifdef HAVE_GETOPT_LONG
struct option lopts[] = {
{"oneshot", 0, NULL, 'o'},
{"debug", 0, NULL, 'd'},
{"foreground", 0, NULL, 'f'},
{"powerthresh", 1, NULL, 'p'},
{"banirq", 1 , NULL, 'i'},
{"deepestcache", 1, NULL, 'c'},
{"policyscript", 1, NULL, 'l'},
{"pid", 1, NULL, 's'},
{"journal", 0, NULL, 'j'},
{"banmod", 1 , NULL, 'm'},
{"interval", 1 , NULL, 't'},
{"version", 0, NULL, 'V'},
{0, 0, 0, 0}
};
static void usage(void)
{
log(TO_CONSOLE, LOG_INFO, "irqbalance [--oneshot | -o] [--debug | -d] [--foreground | -f] [--journal | -j]\n");
log(TO_CONSOLE, LOG_INFO, " [--powerthresh= | -p <off> | <n>] [--banirq= | -i <n>] [--banmod= | -m <module>] [--policyscript= | -l <script>]\n");
log(TO_CONSOLE, LOG_INFO, " [--pid= | -s <file>] [--deepestcache= | -c <n>] [--interval= | -t <n>]\n");
}
static void version(void)
{
log(TO_CONSOLE, LOG_INFO, "irqbalance version " VERSION "\n");
}
static void parse_command_line(int argc, char **argv)
{
int opt;
int longind;
unsigned long val;
while ((opt = getopt_long(argc, argv,
"odfji:p:s:c:l:m:t:V",
lopts, &longind)) != -1) {
switch(opt) {
case '?':
usage();
exit(1);
break;
case 'V':
version();
exit(1);
break;
case 'c':
deepest_cache = strtoul(optarg, NULL, 10);
if (deepest_cache == ULONG_MAX || deepest_cache < 1) {
usage();
exit(1);
}
break;
case 'd':
debug_mode=1;
foreground_mode=1;
break;
case 'f':
foreground_mode=1;
break;
case 'i':
val = strtoull(optarg, NULL, 10);
if (val == ULONG_MAX) {
usage();
exit(1);
}
add_cl_banned_irq((int)val);
break;
case 'l':
polscript = strdup(optarg);
break;
case 'm':
add_cl_banned_module(optarg);
break;
case 'p':
if (!strncmp(optarg, "off", strlen(optarg)))
power_thresh = ULONG_MAX;
else {
power_thresh = strtoull(optarg, NULL, 10);
if (power_thresh == ULONG_MAX) {
usage();
exit(1);
}
}
break;
case 'o':
one_shot_mode=1;
break;
case 's':
pidfile = optarg;
break;
case 'j':
journal_logging=1;
foreground_mode=1;
break;
case 't':
sleep_interval = strtol(optarg, NULL, 10);
if (sleep_interval < 1) {
usage();
exit(1);
}
break;
}
}
}
#endif
/*
* This builds our object tree. The Heirarchy is typically pretty
* straightforward.
* At the top are numa_nodes
* CPU packages belong to a single numa_node, unless the cache domains are in
* separate nodes. In that case, the cache domain's parent is the package, but
* the numa nodes point to the cache domains instead of the package as their
* children. This allows us to maintain the CPU hierarchy while adjusting for
* alternate memory topologies that are present on recent processor.
* All Cache domains belong to a CPU package
* All CPU cores belong to a cache domain
*
* Objects are built in that order (top down)
*
* Object workload is the aggregate sum of the
* workload of the objects below it
*/
static void build_object_tree(void)
{
build_numa_node_list();
parse_cpu_tree();
rebuild_irq_db();
}
static void free_object_tree(void)
{
free_numa_node_list();
clear_cpu_tree();
free_irq_db();
}
static void dump_object_tree(void)
{
for_each_object(numa_nodes, dump_numa_node_info, NULL);
}
static void force_rebalance_irq(struct irq_info *info, void *data __attribute__((unused)))
{
if (info->level == BALANCE_NONE)
return;
if (info->assigned_obj == NULL)
rebalance_irq_list = g_list_append(rebalance_irq_list, info);
else
migrate_irq(&info->assigned_obj->interrupts, &rebalance_irq_list, info);
info->assigned_obj = NULL;
}
gboolean handler(gpointer data __attribute__((unused)))
{
keep_going = 0;
g_main_loop_quit(main_loop);
return TRUE;
}
gboolean force_rescan(gpointer data __attribute__((unused)))
{
if (cycle_count)
need_rescan = 1;
return TRUE;
}
gboolean scan(gpointer data)
{
log(TO_CONSOLE, LOG_INFO, "\n\n\n-----------------------------------------------------------------------------\n");
clear_work_stats();
parse_proc_interrupts();
/* cope with cpu hotplug -- detected during /proc/interrupts parsing */
if (need_rescan) {
need_rescan = 0;
cycle_count = 0;
log(TO_CONSOLE, LOG_INFO, "Rescanning cpu topology \n");
clear_work_stats();
free_object_tree();
build_object_tree();
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
parse_proc_stat();
sleep_approx(sleep_interval);
clear_work_stats();
parse_proc_interrupts();
}
parse_proc_stat();
if (cycle_count)
update_migration_status();
calculate_placement();
activate_mappings();
if (debug_mode)
dump_tree();
if (one_shot_mode)
keep_going = 0;
cycle_count++;
if (data != &sleep_interval) {
data = &sleep_interval;
g_timeout_add_seconds(sleep_interval, scan, data);
return FALSE;
}
if (keep_going) {
return TRUE;
} else {
g_main_loop_quit(main_loop);
return FALSE;
}
}
void get_irq_data(struct irq_info *irq, void *data)
{
char **irqdata = (char **)data;
if (!*irqdata)
*irqdata = calloc(24 + 1 + 11 + 20 + 20 + 11, 1);
else
*irqdata = realloc(*irqdata, strlen(*irqdata) + 24 + 1 + 11 + 20 + 20 + 11);
sprintf(*irqdata + strlen(*irqdata),
"IRQ %d LOAD %lu DIFF %lu CLASS %d ", irq->irq, irq->load,
(irq->irq_count - irq->last_irq_count), irq->class);
}
void get_object_stat(struct topo_obj *object, void *data)
{
char **stats = (char **)data;
char *irq_data = NULL;
size_t irqdlen;
if (g_list_length(object->interrupts) > 0) {
for_each_irq(object->interrupts, get_irq_data, &irq_data);
}
irqdlen = irq_data ? strlen(irq_data) : 0;
/*
* Note, the size in both conditional branches below is made up as follows:
* strlen(irq_data) - self explanitory
* 31 - The size of "TYPE NUMBER LOAD SAVE_MODE "
* 11 - The maximal size of a %d printout
* 20 - The maximal size of a %lu printout
* 1 - The trailing string terminator
* This should be adjusted if the string in the sprintf is changed
*/
if (!*stats) {
*stats = calloc(irqdlen + 31 + 11 + 20 + 11 + 1, 1);
} else {
*stats = realloc(*stats, strlen(*stats) + irqdlen + 31 + 11 + 20 + 11 + 1);
}
sprintf(*stats + strlen(*stats), "TYPE %d NUMBER %d LOAD %lu SAVE_MODE %d %s",
object->obj_type, object->number, object->load,
object->powersave_mode, irq_data ? irq_data : "");
free(irq_data);
if (object->obj_type != OBJ_TYPE_CPU) {
for_each_object(object->children, get_object_stat, data);
}
}
gboolean sock_handle(gint fd, GIOCondition condition, gpointer user_data __attribute__((unused)))
{
char buff[500];
int sock;
int recv_size = 0;
int valid_user = 0;
struct iovec iov = { buff, 500 };
struct msghdr msg = { 0 };
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = malloc(CMSG_SPACE(sizeof(struct ucred)));
msg.msg_controllen = CMSG_SPACE(sizeof(struct ucred));
struct cmsghdr *cmsg;
if (condition == G_IO_IN) {
sock = accept(fd, NULL, NULL);
if (sock < 0) {
log(TO_ALL, LOG_WARNING, "Connection couldn't be accepted.\n");
goto out;
}
if ((recv_size = recvmsg(sock, &msg, 0)) < 0) {
log(TO_ALL, LOG_WARNING, "Error while receiving data.\n");
goto out_close;
}
cmsg = CMSG_FIRSTHDR(&msg);
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_CREDENTIALS)) {
struct ucred *credentials = (struct ucred *) CMSG_DATA(cmsg);
if (!credentials->uid) {
valid_user = 1;
}
}
if (!valid_user) {
log(TO_ALL, LOG_INFO, "Permission denied for user to connect to socket.\n");
goto out_close;
}
if (!strncmp(buff, "stats", strlen("stats"))) {
char *stats = NULL;
for_each_object(numa_nodes, get_object_stat, &stats);
send(sock, stats, strlen(stats), 0);
free(stats);
}
if (!strncmp(buff, "settings ", strlen("settings "))) {
if (!(strncmp(buff + strlen("settings "), "sleep ",
strlen("sleep ")))) {
char *sleep_string = malloc(
sizeof(char) * (recv_size - strlen("settings sleep ")));
strncpy(sleep_string, buff + strlen("settings sleep "),
recv_size - strlen("settings sleep "));
int new_iterval = strtoul(sleep_string, NULL, 10);
if (new_iterval >= 1) {
sleep_interval = new_iterval;
}
free(sleep_string);
} else if (!(strncmp(buff + strlen("settings "), "ban irqs ",
strlen("ban irqs ")))) {
char *end;
char *irq_string = malloc(
sizeof(char) * (recv_size - strlen("settings ban irqs ")));
strncpy(irq_string, buff + strlen("settings ban irqs "),
recv_size - strlen("settings ban irqs "));
g_list_free_full(cl_banned_irqs, free);
cl_banned_irqs = NULL;
need_rescan = 1;
if (!strncmp(irq_string, "NONE", strlen("NONE"))) {
free(irq_string);
goto out_close;
}
int irq = strtoul(irq_string, &end, 10);
do {
add_cl_banned_irq(irq);
} while((irq = strtoul(end, &end, 10)));
free(irq_string);
} else if (!(strncmp(buff + strlen("settings "), "cpus ",
strlen("cpus")))) {
char *cpu_ban_string = malloc(
sizeof(char) * (recv_size - strlen("settings cpus ")));
strncpy(cpu_ban_string, buff + strlen("settings cpus "),
recv_size - strlen("settings cpus "));
banned_cpumask_from_ui = strtok(cpu_ban_string, " ");
if (!strncmp(banned_cpumask_from_ui, "NULL", strlen("NULL"))) {
banned_cpumask_from_ui = NULL;
}
need_rescan = 1;
free(cpu_ban_string);
}
}
if (!strncmp(buff, "setup", strlen("setup"))) {
char banned[512];
char *setup = calloc(strlen("SLEEP ") + 11 + 1, 1);
snprintf(setup, strlen("SLEEP ") + 11 + 1, "SLEEP %d ", sleep_interval);
if(g_list_length(cl_banned_irqs) > 0) {
for_each_irq(cl_banned_irqs, get_irq_data, setup);
}
cpumask_scnprintf(banned, 512, banned_cpus);
setup = realloc(setup, strlen(setup) + strlen(banned) + 7 + 1);
snprintf(setup + strlen(setup), strlen(banned) + 7 + 1,
"BANNED %s", banned);
send(sock, setup, strlen(setup), 0);
free(setup);
}
out_close:
close(sock);
}
out:
free(msg.msg_control);
return TRUE;
}
int init_socket()
{
struct sockaddr_un addr;
memset(&addr, 0, sizeof(struct sockaddr_un));
socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0);
if (socket_fd < 0) {
log(TO_ALL, LOG_WARNING, "Socket couldn't be created.\n");
return 1;
}
/*
* First try to create a file-based socket in tmpfs. If that doesn't
* succeed, fall back to an abstract socket (non file-based).
*/
addr.sun_family = AF_UNIX;
snprintf(socket_name, 64, "%s/%s%d.sock", SOCKET_TMPFS, SOCKET_PATH, getpid());
strncpy(addr.sun_path, socket_name, strlen(socket_name));
if (bind(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
log(TO_ALL, LOG_WARNING, "Daemon couldn't be bound to the file-based socket.\n");
/* Try binding to abstract */
memset(&addr, 0, sizeof(struct sockaddr_un));
addr.sun_family = AF_UNIX;
if (bind(socket_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
log(TO_ALL, LOG_WARNING, "Daemon couldn't be bound to the abstract socket, bailing out.\n");
return 1;
}
}
int optval = 1;
if (setsockopt(socket_fd, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) < 0) {
log(TO_ALL, LOG_WARNING, "Unable to set socket options.\n");
return 1;
}
listen(socket_fd, 1);
g_unix_fd_add(socket_fd, G_IO_IN, sock_handle, NULL);
return 0;
}
int main(int argc, char** argv)
{
sigset_t sigset, old_sigset;
sigemptyset(&sigset);
sigaddset(&sigset,SIGINT);
sigaddset(&sigset,SIGHUP);
sigaddset(&sigset,SIGTERM);
sigaddset(&sigset,SIGUSR1);
sigaddset(&sigset,SIGUSR2);
sigprocmask(SIG_BLOCK, &sigset, &old_sigset);
#ifdef HAVE_GETOPT_LONG
parse_command_line(argc, argv);
#else /* ! HAVE_GETOPT_LONG */
if (argc>1 && strstr(argv[1],"--debug")) {
debug_mode=1;
foreground_mode=1;
}
if (argc>1 && strstr(argv[1],"--foreground"))
foreground_mode=1;
if (argc>1 && strstr(argv[1],"--oneshot"))
one_shot_mode=1;
if (argc>1 && strstr(argv[1],"--journal")) {
journal_logging=1;
foreground_mode=1;
}
#endif /* HAVE_GETOPT_LONG */
/*
* Open the syslog connection
*/
openlog(argv[0], 0, LOG_DAEMON);
if (getenv("IRQBALANCE_ONESHOT"))
one_shot_mode=1;
if (getenv("IRQBALANCE_DEBUG")) {
debug_mode=1;
foreground_mode=1;
}
/*
* If we are't in debug mode, don't dump anything to the console
* note that everything goes to the console before we check this
*/
if (journal_logging)
log_indent = "....";
else
log_indent = " ";
if (!debug_mode)
log_mask &= ~TO_CONSOLE;
if (numa_available() > -1) {
numa_avail = 1;
} else
log(TO_CONSOLE, LOG_INFO, "This machine seems not NUMA capable.\n");
if (geteuid() != 0)
log(TO_ALL, LOG_WARNING, "Irqbalance hasn't been executed under root privileges, thus it won't in fact balance interrupts.\n");
HZ = sysconf(_SC_CLK_TCK);
if (HZ == -1) {
log(TO_ALL, LOG_WARNING, "Unable to determin HZ defaulting to 100\n");
HZ = 100;
}
build_object_tree();
if (debug_mode)
dump_object_tree();
/* On single core UP systems irqbalance obviously has no work to do */
if (core_count<2) {
char *msg = "Balancing is ineffective on systems with a "
"single cpu. Shutting down\n";
log(TO_ALL, LOG_WARNING, "%s", msg);
exit(EXIT_SUCCESS);
}
if (!foreground_mode) {
int pidfd = -1;
if (daemon(0,0))
exit(EXIT_FAILURE);
/* Write pidfile */
if (pidfile && (pidfd = open(pidfile,
O_WRONLY | O_CREAT | O_EXCL | O_TRUNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) >= 0) {
char str[16];
snprintf(str, sizeof(str), "%u\n", getpid());
write(pidfd, str, strlen(str));
close(pidfd);
}
}
g_unix_signal_add(SIGINT, handler, NULL);
g_unix_signal_add(SIGTERM, handler, NULL);
g_unix_signal_add(SIGUSR1, handler, NULL);
g_unix_signal_add(SIGUSR2, handler, NULL);
g_unix_signal_add(SIGHUP, force_rescan, NULL);
sigprocmask(SIG_SETMASK, &old_sigset, NULL);
#ifdef HAVE_LIBCAP_NG
// Drop capabilities
capng_clear(CAPNG_SELECT_BOTH);
capng_lock();
capng_apply(CAPNG_SELECT_BOTH);
#endif
for_each_irq(NULL, force_rebalance_irq, NULL);
parse_proc_interrupts();
parse_proc_stat();
if (init_socket()) {
return EXIT_FAILURE;
}
main_loop = g_main_loop_new(NULL, FALSE);
int *last_interval = &sleep_interval;
g_timeout_add_seconds(sleep_interval, scan, last_interval);
g_main_loop_run(main_loop);
g_main_loop_quit(main_loop);
free_object_tree();
free_cl_opts();
/* Remove pidfile */
if (!foreground_mode && pidfile)
unlink(pidfile);
/* Remove socket */
close(socket_fd);
if (socket_name[0])
unlink(socket_name);
return EXIT_SUCCESS;
}