Permalink
Fetching contributors…
Cannot retrieve contributors at this time
876 lines (728 sloc) 18.4 KB
#include "config.h"
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <dirent.h>
#include <assert.h>
#include <errno.h>
#include "irqbalance.h"
#include "types.h"
/*
 * Human-readable names of the IRQ classes; the table ends with a 0 entry.
 * NOTE(review): the order presumably mirrors the numeric IRQ_* class
 * constants - confirm against types.h.
 */
char *classes[] = {
"other",
"legacy",
"storage",
"video",
"ethernet",
"gbit-ethernet",
"10gbit-ethernet",
"virt-event",
0
};
/*
 * Default balance level for each IRQ class. The table is indexed with an
 * IRQ class value and is used when no user policy supplies a level.
 */
static int map_class_to_level[8] =
{ BALANCE_PACKAGE, BALANCE_CACHE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE, BALANCE_CORE };
struct user_irq_policy {
int ban;
int level;
int numa_node_set;
int numa_node;
};
/* irq_info entries for every IRQ added through add_one_irq_to_db() */
static GList *interrupts_db = NULL;
/* irq_info entries for IRQs banned while the database is being built */
static GList *banned_irqs = NULL;
/* irq_info entries for IRQs banned on the command line (add_cl_banned_irq) */
GList *cl_banned_irqs = NULL;
/* strdup()ed module names banned via add_cl_banned_module() */
static GList *cl_banned_modules = NULL;
/* Path handed to policy scripts for IRQs that have no sysfs device entry */
#define SYSFS_DIR "/sys"
/* Directory holding one sub-directory per PCI device */
#define SYSPCI_DIR "/sys/bus/pci/devices"
/* Number of entries in the major class table of map_pci_irq_class() */
#define PCI_MAX_CLASS 0x14
/* Number of entries in the serial-bus sub-class table of map_pci_irq_class() */
#define PCI_MAX_SERIAL_SUBCLASS 0x81
/* Sentinel returned by read_pci_data() when no value could be read */
#define PCI_INVAL_DATA 0xFFFFFFFF
struct pci_info {
unsigned short vendor;
unsigned short device;
unsigned short sub_vendor;
unsigned short sub_device;
unsigned int class;
};
/*
 * IDs matched by apply_pci_quirks() to recognise devices whose IRQ class
 * has to be overridden.
 */
/* PCI vendor ID, device ID */
#define PCI_VENDOR_PLX 0x10b5
#define PCI_DEVICE_PLX_PEX8619 0x8619
#define PCI_VENDOR_CAVIUM 0x177d
#define PCI_DEVICE_CAVIUM_CN61XX 0x0093
/* PCI subsystem vendor ID, subsystem device ID */
#define PCI_SUB_VENDOR_EMC 0x1120
#define PCI_SUB_DEVICE_EMC_055B 0x055b
#define PCI_SUB_DEVICE_EMC_0568 0x0568
#define PCI_SUB_DEVICE_EMC_dd00 0xdd00
/*
 * Override the IRQ class of devices whose PCI class code cannot be trusted.
 *
 * Two kinds of hardware end up here:
 *
 * 1. Special devices
 *    They shipped before the PCI spec defined a class code for them.
 *
 * 2. Buggy devices
 *    They do not follow the PCI class code definitions.
 *
 * @pci:       identification data of the device being classified
 * @irq_class: in/out; rewritten to IRQ_SCSI only when a quirk matches
 */
static void apply_pci_quirks(const struct pci_info *pci, int *irq_class)
{
	const int emc_subsystem = (pci->sub_vendor == PCI_SUB_VENDOR_EMC);

	/* PLX PEX8619 carrying one of two EMC subsystem device IDs */
	if (emc_subsystem &&
	    pci->vendor == PCI_VENDOR_PLX &&
	    pci->device == PCI_DEVICE_PLX_PEX8619) {
		if (pci->sub_device == PCI_SUB_DEVICE_EMC_055B ||
		    pci->sub_device == PCI_SUB_DEVICE_EMC_dd00)
			*irq_class = IRQ_SCSI;
	}

	/* Cavium CN61XX carrying the EMC 0x0568 subsystem device ID */
	if (emc_subsystem &&
	    pci->vendor == PCI_VENDOR_CAVIUM &&
	    pci->device == PCI_DEVICE_CAVIUM_CN61XX) {
		if (pci->sub_device == PCI_SUB_DEVICE_EMC_0568)
			*irq_class = IRQ_SCSI;
	}
}
/*
 * Determine the IRQ class from a PCI class code.
 *
 * @pci_class: value of the sysfs "class" attribute; the major class is
 *             taken from bits 16 and up, the sub-class from bits 8-15.
 *
 * Returns the mapped IRQ_* class, or IRQ_NODEF when the major class (or,
 * for serial bus controllers, the sub-class) is outside the tables.
 */
static int map_pci_irq_class(unsigned int pci_class)
{
	const unsigned int major = pci_class >> 16;
	const unsigned int sub = (pci_class & 0xFF00) >> 8;

	/*
	 * Class codes lifted from below PCI-SIG spec:
	 *
	 * PCI Code and ID Assignment Specification v1.5
	 *
	 * and mapped to irqbalance types here.
	 *
	 * IRQ_NODEF will go through classification by PCI sub-class code.
	 */
	static short major_class_codes[PCI_MAX_CLASS] = {
		IRQ_OTHER,	/* 0x00 */
		IRQ_SCSI,	/* 0x01 */
		IRQ_ETH,	/* 0x02 */
		IRQ_VIDEO,	/* 0x03 */
		IRQ_OTHER,	/* 0x04 */
		IRQ_OTHER,	/* 0x05 */
		IRQ_LEGACY,	/* 0x06 */
		IRQ_OTHER,	/* 0x07 */
		IRQ_OTHER,	/* 0x08 */
		IRQ_LEGACY,	/* 0x09 */
		IRQ_OTHER,	/* 0x0a */
		IRQ_OTHER,	/* 0x0b */
		IRQ_NODEF,	/* 0x0c: resolved through serial_sub_codes */
		IRQ_ETH,	/* 0x0d */
		IRQ_SCSI,	/* 0x0e */
		IRQ_OTHER,	/* 0x0f */
		IRQ_OTHER,	/* 0x10 */
		IRQ_OTHER,	/* 0x11 */
		IRQ_LEGACY,	/* 0x12 */
		IRQ_LEGACY,	/* 0x13 */
	};

	/*
	 * All sub-class code for serial bus controllers.
	 * The major class code is 0xc.
	 */
	static short serial_sub_codes[PCI_MAX_SERIAL_SUBCLASS] = {
		IRQ_LEGACY,	/* 0x00 */
		IRQ_LEGACY,	/* 0x01 */
		IRQ_LEGACY,	/* 0x02 */
		IRQ_LEGACY,	/* 0x03 */
		IRQ_SCSI,	/* 0x04 */
		IRQ_LEGACY,	/* 0x05 */
		IRQ_SCSI,	/* 0x06 */
		IRQ_LEGACY,	/* 0x07 */
		IRQ_LEGACY,	/* 0x08 */
		IRQ_LEGACY,	/* 0x09 */
		[0xa ... 0x7f] = IRQ_NODEF,
		IRQ_LEGACY,	/* 0x80 */
	};

	/* Reject major class codes beyond the table */
	if (major >= PCI_MAX_CLASS)
		return IRQ_NODEF;

	/* Every class except the serial bus one maps directly */
	if (major != 0xc)
		return major_class_codes[major];

	/* Serial bus class: the sub-class decides */
	if (sub >= PCI_MAX_SERIAL_SUBCLASS)
		return IRQ_NODEF;

	return serial_sub_codes[sub];
}
/*
 * Read one hexadecimal value from a sysfs attribute of a PCI device.
 *
 * @devpath: sysfs directory of the device
 * @file:    attribute file name inside @devpath (e.g. "vendor")
 *
 * Returns the parsed value, or PCI_INVAL_DATA when the path does not fit
 * into the path buffer, the file cannot be opened, or no hexadecimal
 * number could be parsed from it.
 */
static unsigned int read_pci_data(const char *devpath, const char* file)
{
	char path[PATH_MAX];
	FILE *fd;
	unsigned int data = PCI_INVAL_DATA;
	int len;

	/* Bounded formatting: never write past the end of path[] */
	len = snprintf(path, sizeof(path), "%s/%s", devpath, file);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		log(TO_CONSOLE, LOG_WARNING, "PCI: path too long:%s/%s\n", devpath, file);
		return PCI_INVAL_DATA;
	}

	fd = fopen(path, "r");
	if (!fd) {
		log(TO_CONSOLE, LOG_WARNING, "PCI: can't open file:%s\n", path);
		return PCI_INVAL_DATA;
	}

	/* An unparsable file is reported exactly like an unreadable one */
	if (fscanf(fd, "%x", &data) != 1)
		data = PCI_INVAL_DATA;
	fclose(fd);

	return data;
}
/*
 * Collect the PCI identification data needed for IRQ classification.
 *
 * Reads "vendor", "device", "subsystem_vendor", "subsystem_device" and
 * "class" from @devpath, in that order, into @pci.
 *
 * Returns 0 on success, or -ENODEV as soon as one attribute cannot be
 * read; fields read before the failure have already been stored.
 */
static int get_pci_info(const char *devpath, struct pci_info *pci)
{
	unsigned int raw;

	raw = read_pci_data(devpath, "vendor");
	if (raw == PCI_INVAL_DATA)
		return -ENODEV;
	pci->vendor = (unsigned short)raw;

	raw = read_pci_data(devpath, "device");
	if (raw == PCI_INVAL_DATA)
		return -ENODEV;
	pci->device = (unsigned short)raw;

	raw = read_pci_data(devpath, "subsystem_vendor");
	if (raw == PCI_INVAL_DATA)
		return -ENODEV;
	pci->sub_vendor = (unsigned short)raw;

	raw = read_pci_data(devpath, "subsystem_device");
	if (raw == PCI_INVAL_DATA)
		return -ENODEV;
	pci->sub_device = (unsigned short)raw;

	raw = read_pci_data(devpath, "class");
	if (raw == PCI_INVAL_DATA)
		return -ENODEV;
	pci->class = raw;

	return 0;
}
/*
 * Work out the IRQ class of the PCI device found at @devpath.
 *
 * Returns the class from map_pci_irq_class(), possibly overridden by
 * apply_pci_quirks(), or IRQ_NODEF when the PCI data cannot be read or
 * the class code maps to a negative value.
 */
static int get_irq_class(const char *devpath)
{
	struct pci_info pci;
	int irq_class;

	/* No PCI data in sysfs: nothing to classify */
	if (get_pci_info(devpath, &pci) < 0)
		return IRQ_NODEF;

	irq_class = map_pci_irq_class(pci.class);
	if (irq_class >= 0) {
		/* Known class: let the quirk table have the final say */
		apply_pci_quirks(&pci, &irq_class);
		return irq_class;
	}

	log(TO_CONSOLE, LOG_WARNING, "Invalid PCI class code %d\n",
	    pci.class);
	return IRQ_NODEF;
}
/*
 * Comparison callback for GList lookups over irq_info entries.
 * Returns the difference of the two irq numbers, i.e. 0 when they match.
 */
static gint compare_ints(gconstpointer a, gconstpointer b)
{
	const struct irq_info *lhs = a, *rhs = b;

	return lhs->irq - rhs->irq;
}
/*
 * Append a banned entry for @irq to @list unless one is already there.
 * The new irq_info is zero-initialised and gets IRQ_FLAG_BANNED set.
 * An allocation failure is logged and the irq is left unlisted.
 */
static void add_banned_irq(int irq, GList **list)
{
	struct irq_info key;
	struct irq_info *banned;

	key.irq = irq;
	if (g_list_find_custom(*list, &key, compare_ints) != NULL)
		return;

	banned = calloc(sizeof(struct irq_info), 1);
	if (banned == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "No memory to ban irq %d\n", irq);
		return;
	}

	banned->irq = irq;
	banned->flags |= IRQ_FLAG_BANNED;
	*list = g_list_append(*list, banned);

	log(TO_CONSOLE, LOG_INFO, "IRQ %d was BANNED.\n", irq);
}
/*
 * Record irq in cl_banned_irqs, the list check_for_irq_ban() consults for
 * IRQs banned on the command line. An irq already listed is not added again.
 */
void add_cl_banned_irq(int irq)
{
add_banned_irq(irq, &cl_banned_irqs);
}
static int is_banned_irq(int irq)
{
GList *entry;
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(banned_irqs, &find, compare_ints);
return entry ? 1:0;
}
/*
 * Matcher passed to g_list_find_custom() for string lists: returns 0
 * (match) when string @a occurs anywhere inside string @b, 1 otherwise.
 */
gint substr_find(gconstpointer a, gconstpointer b)
{
	return strstr(b, a) != NULL ? 0 : 1;
}
/*
 * Append a private copy of @modname to @modlist.
 * Nothing is added when substr_find() already reports a match between an
 * existing entry and @modname, or when the copy cannot be allocated (the
 * latter is logged).
 */
static void add_banned_module(char *modname, GList **modlist)
{
	char *copy;

	if (g_list_find_custom(*modlist, modname, substr_find) != NULL)
		return;

	copy = strdup(modname);
	if (copy == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "No memory to ban module %s\n", modname);
		return;
	}

	*modlist = g_list_append(*modlist, copy);
}
/*
 * Record modname in cl_banned_modules, the list check_for_module_ban()
 * searches. The name is copied, so the caller keeps ownership of modname.
 */
void add_cl_banned_module(char *modname)
{
add_banned_module(modname, &cl_banned_modules);
}
/*
 * Inserts an irq_info struct into the interrupts_db list.
 *
 * @devpath: device directory in sysfs for the related device; NULL means
 *           there are no sysfs entries for this irq
 * @irq:     irq number
 * @pol:     user policy; a non-negative level overrides the class default
 *           and numa_node is used when numa_node_set is 1
 *
 * Returns the new entry, or NULL when the irq is already in the database,
 * is banned, or memory is exhausted.
 *
 * A NULL devpath is never formatted into a sysfs path: such irqs keep the
 * default class, get_numa_node(-1) and a full cpumask.
 */
static struct irq_info *add_one_irq_to_db(const char *devpath, int irq, struct user_irq_policy *pol)
{
	int irq_class = IRQ_OTHER;
	struct irq_info *new, find;
	int numa_node = -1;
	char path[PATH_MAX];
	FILE *fd = NULL;
	char *lcpu_mask = NULL;
	size_t blen = 0;
	ssize_t ret;
	int len;

	/*
	 * First check to make sure this isn't a duplicate entry
	 */
	find.irq = irq;
	if (g_list_find_custom(interrupts_db, &find, compare_ints)) {
		log(TO_CONSOLE, LOG_INFO, "DROPPING DUPLICATE ENTRY FOR IRQ %d on path %s\n",
		    irq, devpath ? devpath : "(null)");
		return NULL;
	}

	if (is_banned_irq(irq)) {
		log(TO_ALL, LOG_INFO, "SKIPPING BANNED IRQ %d\n", irq);
		return NULL;
	}

	new = calloc(1, sizeof(*new));
	if (!new)
		return NULL;

	new->irq = irq;
	new->class = IRQ_OTHER;
	interrupts_db = g_list_append(interrupts_db, new);

	/* Some special irqs have NULL devpath */
	if (devpath != NULL) {
		/* Map PCI class code to irq class */
		irq_class = get_irq_class(devpath);
		/*
		 * Unclassifiable device: class stays IRQ_OTHER and level keeps
		 * its zero-initialised value.
		 */
		if (irq_class < 0)
			goto set_numa_node;
	}

	new->class = irq_class;
	if (pol->level >= 0)
		new->level = pol->level;
	else
		new->level = map_class_to_level[irq_class];

set_numa_node:
	/* Only devices with a sysfs directory can report a NUMA node */
	if (devpath != NULL && numa_avail) {
		len = snprintf(path, sizeof(path), "%s/numa_node", devpath);
		if (len > 0 && (size_t)len < sizeof(path)) {
			fd = fopen(path, "r");
			if (fd) {
				if (fscanf(fd, "%d", &numa_node) != 1)
					numa_node = -1;
				fclose(fd);
			}
		}
	}

	if (pol->numa_node_set == 1)
		new->numa_node = get_numa_node(pol->numa_node);
	else
		new->numa_node = get_numa_node(numa_node);

	/* Without a readable local_cpus file every cpu is allowed */
	fd = NULL;
	if (devpath != NULL) {
		len = snprintf(path, sizeof(path), "%s/local_cpus", devpath);
		if (len > 0 && (size_t)len < sizeof(path))
			fd = fopen(path, "r");
	}
	if (!fd) {
		cpus_setall(new->cpumask);
		goto out;
	}

	/* lcpu_mask == NULL and blen == 0 ask getline() to allocate the buffer */
	ret = getline(&lcpu_mask, &blen, fd);
	fclose(fd);
	if (ret <= 0) {
		cpus_setall(new->cpumask);
	} else {
		cpumask_parse_user(lcpu_mask, ret, new->cpumask);
	}
	free(lcpu_mask);

out:
	log(TO_CONSOLE, LOG_INFO, "Adding IRQ %d to database\n", irq);
	return new;
}
/*
 * Parse one "key=value" line printed by a policy script and store the
 * result in @pol.
 *
 * @buf: the line; it is modified in place (the '=' and the first newline
 *       after it are replaced by NUL)
 * @irq: irq number, used for log messages only
 * @pol: policy to update
 *
 * Recognised keys (case-insensitive):
 *   ban           - "true" or "false"
 *   balance_level - "none", "package", "cache" or "core"
 *   numa_node     - node number, accepted only if get_numa_node() knows it
 * Anything else is logged and ignored.
 */
static void parse_user_policy_key(char *buf, int irq, struct user_irq_policy *pol)
{
	char *levelvals[] = { "none", "package", "cache", "core" };
	char *key = buf;
	char *value;
	char *newline;
	int idx;
	int key_set = 1;

	value = strchr(buf, '=');
	if (value == NULL) {
		log(TO_SYSLOG, LOG_WARNING, "Bad format for policy, ignoring: %s\n", buf);
		return;
	}

	/* Split the line at '=': key ends there, value starts right after */
	*value++ = '\0';

	/* Drop the trailing newline, if any */
	newline = strchr(value, '\n');
	if (newline != NULL)
		*newline = '\0';

	if (strcasecmp("ban", key) == 0) {
		if (strcasecmp("false", value) == 0) {
			pol->ban = 0;
		} else if (strcasecmp("true", value) == 0) {
			pol->ban = 1;
		} else {
			key_set = 0;
			log(TO_ALL, LOG_WARNING, "Unknown value for ban policy: %s\n", value);
		}
	} else if (strcasecmp("balance_level", key) == 0) {
		/* The position of the name in levelvals is the level */
		idx = 0;
		while (idx < 4 && strcasecmp(levelvals[idx], value) != 0)
			idx++;

		if (idx < 4) {
			pol->level = idx;
		} else {
			key_set = 0;
			log(TO_ALL, LOG_WARNING, "Bad value for balance_level policy: %s\n", value);
		}
	} else if (strcasecmp("numa_node", key) == 0) {
		idx = strtoul(value, NULL, 10);
		if (!get_numa_node(idx)) {
			log(TO_ALL, LOG_WARNING, "NUMA node %d doesn't exist\n",
			    idx);
			return;
		}
		pol->numa_node = idx;
		pol->numa_node_set = 1;
	} else {
		key_set = 0;
		log(TO_ALL, LOG_WARNING, "Unknown key returned, ignoring: %s\n", key);
	}

	if (key_set)
		log(TO_ALL, LOG_INFO, "IRQ %d: Override %s to %s\n", irq, key, value);
}
/*
 * Run one policy script as "exec <script> <path> <irq>" and feed every
 * line it prints to parse_user_policy_key().
 *
 * @script: script to execute
 * @path:   sysfs path handed to the script as first argument
 * @irq:    irq number handed to the script as second argument
 * @pol:    policy updated from the script output
 *
 * Returns the exit status of the script, 1 when the script could not be
 * started (tells the caller to ignore this script), or -1 when its
 * termination status cannot be collected or it did not exit normally
 * (e.g. it was killed by a signal).
 */
static int run_script_for_policy(char *script, char *path, int irq, struct user_irq_policy *pol)
{
	char *cmd;
	size_t cmdlen;
	FILE *output;
	char buffer[128];
	int status;

	/* 64 spare bytes cover "exec ", the separators and the irq number */
	cmdlen = strlen(path) + strlen(script) + 64;
	cmd = alloca(cmdlen);
	snprintf(cmd, cmdlen, "exec %s %s %d", script, path, irq);

	output = popen(cmd, "r");
	if (!output) {
		log(TO_ALL, LOG_WARNING, "Unable to execute user policy script %s\n", script);
		return 1; /* tell caller to ignore this script */
	}

	/*
	 * fgets() returns NULL on end of file and on read errors alike, so
	 * the loop always terminates.
	 */
	while (fgets(buffer, sizeof(buffer), output))
		parse_user_policy_key(buffer, irq, pol);

	status = pclose(output);
	/* WEXITSTATUS() is only meaningful after a normal exit */
	if (status == -1 || !WIFEXITED(status))
		return -1;

	return WEXITSTATUS(status);
}
/*
 * Calls out to a possibly user defined script to get user assigned policy
 * aspects for a given irq. A value of -1 in a given field indicates no
 * policy was given and that system defaults should be used.
 *
 * @path: sysfs path of the device, or NULL for an irq without sysfs
 *        entries (SYSFS_DIR is passed to the script in that case)
 * @irq:  irq number
 * @pol:  filled with the resulting policy
 *
 * polscript may name a single script or a directory. For a directory,
 * every executable regular file in it is tried in readdir() order until
 * one exits with status 0; a status of 1 means "not for this irq".
 *
 * NOTE(review): in directory mode, when no script is accepted, pol keeps
 * whatever the last script that ran printed - confirm this is intended.
 */
static void get_irq_user_policy(char *path, int irq, struct user_irq_policy *pol)
{
	struct stat sbuf;
	DIR *poldir;
	struct dirent *entry;
	int ret;
	char script[1024];

	memset(pol, -1, sizeof(struct user_irq_policy));

	/* Return defaults if no script was given */
	if (!polscript)
		return;

	if (stat(polscript, &sbuf))
		return;

	/* Use SYSFS_DIR for irq has no sysfs entries */
	if (!path)
		path = SYSFS_DIR;

	if (!S_ISDIR(sbuf.st_mode)) {
		if (run_script_for_policy(polscript, path, irq, pol) != 0) {
			log(TO_CONSOLE, LOG_ERR, "policy script returned non-zero code!  skipping user policy\n");
			memset(pol, -1, sizeof(struct user_irq_policy));
		}
		return;
	}

	/* polscript is a directory, user multiple script semantics */
	poldir = opendir(polscript);
	if (!poldir)
		return;

	while ((entry = readdir(poldir)) != NULL) {
		snprintf(script, sizeof(script), "%s/%s", polscript, entry->d_name);
		if (stat(script, &sbuf))
			continue;
		if (!S_ISREG(sbuf.st_mode))
			continue;
		if (!(sbuf.st_mode & S_IXUSR)) {
			log(TO_CONSOLE, LOG_DEBUG, "Skipping script %s due to lack of executable permission\n", script);
			continue;
		}

		/* Each script starts from a clean policy */
		memset(pol, -1, sizeof(struct user_irq_policy));
		ret = run_script_for_policy(script, path, irq, pol);
		if ((ret < 0) || (ret >= 2)) {
			log(TO_CONSOLE, LOG_ERR, "Error executing policy script %s : %d\n", script, ret);
			continue;
		}

		/* a ret of 1 means this script isn't for this irq */
		if (ret == 1)
			continue;

		log(TO_CONSOLE, LOG_DEBUG, "Accepting script %s to define policy for irq %d\n", script, irq);
		break;
	}

	/* Release the directory stream on every exit from the loop */
	closedir(poldir);
}
static int check_for_module_ban(char *name)
{
GList *entry;
entry = g_list_find_custom(cl_banned_modules, name, substr_find);
if (entry)
return 1;
else
return 0;
}
/*
 * Decide whether @irq is banned by command-line options.
 *
 * @path:            unused
 * @irq:             irq number to check
 * @proc_interrupts: list of irq_info entries whose name field is matched
 *                   against the banned modules
 *
 * Returns 1 when the irq itself is in cl_banned_irqs, or when its entry in
 * @proc_interrupts has a name matching a banned module; 0 otherwise.
 */
static int check_for_irq_ban(char *path __attribute__((unused)), int irq, GList *proc_interrupts)
{
	struct irq_info key;
	struct irq_info *parsed;
	GList *hit;

	key.irq = irq;

	/* Banned by number on the command line? */
	if (g_list_find_custom(cl_banned_irqs, &key, compare_ints) != NULL)
		return 1;

	/* Otherwise only a banned owning module can ban it */
	hit = g_list_find_custom(proc_interrupts, &key, compare_ints);
	if (hit == NULL)
		return 0;

	parsed = hit->data;
	return check_for_module_ban(parsed->name) ? 1 : 0;
}
/*
 * Figures out which interrupt(s) relate to the device we're looking at in
 * dirname and adds them to the database (or to banned_irqs).
 *
 * @dirname:  name of the device directory below SYSPCI_DIR
 * @tmp_irqs: irq list handed to check_for_irq_ban() for the module-ban
 *            lookup
 *
 * If the device has an msi_irqs directory, every non-zero numeric entry in
 * it is registered as IRQ_TYPE_MSIX. Otherwise the number in the device's
 * "irq" file is registered as IRQ_TYPE_LEGACY.
 */
static void build_one_dev_entry(const char *dirname, GList *tmp_irqs)
{
	struct dirent *entry;
	DIR *msidir;
	FILE *fd;
	int irqnum;
	struct irq_info *new;
	char path[PATH_MAX];
	char devpath[PATH_MAX];
	struct user_irq_policy pol;

	snprintf(path, sizeof(path), "%s/%s/msi_irqs", SYSPCI_DIR, dirname);
	snprintf(devpath, sizeof(devpath), "%s/%s", SYSPCI_DIR, dirname);

	msidir = opendir(path);
	if (msidir) {
		while ((entry = readdir(msidir)) != NULL) {
			/* Non-numeric names such as "." and ".." parse as 0 */
			irqnum = strtol(entry->d_name, NULL, 10);
			if (!irqnum)
				continue;

			/* Already known, either active or banned */
			if (get_irq_info(irqnum))
				continue;

			get_irq_user_policy(devpath, irqnum, &pol);
			if ((pol.ban == 1) || (check_for_irq_ban(devpath, irqnum, tmp_irqs))) {
				add_banned_irq(irqnum, &banned_irqs);
				continue;
			}

			new = add_one_irq_to_db(devpath, irqnum, &pol);
			if (new)
				new->type = IRQ_TYPE_MSIX;
		}
		closedir(msidir);
		return;
	}

	snprintf(path, sizeof(path), "%s/%s/irq", SYSPCI_DIR, dirname);
	fd = fopen(path, "r");
	if (!fd)
		return;

	/* Anything but exactly one converted number leaves irqnum unset */
	if (fscanf(fd, "%d", &irqnum) != 1)
		goto done;

	/*
	 * no pci device has irq 0
	 * irq 255 is invalid on x86/x64 architectures
	 */
	if (!irqnum)
		goto done;
#if defined(__i386__) || defined(__x86_64__)
	if (irqnum == 255)
		goto done;
#endif

	if (get_irq_info(irqnum))
		goto done;

	get_irq_user_policy(devpath, irqnum, &pol);
	if ((pol.ban == 1) || (check_for_irq_ban(devpath, irqnum, tmp_irqs))) {
		add_banned_irq(irqnum, &banned_irqs);
		goto done;
	}

	new = add_one_irq_to_db(devpath, irqnum, &pol);
	if (new)
		new->type = IRQ_TYPE_LEGACY;

done:
	fclose(fd);
}
/*
 * for_each_irq() callback that releases one irq_info entry with free().
 * The data argument is unused.
 */
static void free_irq(struct irq_info *info, void *data __attribute__((unused)))
{
free(info);
}
/*
 * Empty the irq database: free every entry of interrupts_db and
 * banned_irqs together with the lists, and drop rebalance_irq_list.
 * All three list heads are NULL afterwards.
 */
void free_irq_db(void)
{
/* A NULL list makes for_each_irq() walk interrupts_db */
for_each_irq(NULL, free_irq, NULL);
g_list_free(interrupts_db);
interrupts_db = NULL;
/*
 * Order matters: interrupts_db is already NULL here. With an empty
 * banned_irqs, for_each_irq() falls back to interrupts_db, which would
 * otherwise free its entries a second time.
 */
for_each_irq(banned_irqs, free_irq, NULL);
g_list_free(banned_irqs);
banned_irqs = NULL;
/*
 * Only the list nodes are released here, not the entries.
 * NOTE(review): presumably the entries are the ones freed above - confirm.
 */
g_list_free(rebalance_irq_list);
rebalance_irq_list = NULL;
}
/*
 * Release everything collected from command-line ban options, plus the
 * banned_irqs list built from them.
 *
 * Each list is freed together with its entries and its head is reset to
 * NULL, so a later lookup or a second call sees empty lists instead of
 * dangling pointers. banned_irqs entries are allocated by add_banned_irq()
 * and are released here in the same way free_irq_db() releases them.
 */
void free_cl_opts(void)
{
	g_list_free_full(cl_banned_modules, free);
	cl_banned_modules = NULL;

	g_list_free_full(cl_banned_irqs, free);
	cl_banned_irqs = NULL;

	g_list_free_full(banned_irqs, free);
	banned_irqs = NULL;
}
/*
 * Register an irq that has no sysfs device entry.
 *
 * @irq:             irq number
 * @hint:            optional entry whose type and class are copied onto
 *                   the new one
 * @proc_interrupts: list handed to check_for_irq_ban()
 *
 * Does nothing when the irq is already known. Otherwise the irq ends up
 * either in banned_irqs (user policy or command-line ban) or in the
 * database, and its level is set from map_class_to_level for its class.
 *
 * NOTE(review): the final level assignment also replaces a level taken
 * from the user policy by add_one_irq_to_db() - confirm this is intended.
 */
static void add_new_irq(int irq, struct irq_info *hint, GList *proc_interrupts)
{
	struct irq_info *info;
	struct user_irq_policy pol;

	if (get_irq_info(irq) != NULL)
		return;

	/* Set NULL devpath for the irq has no sysfs entries */
	get_irq_user_policy(NULL, irq, &pol);
	if ((pol.ban == 1) || check_for_irq_ban(NULL, irq, proc_interrupts)) { /*FIXME*/
		add_banned_irq(irq, &banned_irqs);
		/* Fetch the banned entry that was just created */
		info = get_irq_info(irq);
	} else {
		info = add_one_irq_to_db(NULL, irq, &pol);
	}

	if (info == NULL) {
		log(TO_CONSOLE, LOG_WARNING, "add_new_irq: Failed to add irq %d\n", irq);
		return;
	}

	/*
	 * Override some of the new irq defaults here
	 */
	if (hint != NULL) {
		info->type = hint->type;
		info->class = hint->class;
	}

	info->level = map_class_to_level[info->class];
}
static void add_missing_irq(struct irq_info *info, void *attr)
{
struct irq_info *lookup = get_irq_info(info->irq);
GList *proc_interrupts = (GList *) attr;
if (!lookup)
add_new_irq(info->irq, info, proc_interrupts);
}
/*
 * Rebuild the irq database from scratch.
 *
 * The old database is freed, the full irq list is collected, every device
 * directory under SYSPCI_DIR is scanned for its irqs, and finally every
 * collected irq that no PCI device claimed is added as well.
 */
void rebuild_irq_db(void)
{
	DIR *devdir;
	struct dirent *entry;
	GList *tmp_irqs = NULL;

	free_irq_db();

	tmp_irqs = collect_full_irq_list();

	devdir = opendir(SYSPCI_DIR);
	if (devdir) {
		while ((entry = readdir(devdir)) != NULL)
			build_one_dev_entry(entry->d_name, tmp_irqs);
		closedir(devdir);
	}

	/*
	 * add_missing_irq() hands its data argument to check_for_irq_ban() as
	 * the list in which to look up the irq's name for the module ban.
	 * That has to be tmp_irqs, as for PCI devices above: an irq reaching
	 * add_new_irq() is by definition absent from interrupts_db, so a
	 * lookup there could never match.
	 */
	for_each_irq(tmp_irqs, add_missing_irq, tmp_irqs);

	g_list_free_full(tmp_irqs, free);
}
/*
 * Invoke @cb on every entry of @list, or of interrupts_db when @list is
 * NULL, passing @data through unchanged.
 *
 * The successor is fetched before each callback runs, so the walk does not
 * touch the current node again after calling @cb.
 */
void for_each_irq(GList *list, void (*cb)(struct irq_info *info, void *data), void *data)
{
	GList *cur;
	GList *following;

	for (cur = g_list_first(list ? list : interrupts_db); cur != NULL; cur = following) {
		following = g_list_next(cur);
		cb(cur->data, data);
	}
}
struct irq_info *get_irq_info(int irq)
{
GList *entry;
struct irq_info find;
find.irq = irq;
entry = g_list_find_custom(interrupts_db, &find, compare_ints);
if (!entry)
entry = g_list_find_custom(banned_irqs, &find, compare_ints);
return entry ? entry->data : NULL;
}
/*
 * Move the entry with @info's irq number from list @from to the end of
 * list @to and set info->moved to 1.
 * Nothing happens when @from holds no entry with that irq number.
 */
void migrate_irq(GList **from, GList **to, struct irq_info *info)
{
	struct irq_info key;
	struct irq_info *moving;
	GList *node;

	key.irq = info->irq;
	node = g_list_find_custom(*from, &key, compare_ints);
	if (node != NULL) {
		/* Save the payload before the node is deleted */
		moving = node->data;
		*from = g_list_delete_link(*from, node);
		*to = g_list_append(*to, moving);

		info->moved = 1;
	}
}
/*
 * Ordering callback for sort_irq_list(): higher class first, then higher
 * load first, then higher address first.
 *
 * Never returns 0; entries that tie on class and load are ordered by
 * their addresses.
 */
static gint sort_irqs(gconstpointer A, gconstpointer B)
{
	struct irq_info *lhs = (struct irq_info *)A;
	struct irq_info *rhs = (struct irq_info *)B;

	/* Primary key: class, descending */
	if (lhs->class < rhs->class)
		return 1;
	if (lhs->class > rhs->class)
		return -1;

	/* Secondary key: load, descending */
	if (lhs->load < rhs->load)
		return 1;
	if (lhs->load > rhs->load)
		return -1;

	/* Tie-break on the entry addresses */
	return (lhs < rhs) ? 1 : -1;
}
/*
 * Sort *list in place with sort_irqs(), i.e. by class, then load, both
 * descending. *list is updated because the head may change.
 */
void sort_irq_list(GList **list)
{
*list = g_list_sort(*list, sort_irqs);
}