In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings
import collections
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

# Seed numpy's global RNG from the wall clock (as before), but keep the seed in
# a variable and report it so that a particular run can be reproduced later by
# substituting the printed value.
RANDOM_SEED = int(time.mktime(datetime.datetime.now().timetuple()))
numpy.random.seed(RANDOM_SEED)
print("numpy random seed = %d" % RANDOM_SEED)

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset that the rest of this notebook analyzes via the abcutils
# helper.  NOTE(review): presumably this returns a pandas DataFrame (it is
# later grouped with .groupby(by=...)) -- confirm against abcutils.sc18paper.
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Metric whose behavior is analyzed, and the columns that define one
# independent series of measurements
plot_metric = 'darshan_normalized_perf_by_max'
group_by = ['_test_platform', '_benchmark_id']

# Echo the analysis parameters; the metric falls back to its raw column name
# when no human-readable label is configured
print("plot_metric = %s" % (abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric),))
print("date_start = %s" % (abcutils.sc18paper.DATE_START.isoformat(),))
print("date_end = %s" % (abcutils.sc18paper.DATE_END.isoformat(),))

In [None]:
# Width of simple moving average (SMA) short/long windows.  These are spans of
# time, not counts of measurements.
short_window = pandas.Timedelta(days=14)
long_window = pandas.Timedelta(days=49)

print("Short window will average measurements over a span of %s" % short_window)
print("Long window will average measurements over a span of %s" % long_window)

## Classification

### Generate or load calculated contributors

This can take an inconvenient amount of time, so we cache the results to `contributors.hdf5`.  Delete that file to force the contributors to be recalculated.

In [None]:
# On-disk cache of the classification results computed below
contributors_cache = 'contributors.hdf5'

all_contributors = None

try:
    all_contributors = pandas.read_hdf(contributors_cache, 'contributors')
except IOError:
    # No readable cache; fall through and compute the contributors from scratch
    pass

if all_contributors is None:
    # Collect one result per region and concatenate once at the end rather
    # than re-copying an ever-growing DataFrame on every iteration
    contributor_frames = []

    grouped_df = filtered_df.groupby(by=group_by)
    for group in grouped_df.groups:
        example_df = grouped_df.get_group(group)

        intercepts = abcutils.features.sma_intercepts(example_df,
                                                      plot_metric,
                                                      short_window=short_window,
                                                      long_window=long_window)

        # NOTE(review): `loci` is not used anywhere in this cell.  The call is
        # retained in case it has side effects or later cells rely on the
        # name; confirm and remove if neither applies.
        loci = abcutils.features.generate_loci_sma(example_df,
                                                   plot_metric,
                                                   mins=True,
                                                   maxes=False,
                                                   short_window=short_window,
                                                   long_window=long_window)
        regions = list(abcutils.features.intercepts_to_region(example_df, intercepts))

        for region in regions:
            # Classify the worst (minimum plot_metric) measurement of each region
            contributors = abcutils.classify.identify_contributors(region=region,
                                                    target_column=plot_metric,
                                                    target_index=region[plot_metric].idxmin(),
                                                    correlate_columns=abcutils.CONFIG['umami_row_order'],
                                                    want_good=False,
                                                    classifier='minmax')
            contributor_frames.append(contributors)

    if not contributor_frames:
        raise ValueError("no regions were found in the dataset, so no contributors could be calculated")

    all_contributors = pandas.concat(contributor_frames)
    # Replace the per-region indices with a single unique 0..N-1 index
    all_contributors.index = numpy.arange(len(all_contributors))
    all_contributors.to_hdf(contributors_cache, key='contributors', mode='w', format='fixed', complevel=9, complib='zlib')

In [None]:
# Count the distinct measurements that were classified
print("Number of target indices: %d" % (len(all_contributors['target_index'].unique()),))

Apply filters to remove very high p-value measurements from the dataset.  These cause problems when performing significance testing later on, since they dilute the significance of the results.

In [None]:
# Contributors whose p-value is at or above this cutoff are discarded.  Rows
# with a missing (NaN) p-value are discarded as well, since NaN < cutoff is False.
pvalue_cutoff = 0.10

# Alternative: drop any contributors who lack statistical confidence by virtue of their domain
#significant_contributors = all_contributors[all_contributors['random_pvalue'] < pvalue_cutoff]

# Also drop any contributors who lack statistical confidence because of duplicate values
significant_contributors = all_contributors[all_contributors['pvalue'] < pvalue_cutoff]

# Alternative: keep all data and let p-values speak for themselves
#significant_contributors = all_contributors

print("Discarding %d contributors with p-values >= %.2f" % (len(all_contributors) - len(significant_contributors), pvalue_cutoff))

print("Number of contributors remaining: %d" % len(significant_contributors))
print("Number of target indices ('bad' jobs): %d" % len(significant_contributors['target_index'].unique()))

# A job is unclassified when none of its contributors matched.  This is counted
# over the unfiltered contributors, and only the one needed column is summed.
matches_per_target = all_contributors.groupby('target_index')['target_metric_matches'].sum()
print("Number of unclassified jobs: %d" % (matches_per_target < 1.0).sum())

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8,4))

print("Average fraction of matches per metric")
# Average only the match flag for each metric.  Passing the non-numeric
# _test_platform column through mean() relies on pandas silently dropping it,
# which newer pandas versions refuse to do.
significant_contributors\
    .groupby('metric_name')[['target_metric_matches']]\
    .mean()\
    .plot(kind='bar', ax=ax)
ax.get_legend().set_visible(False)
ax.set_ylabel("Fraction of bad jobs")
ax.set_xlabel("")
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.yaxis.grid(True)
# set_axisbelow() returns None, so the cell produces no stray output
ax.set_axisbelow(True)

In [None]:
# Sum of target_metric_matches for every (metric, test platform) pair;
# combinations that never occur are reported as zero
tagged_metric_counts = significant_contributors.pivot_table(
    values='target_metric_matches',
    index=['metric_name'],
    columns=['_test_platform'],
    aggfunc=numpy.sum
).fillna(0.0)
tagged_metric_counts

In [None]:
def _count_observed(values):
    """Return how many entries of values are not NaN."""
    return (~numpy.isnan(values)).sum()

# Number of non-NaN target_metric_matches entries for every (metric, test
# platform) pair; combinations that never occur are reported as zero
metric_observation_counts = significant_contributors.pivot_table(
    values='target_metric_matches',
    index=['metric_name'],
    columns=['_test_platform'],
    aggfunc=_count_observed
).fillna(0.0)
metric_observation_counts

In [None]:
# divide the number of contributors by the number of times that contributor was
# ever observed (across all platforms) to get its contribution to the overall
# fraction of regions where that metric was implicated
contributor_distribution = tagged_metric_counts.div(metric_observation_counts.sum(axis=1), axis=0)

# order the metrics from most to least frequently implicated
row_order = contributor_distribution.sum(axis=1).sort_values(ascending=False).index
contributor_distribution = contributor_distribution.reindex(index=row_order)

# clear the name of the columns axis ('_test_platform').  Assigning None works
# on every pandas version, whereas `del columns.name` raises AttributeError
# when Index.name is a property and the old try/except then hid that the name
# had not been removed.
contributor_distribution.columns.name = None
contributor_distribution

In [None]:
def draw_stacked_bars(contributor_distribution, legendprops=None):
    """Draw a stacked bar chart with one bar per row of a DataFrame.

    Each column of the input becomes one segment of every bar, and the total
    height of each bar (the row sum) is annotated above it as a percentage.

    Args:
        contributor_distribution (pandas.DataFrame): fractions to plot.  As
            built in this notebook, rows are metric names and columns are test
            platforms.
        legendprops (dict or None): extra keyword arguments passed to
            ``ax.legend()``.

    Returns:
        matplotlib.axes.Axes: the axes containing the chart.
    """
    _legendprops = {}
    if legendprops:
        _legendprops.update(legendprops)

    row_sums = contributor_distribution.sum(axis=1)

    fig, ax = matplotlib.pyplot.subplots(figsize=(8,4))

    contributor_distribution.plot.bar(stacked=True, ax=ax, width=0.90)
    ax.grid()
    ax.set_ylim(0, 0.5)
    ax.set_axisbelow(True)

    # Translate metric names into display labels, keeping the raw name when no
    # label is configured (the same fallback used for the legend below)
    tick_texts = [x.get_text() for x in ax.get_xticklabels()]
    xticklabels = [abcutils.CONFIG['umami_rows'].get(text, text) for text in tick_texts]
    ax.set_xticklabels(xticklabels, rotation=30, ha='right')
    ax.set_xlabel("")
    ax.set_ylabel("Fraction of tests")
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, [abcutils.CONFIG['platform_labels'].get(x, x) for x in labels], **_legendprops)

    # Bars are drawn in row order, so the i-th tick belongs to the i-th row.
    # row_sums is indexed by label, so it must be addressed by position here.
    for index, x_value in enumerate(ax.get_xticks()):
        bar_total = row_sums.iloc[index]
        # "%d" truncates the percentage toward zero (it does not round)
        ax.annotate("%d%%" % (100.0 * bar_total), xy=(x_value, bar_total + 0.02),
                    ha='center',
                    backgroundcolor='#FFFFFFAA')

    return ax

# Chart every metric; the Axes returned by draw_stacked_bars is the cell output
draw_stacked_bars(contributor_distribution)

## Assert confidence

We use the binomial test to calculate the p-values of each fraction of tests asserted above.

In [None]:
# Columns of the result table, in display order
binomial_columns = ['metric', 'pick_n', 'out_of', 'probability_used', 'calculated_pvalue']

results = []
for metric in contributor_distribution.index.values:
    metric_measures = significant_contributors[significant_contributors['metric_name'] == metric]

    # p-values of every measurement in which this metric was tagged
    successes = [row.pvalue for row in metric_measures.itertuples() if row.target_metric_matches]

    # metrics that were never tagged get no entry in the results
    if not successes:
        continue

    pick_n = len(successes)
    out_of = len(metric_measures)

    # what is the probability that we observe pick_n / out_of jobs with this
    # tagged metric given the probability of encountering a tagged metric
    # if there's no relationship between this metric being tagged and each
    # job's performance?
    #
    # binomial test: assume the null hypothesis is TRUE
    #   1. pick the highest p-value observed for this metric - that is the
    #      case where the null hypothesis is most likely to be true
    #   2. perform the binomial test to see what the odds are of observing
    #      pick_n **or more** tagged metrics if the null hypothesis is true?
    probability = numpy.max(successes)

    # scipy.stats.binom_test is absent from recent SciPy releases and
    # scipy.stats.binomtest is absent from old ones, so use whichever exists
    if hasattr(scipy.stats, 'binomtest'):
        pvalue = scipy.stats.binomtest(pick_n,
                                       out_of,
                                       probability,
                                       alternative='greater').pvalue
    else:
        pvalue = scipy.stats.binom_test(pick_n,
                                        out_of,
                                        probability,
                                        alternative='greater')

    results.append(collections.OrderedDict([
        ('metric', metric),
        ('pick_n', pick_n),
        ('out_of', out_of),
        ('probability_used', probability),
        ('calculated_pvalue', pvalue),
    ]))

# Naming the columns explicitly keeps set_index() working when results is empty
binomial_results = pandas.DataFrame(results, columns=binomial_columns).set_index('metric')
binomial_results

In [None]:
# Redraw the chart using only the metrics that have a binomial test result, and
# keep the Axes in `ax` so the following cells can recolor and save it
ax = draw_stacked_bars(contributor_distribution.loc[binomial_results.index])

Grey out the metrics that are not statistically significant.

In [None]:
# build a mapping from metrics to rectangles

# display label -> metric name; .items() works on both Python 2 and 3
reverse_metric_map = {val: key for key, val in abcutils.CONFIG['umami_rows'].items()}

rectangle_map = {}
xticklabels = [x.get_text() for x in ax.get_xticklabels()]
for child in ax.get_children():
    # Bars are the Rectangles drawn with width 0.9; compare with a tolerance
    # rather than exact floating point equality
    if isinstance(child, matplotlib.patches.Rectangle) and numpy.isclose(child.get_width(), 0.9):
        # the center of a bar, rounded, is the position of its tick
        child_x = int(round(child.xy[0] + child.get_width() / 2))
        label = xticklabels[child_x]
        # labels without a configured translation are assumed to already be
        # the raw metric name
        key = reverse_metric_map.get(label, label)
        rectangle_map.setdefault(key, []).append(child)

In [None]:
# Recolor every bar segment of each metric whose binomial p-value exceeds 0.10
insignificant = binomial_results[binomial_results['calculated_pvalue'] > 0.10]
for metric_name in insignificant.index:
    for bar_segment in rectangle_map[metric_name]:
        bar_segment.set_color("#DDDDDD")
        bar_segment.set_edgecolor('#DDDDDD')

In [None]:
ax.xaxis.grid(False)

# savefig() does not create missing directories
if not os.path.isdir('figs'):
    os.makedirs('figs')
ax.get_figure().savefig('figs/contributors-bad-by-system-grey.pdf', bbox_inches='tight')

# display the updated figure as the cell output
ax.get_figure()

In [None]:
# Collapse the per-platform colors: grey for metrics whose binomial p-value
# exceeds 0.10, a single color ("C0") for all others
for row in binomial_results.itertuples():
    bar_color = "#DDDDDD" if row.calculated_pvalue > 0.10 else "C0"
    for rectangle in rectangle_map[row.Index]:
        rectangle.set_color(bar_color)
        rectangle.set_edgecolor(bar_color)

# with a single color the per-platform legend no longer applies
ax.get_legend().set_visible(False)

# savefig() does not create missing directories
if not os.path.isdir('figs'):
    os.makedirs('figs')
ax.get_figure().savefig('figs/contributors-bad-grey.pdf', bbox_inches='tight')

# display the updated figure as the cell output
ax.get_figure()

Trim off all metrics which are not statistically significant

In [None]:
# Select the significant metrics by label.  binomial_results omits metrics that
# were never tagged, so a positional boolean mask built from it does not line
# up with the rows of contributor_distribution.
significant_metrics = binomial_results.index[(binomial_results['calculated_pvalue'] < 0.10).values]

ax = draw_stacked_bars(contributor_distribution.loc[significant_metrics],
                      legendprops={
                          'loc': 'upper right',
                          'bbox_to_anchor': (1.01, 1.03),
                          'labelspacing': 0.4
                      })
# spell out the "CF" abbreviation on its own line of each tick label
ax.set_xticklabels([x.get_text().replace(' CF', '\nCoverage Factor') for x in ax.get_xticklabels()], rotation=30)
ax.xaxis.grid(False)

# savefig() does not create missing directories
if not os.path.isdir('figs'):
    os.makedirs('figs')
ax.get_figure().savefig('figs/contributors-bad-by-system.pdf', bbox_inches='tight')

In [None]:
# Show the binomial test results in ascending order of p-value
binomial_results.sort_values('calculated_pvalue')