In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Path of the HDF5 cache that stores the synthesized dataset.  A falsy value
# (e.g., None) disables both reading from and writing to the cache.
CACHE_FILE = 'cache.hdf5'

if CACHE_FILE and os.path.isfile(CACHE_FILE):
    # Single-argument print(...) produces identical output on Python 2 and 3
    print("Loading from cache %s" % CACHE_FILE)
    df = pandas.read_hdf(CACHE_FILE, 'summary')
else:
    # No usable cache: synthesize each system's summary CSV and stack the
    # results row-wise into one global dataframe
    df = pandas.concat([abcutils.load_and_synthesize_csv('summaries/edison-summaries_2017-02-14-2018-02-28.csv', system='edison'),
                        abcutils.load_and_synthesize_csv('summaries/cori-summaries_2017-02-14-2018-02-28.csv', system='cori'),
                        abcutils.load_and_synthesize_csv('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', system='mira')],
                       axis='rows')
    if CACHE_FILE:
        df.to_hdf(CACHE_FILE, key='summary', mode='w', format='fixed', complevel=9, complib='zlib')
        print("Cached synthesized CSV to %s" % CACHE_FILE)

# Reset the index to ensure that there are no degenerate indices in the final
# dataframe (each per-system frame arrives with its own index, so labels can
# repeat after concatenation)
df.index = pandas.Index(data=numpy.arange(len(df)), dtype='int64')

In [None]:
def pd2epoch(x):
    """Convert a timestamp to seconds since the epoch.

    Args:
        x: object exposing ``to_pydatetime()`` (e.g., a pandas Timestamp).

    Returns:
        float: result of ``time.mktime`` applied to the timestamp's time
        tuple, so the value is interpreted in the local timezone and any
        sub-second component is dropped.
    """
    as_datetime = x.to_pydatetime()
    return time.mktime(as_datetime.timetuple())

In [None]:
# Columns used as the groupby key when slicing the global dataset
group_by = [
    '_test_platform',
    '_benchmark_id',
]

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform (compute system + file system combination) to examine.
# To switch, comment out the active assignment and uncomment an alternative.
TEST_PLATFORM = 'cscratch@cori-knl'
# TEST_PLATFORM = 'scratch2@edison'
# TEST_PLATFORM = 'cscratch@cori-haswell'
# TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark to examine on that test platform
BENCHMARK_ID = 'ior_fpp_write'
# BENCHMARK_ID = 'dbscan_read_shared_read'
# BENCHMARK_ID = 'vpicio_uni_shared_write'
# BENCHMARK_ID = 'ior_shared_write'
# BENCHMARK_ID = 'hacc_io_read_fpp_read'

# Column holding the performance measurement, and the half-open date range
# [date_start, date_end) of jobs to keep
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)

group_by = ['_test_platform', '_benchmark_id']
delta = datetime.timedelta(days=1).total_seconds()

# Keep only jobs that moved more than 1 GiB, started inside the date range,
# and are not the excluded hacc_io_write_shared_write benchmark
filtered_df = df[df['darshan_total_gibs_posix'] > 1.0]
filtered_df = filtered_df[filtered_df['_datetime_start'] < date_end]
filtered_df = filtered_df[filtered_df['_datetime_start'] >= date_start]
filtered_df = filtered_df[filtered_df['_benchmark_id'] != 'hacc_io_write_shared_write']

# Select the example from the *filtered* data so that the filters above
# (including the date range printed below) actually apply to it
example_df = filtered_df.groupby(by=group_by).get_group((TEST_PLATFORM, BENCHMARK_ID))

# Single-argument print(...) produces identical output on Python 2 and 3
print("test_platform = %s" % TEST_PLATFORM)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % date_start.isoformat())
print("date_end = %s" % date_end.isoformat())

In [None]:
# Width of simple moving average (SMA) short/long windows, expressed as a
# number of benchmark measurements
short_window = abcutils.features.SHORT_WINDOW
long_window = abcutils.features.LONG_WINDOW

# Single-argument print(...) produces identical output on Python 2 and 3
print("Short window will average over %d measurements at a time" % short_window)
print("Long window will average over %d measurements at a time" % long_window)

In [None]:
# Metrics to include in UMAMI renderings and analysis.  Anything that
# _might_ affect performance should be included here.  The list is built up
# in groups that share a column-name prefix; the resulting order is the order
# in which the names appear below.
umami_rows = ['darshan_agg_perf_by_slowest_posix_gibs']
umami_rows += ['coverage_factor_bw', 'coverage_factor_nodehrs']
umami_rows += ['fs_ave_mds_cpu', 'fs_tot_metadata_ops', 'fs_ave_oss_cpu', 'fs_tot_open_ops']
umami_rows += ['fshealth_ost_most_full_pct', 'fshealth_ost_overloaded_oss_count']
umami_rows += ['jobsdb_concurrent_nodes']
umami_rows += ['topology_job_max_radius']

## Calculate Simple Moving Averages (SMAs)

Compare a short-window SMA and a long-window SMA and use the places where they cross over to divide the entire year into _regions_ of interesting benchmark behavior.

For each region defined above, find the _minimum_ performance observed and denote that measurement (and its associated job) as a _locus_.  We then collate all _loci_ into a set of poorly performing benchmarks that are worth contextualizing with UMAMI.

We also plot the raw performance data (light blue bars), the short SMA (orange line), the long SMA (green line), and all loci (red bars) to visually verify that the loci we've identified are indeed poorly performing jobs.

In [None]:
# Identify loci from SMA crossovers, requesting minima only
loci = abcutils.features.generate_loci_sma(
    example_df,
    plot_metric,
    mins=True,
    maxes=False)

# Plot the measurements together with the loci, then title the figure with
# the benchmark's display label (falling back to its raw ID) and the platform
ax = abcutils.plot.locus_summary(example_df, plot_metric, loci)
ax.get_figure().suptitle(
    "%s on %s" % (
        abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID),
        TEST_PLATFORM))

## Generate UMAMI Diagrams Around Loci

Generate UMAMI diagrams that _end_ at each locus and have `long_window` days of benchmark data preceding them.  Don't bother creating UMAMI diagrams for loci with fewer than `short_window` benchmark measurements in the preceding `long_window` days.

Note that this process mixes up the semantic meaning of `long_window`.  When defining loci, `long_window` refers to a number of benchmark measurements, not days.  Ideally, one benchmark runs each day, so this semantic difference is trivial.  In reality, however, there are days when no benchmarks are run, meaning loci are defined using a series of `long_window` benchmark measurements that often span _more than_ `long_window` days.

Practically speaking, this does not change very much as long as the ratio of `long_window` in days to `long_window` in benchmark measurements is close to 1.0.

In [None]:
# Upper bound on the number of UMAMI diagrams drawn by this cell
max_renders = 1

# Single-argument print(...) produces identical output on Python 2 and 3
print("Rendering a maximum of %d UMAMI diagrams" % max_renders)

rendered = 0
for locus in loci.itertuples():
    # Translate the region's boundary labels into positions; the positional
    # slice below excludes the row at region_end itself
    region_idx0 = example_df.index.get_loc(locus.region_start)
    region_idxf = example_df.index.get_loc(locus.region_end)
    umami_region = example_df.iloc[region_idx0:region_idxf]

    if len(umami_region) < short_window:
        # Report the start time of the locus itself rather than dumping the
        # whole _datetime_start column of the region
        print("Skipping locus at %s because it has only %d data points (%d required)" % (
            example_df.loc[locus.Index, '_datetime_start'],
            len(umami_region),
            short_window))
        continue

    abcutils.plot.generate_umami(umami_region, umami_rows, highlight_index=umami_region.index.get_loc(locus.Index))
    rendered += 1
    # >= rather than == so that the loop still stops if max_renders is set
    # below the number already rendered (e.g., 0)
    if rendered >= max_renders:
        break

## Tabulate frequency of different problems

Because we define loci to be local minima (i.e., the worst benchmark measured in a temporally local region), all of the UMAMIs we generated above _should_ end on an extremely bad day.  To automatically identify the possible causes for bad performance at each locus, we look at all of the UMAMI metrics and flag those that also ended on extremely poor (e.g., worst quartile) values.  This is exactly the same process we used in the PDSW'17 paper's case studies, but now we have automated the process.

With this method of flagging, we keep a running total of metrics that were flagged as possible culprits as we examine each locus.  Note that multiple metrics can be flagged for a single locus (e.g., low coverage factor _and_ high MDS load can both be flagged for a single benchmark run), so the sum of flags over all metrics will usually add up to more than the total number of loci.

In [None]:
def print_extreme_measurements(results):
    """Print a text summary of classified extreme measurements.

    Prints the four totals, a blank line, and then one line per metric in
    descending order of count.  Metric names are shown using their display
    label from ``abcutils.CONFIG['metric_labels']`` when one exists and the
    raw key otherwise.

    Args:
        results (dict): must contain a ``totals`` mapping (the keys
            ``classified``, ``unclassified``, ``ignored`` and ``errors`` are
            read and default to 0 when absent) and a ``per_metric`` mapping
            of metric name to count.
    """
    totals = results['totals']
    # Single-argument print(...) produces identical output on Python 2 and 3
    print("Classified: %d" % totals.get('classified', 0))
    print("Unclassified: %d" % totals.get('unclassified', 0))
    print("Ignored: %d" % totals.get('ignored', 0))
    print("Errors: %d" % totals.get('errors', 0))
    print("")

    per_metric = results['per_metric']
    # reversed(sorted(...)) is kept (instead of sorted(reverse=True)) so that
    # metrics with equal counts appear in the same relative order as before
    for key in reversed(sorted(per_metric.keys(), key=lambda x: per_metric[x])):
        print("%3d %s" % (per_metric[key], abcutils.CONFIG['metric_labels'].get(key, key)))

In [None]:
# Classify the poorly performing loci of the single example platform/benchmark
classified_extremes = abcutils.classify.classify_extreme_measurements(
    example_df,
    plot_metric,
    umami_rows,
    want_good=False)

print_extreme_measurements(classified_extremes)

## Tabulate contributors to bad performance over all tests

We now apply the above analysis to the entirety of the benchmark data across all systems.

Note that warnings about certain loci not being in the worst quartile indicate that the SMA-based method we use to identify local minima is not perfect.  There are a variety of other methods (including some canned algorithms) that we can swap in to improve our classification of loci.

In [None]:
# Repeat the classification of poorly performing loci over the full
# filtered dataset (every test platform and benchmark it contains)
classified_extremes = abcutils.classify.classify_extreme_measurements(
    filtered_df,
    plot_metric,
    umami_rows,
    want_good=False)

print_extreme_measurements(classified_extremes)

The following bar graph shows the total number of times each metric has been flagged as a possible source of performance loss as defined above: its value was "bad" coincident with a locus, where a locus is a job whose performance was abnormally poor and "bad" is defined as falling within the worst quartile (i.e., the worst 25% of observed values).

In [None]:
# Bar chart of how often each metric was flagged
ax = abcutils.plot.classified_extremes_summary(classified_extremes)

# Fix the y-range and figure size, then title the plot with the job count
ax.set_ylim(0, 50)
ax.get_figure().set_size_inches(12, 4)
ax.set_title(
    "Candidate Contributors to Bad Performance (%d Jobs Total)"
    % classified_extremes['totals']['total'])

In [None]:
# Same summary as a grouped bar chart, broken down by benchmark
ax = abcutils.plot.classified_extremes_summary_grouped(
    classified_extremes,
    group_metric='_benchmark_id')

# Fix the y-range and figure size, then title the plot with the job count
ax.set_ylim(0, 50)
ax.get_figure().set_size_inches(12, 4)
ax.set_title(
    "Candidate Contributors to Bad Performance (%d Jobs Total)"
    % classified_extremes['totals']['total'])

In [None]:
# Same summary as a grouped bar chart, broken down by test platform
ax = abcutils.plot.classified_extremes_summary_grouped(
    classified_extremes,
    group_metric='_test_platform')

# Size the figure, anchor the legend at (1.38, 1.0) in axes coordinates,
# and fix the y-range before titling the plot with the job count
ax.get_figure().set_size_inches(12, 4)
ax.legend(bbox_to_anchor=(1.38, 1.0))
ax.set_ylim(0, 50)
ax.set_title(
    "Candidate Contributors to Bad Performance (%d Jobs Total)"
    % classified_extremes['totals']['total'])

There are many caveats with the above plot; notably, the majority of jobs were run on Lustre since this data includes all Edison, Cori+KNL, and Cori+Haswell jobs.  In addition, the Mira data does not currently contain file system health data (although it is available), so the "Number of Overloaded OSSes" may be underreported.

The most appropriate way to present this data is to produce one bar graph per test platform (compute system + file system combination) so that metrics that are only available on one test platform aren't being directly compared with others that are.

## Tabulate contributors to good performance over all tests

We can also perform the above analysis and look for the metrics that coincided with good performance.

In [None]:
# Identify loci from SMA crossovers again, this time requesting maxima only
loci = abcutils.features.generate_loci_sma(
    example_df,
    plot_metric,
    mins=False,
    maxes=True)

# Plot the measurements together with the loci, then title the figure with
# the benchmark's display label (falling back to its raw ID) and the platform
ax = abcutils.plot.locus_summary(example_df, plot_metric, loci)
ax.get_figure().suptitle(
    "%s on %s" % (
        abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID),
        TEST_PLATFORM))

In [None]:
# Classify loci over the full filtered dataset, this time looking for good
# rather than bad performance
classified_extremes = abcutils.classify.classify_extreme_measurements(
    filtered_df,
    plot_metric,
    umami_rows,
    want_good=True)

print_extreme_measurements(classified_extremes)

In [None]:
# Bar chart of how often each metric was flagged, titled with the job count
ax = abcutils.plot.classified_extremes_summary(classified_extremes)
ax.set_title(
    "Candidate Contributors to Good Performance (%d Jobs Total)"
    % classified_extremes['totals']['total'])

In [None]:
# Same summary as a grouped bar chart, broken down by benchmark
ax = abcutils.plot.classified_extremes_summary_grouped(
    classified_extremes,
    group_metric='_benchmark_id')

# Enlarge the figure, then title the plot with the job count
ax.get_figure().set_size_inches(16, 6)
ax.set_title(
    "Candidate Contributors to Good Performance (%d Jobs Total)"
    % classified_extremes['totals']['total'])

## Normalize contributors

Not every test had every metric populated (e.g., Mira lacks some of the telemetry of Edison and vice versa).  As such, we should normalize the frequency of different contributors to the total number of times that contributor _could have_ contributed but did not.

In [None]:
loci = abcutils.features.generate_loci_sma(example_df, plot_metric, mins=True, maxes=False)

# Fail with a clear message instead of a NameError on an undefined
# region_df when no loci were found
if len(loci) == 0:
    raise ValueError("generate_loci_sma() returned no loci for example_df; nothing to analyze")

# Only the final locus is analyzed, so select it directly rather than
# rebuilding region_df for every locus and discarding all but the last.
# NOTE(review): if every locus is meant to be analyzed, the
# identify_contributors() call belongs inside a loop over loci -- confirm.
locus = list(loci.itertuples())[-1]

# Translate the region's boundary labels into positions; the positional slice
# excludes the row at region_end itself
region_idx0 = example_df.index.get_loc(locus.region_start)
region_idxf = example_df.index.get_loc(locus.region_end)
region_df = example_df.iloc[region_idx0:region_idxf][['_datetime_start'] + umami_rows]

# Kept as the cell's last top-level expression so the notebook displays the
# returned value
abcutils.classify.identify_contributors(region_df,
                                        plot_metric,
                                        want_good=False,
                                        minima_iloc=region_df.index.get_loc(locus.Index))

In [None]:
# Classify well-performing loci over the full filtered dataset and print
# the resulting tallies
classified_extremes = abcutils.classify.classify_extreme_measurements(
    filtered_df,
    plot_metric,
    umami_rows,
    want_good=True)

print_extreme_measurements(classified_extremes)