In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset through the project's sc18paper helper.  Every analysis
# cell below works on slices of `filtered_df`.
# NOTE(review): the CSV merge and HDF5 caching described in the text above
# presumably happen inside load_dataset() -- confirm against abcutils.
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one test platform (a compute system and file system pair), across all of the benchmarks run on it, to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform to analyze.  The commented-out values are alternative
# platforms; enable exactly one.
# TEST_PLATFORM = 'scratch2@edison'
# TEST_PLATFORM = 'cscratch@cori-knl'
# TEST_PLATFORM = 'cscratch@cori-haswell'
TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark identifier.  NOTE: example_df below is selected by platform only,
# so within this cell BENCHMARK_ID only affects the label that is printed.
# BENCHMARK_ID = 'ior_fpp_write'
# BENCHMARK_ID = 'ior_fpp_read'
# BENCHMARK_ID = 'dbscan_read_shared_read'
# BENCHMARK_ID = 'vpicio_uni_shared_write'
# BENCHMARK_ID = 'ior_shared_write'
# BENCHMARK_ID = 'ior_shared_read'
# BENCHMARK_ID = 'hacc_io_read_fpp_read'
BENCHMARK_ID = 'hacc_io_write_fpp_write'

# Column whose time series is analyzed and plotted by the cells that follow.
plot_metric = 'darshan_normalized_perf_by_max'
# One day, expressed in seconds.
delta = datetime.timedelta(days=1).total_seconds()

# Select every row measured on TEST_PLATFORM, across all benchmarks.
#
# This cell used to first build a (platform, benchmark) selection and then
# immediately overwrite it with the platform-only selection below; the
# discarded groupby/copy was wasted work and has been removed.  To analyze a
# single benchmark instead, replace the assignment below with:
#
#   example_df = filtered_df.groupby(by=['_test_platform', '_benchmark_id']).get_group((TEST_PLATFORM, BENCHMARK_ID)).copy()
example_df = filtered_df.groupby(by='_test_platform').get_group(TEST_PLATFORM).copy()

# Single-argument print(...) produces identical output under Python 2 and
# also parses under Python 3.
print("test_platform = %s" % TEST_PLATFORM)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % abcutils.sc18paper.DATE_START.isoformat())
print("date_end = %s" % abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Columns used as UMAMI rows and tested for correlation against the plotted
# metric.  Err on the side of inclusion: anything that could plausibly
# influence performance belongs in this list.
#
# Currently disabled (add back in the matching group to re-enable):
#   coverage_factor_nodehrs, fs_tot_metadata_ops, fs_tot_open_ops,
#   jobsdb_concurrent_nodes
umami_rows = [
    # darshan_* columns
    "darshan_normalized_perf_by_max",
    # coverage_factor_* columns
    "coverage_factor_bw",
    "coverage_factor_opens",
    "coverage_factor_stats",
    "coverage_factor_ops",
    # fs_* columns
    "fs_ave_mds_cpu",
    "fs_ave_oss_cpu",
    # fshealth_* columns
    "fshealth_ost_most_full_pct",
    "fshealth_ost_overloaded_oss_count",
    # topology_* columns
    "topology_job_max_radius",
]

## Region-defined Correlation

In [None]:
# Width of simple moving average (SMA) short/long windows
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=48)

print "Short window will average over %s at a time" % SHORT_WINDOW
print "Long window will average over %s at a time" % LONG_WINDOW

## Calculate intercepts and centroids from SMAs

* **Intercepts** are the points where the two SMAs cross each other
* **Performance regions** are the spans of data bounded by two intercepts
* **Centroids** are the centermost data points of the performance regions (one centroid per region)

With this nomenclature, it is possible to also define **centroid regions** which are bounded by two centroids.  These regions capture the transition between two performance regions.

In [None]:
# SMA of the plotted metric against job start time, using the short window.
sma_short = abcutils.features.calculate_sma(
    example_df,
    '_datetime_start',
    plot_metric,
    window=SHORT_WINDOW)

# Despite its name, this is not a moving average: it is the overall mean of
# the plotted metric repeated once per sma_short entry, sharing sma_short's
# index and name.
sma_long = pandas.Series(
    data=[example_df[plot_metric].mean()] * len(sma_short),
    index=sma_short.index,
    name=sma_short.name)

In [None]:
# Locate the intercepts of the short and long SMAs of the plotted metric.
# NOTE(review): min_width=SHORT_WINDOW presumably discards regions narrower
# than the short window -- confirm against abcutils.features.sma_intercepts.
sma_intercepts = abcutils.features.sma_intercepts(
    example_df,
    plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    min_width=SHORT_WINDOW)

# Bare expression so the notebook renders the result.
sma_intercepts

In [None]:
# Locate the centroids of the SMA-defined performance regions.  Unlike the
# intercept calculation above, no minimum width is imposed (min_width=None).
sma_centroids = abcutils.features.sma_centroids(
    example_df,
    plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    min_width=None)

# Bare expression so the notebook renders the result.
sma_centroids

### Visualize intercepts and centroids
Plot the SMAs, performance regions, and centroids as a visual sanity check.

In [None]:
# Draw the SMA overlap plot for the plotted metric, with regions delimited by
# the SMA intercepts.
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_intercepts,
                                method='value')

# Re-apply the current x-limits explicitly so the lines added below cannot
# make matplotlib rescale the x-axis.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)

# One vertical black line per centroid, spanning the y-range the axes had
# before the lines were added.
ymin, ymax = ax.get_ylim()
for x in sma_centroids['_datetime_start']:
    x_epoch = abcutils.core.pd2epoch(x)
    ax.plot([x_epoch, x_epoch], [ymin, ymax], color='black')

abcutils.plot.fix_xticks_timeseries(ax)

# The caption previously described the centroids as red, but this cell draws
# them in black; the text now matches the figure.  Single-argument print(...)
# behaves identically under Python 2 and also parses under Python 3.
print("""
This plot shows the SMA performance regions (grey) and centroids (black
vertical lines).  Transition regions fall between every consecutive pair of
centroids but are not shaded.
""")

### Filter centroids

We only care about centroids that show a significant swing in performance.

In [None]:
# Same overlap plot as before, but the centroids are passed in place of the
# intercepts so that the delimited regions are centroid-to-centroid.
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_centroids,
                                method='value')

# Re-apply the current x-limits explicitly so the lines added below cannot
# make matplotlib rescale the x-axis; the y-axis is fixed to [0, 1].
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ymin, ymax = ax.set_ylim(0, 1)

# One vertical black line per centroid.
for x in sma_centroids['_datetime_start']:
    x_epoch = abcutils.core.pd2epoch(x)
    ax.plot([x_epoch, x_epoch], [ymin, ymax], color='black')

abcutils.plot.fix_xticks_timeseries(ax)

# Single-argument print(...) behaves identically under Python 2 and also
# parses under Python 3.
print("Now we plot the centroid-defined regions")

In [None]:
# Minimum relative change in the short SMA between two consecutive centroids
# for the region between them to be flagged.
cutoff = 0.10

regions = list(abcutils.features.intercepts_to_region(example_df, sma_centroids))

# Regions whose bounding centroids differ by at least `cutoff`.
centroid_regions = []
for index, region in enumerate(regions):
    # index + 1 is expected to always be valid for sma_centroids because there
    # should be one fewer region than there are centroid boundaries
    this_value = sma_centroids.iloc[index]['sma_short']
    next_value = sma_centroids.iloc[index + 1]['sma_short']

    # Change between the two centroids, expressed relative to the *later*
    # centroid's value.
    # NOTE(review): normalizing by this_value is the more common definition
    # of relative change -- confirm that next_value is intended here.
    centroid_delta = (next_value - this_value) / next_value

    # Evaluate the threshold once and reuse it for both the flagging decision
    # and the report line below.
    is_significant = abs(centroid_delta) >= cutoff
    if is_significant:
        centroid_regions.append(region)
        # `ax` is the axes object left over from the preceding plotting cell;
        # mark both bounding centroids on it in red.
        ax.plot(abcutils.core.pd2epoch(sma_centroids.iloc[index]['_datetime_start']), this_value, marker='o', color='red')
        ax.plot(abcutils.core.pd2epoch(sma_centroids.iloc[index + 1]['_datetime_start']), next_value, marker='8', color='red')

    # Columns: region start, flagged?, relative delta, this SMA, next SMA.
    # Single-argument print(...) behaves identically under Python 2 and also
    # parses under Python 3.
    print("%s: %5s %7.4f %8.3f %5.3f" % (
        region.iloc[0]['_datetime_start'],
        is_significant,
        centroid_delta,
        this_value,
        next_value))

print("\nFlagged regions")
if not centroid_regions:
    print("  (none)")
else:
    for region in centroid_regions:
        print("%s to %s" % (region.iloc[0]['_datetime_start'],
                            region.iloc[-1]['_datetime_start']))

In [None]:
# Return the figure that owns `ax` as the cell's value so the notebook
# renders it again, now including the red markers that the preceding cell
# added for flagged centroids.
ax.get_figure()

In [None]:
# Base plot.  Only the first row of sma_intercepts is handed to the plotting
# helper; the regions of interest are drawn by hand below.
ax = abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_intercepts[0:1],
    method='value')

## Plot regions
min_y, max_y = ax.set_ylim(0, 1)
regions_shown = 0
for region in centroid_regions:
    # Horizontal extent of the region: first and last job start times.
    min_x = abcutils.core.pd2epoch(region.iloc[0]['_datetime_start'])
    max_x = abcutils.core.pd2epoch(region.iloc[-1]['_datetime_start'])

    # Light grey band behind the data, covering the full y-range.
    region_band = matplotlib.patches.Rectangle(
        xy=(min_x, min_y),
        width=(max_x - min_x),
        height=(max_y - min_y),
        facecolor='black',
        linewidth=0,
        alpha=0.10,
        zorder=0)
    ax.add_patch(region_band)

    # Red vertical line on each edge of the band (left edge first).
    for edge_x in (min_x, max_x):
        ax.plot([edge_x, edge_x], [min_y, max_y], color='red')

abcutils.plot.fix_xticks_timeseries(ax)

In [None]:
# Maximum p-value for a correlation to be reported.
pcutoff = 1.0e-5

# Column-oriented accumulator: one entry per (region, metric) pair that passes
# the significance test.  Converted to a DataFrame at the end of the cell.
results = {
    'region_start': [],
    'region_end': [],
    'region_start_index': [],
    'region_end_index': [],
    'metric': [],
    'coeff': [],
    'pvalue': [],
    'region_points': []
}
# Regions in which at least one metric passed the significance test.
identified_regions = []

for region in centroid_regions:
    x = region[plot_metric].values
    base_nan_filter = numpy.isnan(x)

    # Values reused for the title and for every result row of this region.
    region_start = region.iloc[0]['_datetime_start']
    region_end = region.iloc[-1]['_datetime_start']
    region_points = len(x[~base_nan_filter])

    title = "%s - %s (%d points)" % (region_start, region_end, region_points)
    # Single-argument print(...) behaves identically under Python 2 and also
    # parses under Python 3.
    print("new region: %s" % title)

    identified = False
    for y_label in umami_rows:
        # Correlating the plotted metric with itself is meaningless.
        if y_label == plot_metric:
            continue

        # Single .loc call instead of chained indexing.
        y = example_df.loc[region.index, y_label].values
        try:
            nan_filter = base_nan_filter | numpy.isnan(y)
        except TypeError:
            # non-numeric column; nothing to correlate
            continue

        this_x = x[~nan_filter]
        this_y = y[~nan_filter]

        # Pearson correlation and linear regression both need at least two
        # points.  A single point could never pass the p-value test, and
        # newer SciPy releases raise ValueError for it instead of returning
        # NaN.
        if len(this_y) < 2:
            continue

        coeff, pval = scipy.stats.pearsonr(this_x, this_y)
        # NaN p-values (e.g. constant input) compare False and are skipped.
        # The coefficient ceiling rejects near-perfect positive correlations.
        if not (pval < pcutoff and coeff < 0.9999):
            continue

        results['region_start'].append(region_start)
        results['region_end'].append(region_end)
        results['region_start_index'].append(region.index[0])
        results['region_end_index'].append(region.index[-1])
        results['metric'].append(y_label)
        results['coeff'].append(coeff)
        results['pvalue'].append(pval)
        results['region_points'].append(region_points)

        # Scatter plot of the correlated pair with a least-squares fit line.
        fig, ax = matplotlib.pyplot.subplots()
        ax.scatter(this_x, this_y)
        ax.set_xlabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
        ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(y_label, y_label).replace(' (', '\n('))
        ax.grid()
        fit = scipy.stats.linregress(this_x, this_y)
        # Re-apply the current x-limits so the fit line cannot rescale the axis.
        ax.set_xlim(ax.get_xlim())
        xticks = ax.get_xticks()
        ax.plot(xticks,
                fit.slope * xticks + fit.intercept,
                color='C2',
                linewidth=3)

        print("    Fit for %s %s" % (y_label, title))
        print("    Coefficient: %12.4f" % coeff)
        print("    p-value:     %12.4e" % pval)
        print("    Slope:       %12.4f" % fit.slope)
        print("    Rval:        %12.4e" % fit.rvalue)
        print("")
        identified = True

    # Keep track of regions that have known root causes
    if identified:
        identified_regions.append(region)
    print("")

results_df = pandas.DataFrame.from_dict(results)
results_df

## Plot identified region(s)

In [None]:
# Base plot.  Only the first row of sma_intercepts is handed to the plotting
# helper; the identified regions are shaded by hand below.
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_intercepts[0:1],
                                method='value')

ax.set_ylim(0, 1.0)

## Plot regions identified via correlation analysis
min_y, max_y = ax.get_ylim()

# Smallest start and largest end over all identified regions.  None means
# "no region seen yet"; compare against None explicitly so that a legitimate
# value of 0 is not mistaken for "unset".
overall_min_x = None
overall_max_x = None
for region in identified_regions:
    min_x = abcutils.core.pd2epoch(region.iloc[0]['_datetime_start'])
    max_x = abcutils.core.pd2epoch(region.iloc[-1]['_datetime_start'])
    if overall_min_x is None or min_x < overall_min_x:
        overall_min_x = min_x
    # The original comparison here was inverted, so it tracked the *smallest*
    # region end rather than the largest.
    if overall_max_x is None or max_x > overall_max_x:
        overall_max_x = max_x
    # Light red band behind the data, covering the full y-range.
    ax.add_patch(matplotlib.patches.Rectangle(xy=(min_x, min_y),
                 width=(max_x - min_x),
                 height=(max_y - min_y),
                 facecolor='red',
                 linewidth=0,
                 alpha=0.10,
                 zorder=0))

# Zoom in on the identified regions, leaving two region-widths of context on
# the left and one on the right.  With no identified regions there is nothing
# to zoom to, so the default x-limits are kept instead of failing on None.
if overall_min_x is not None:
    overall_width = overall_max_x - overall_min_x
    ax.set_xlim(overall_min_x - overall_width * 2,
                overall_max_x + overall_width * 1)
ax.get_figure().set_size_inches(6, 4)
ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

# Tick criteria: only dates falling on the first day of a month.
abcutils.plot.fix_xticks_timeseries(ax, criteria=(lambda x: x.day == 1))

## Plot for paper

In [None]:
# Two stacked panels: (a) the SMA time series with the region highlighted,
# (b) the scatter plot of the correlated metric pair within that region.
fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 6))

# Use the first reported correlation and slice out the rows of its region by
# index label (both end labels inclusive).
region = results_df.iloc[0]
region_df = example_df.loc[region['region_start_index']:region['region_end_index']]

# Plot #1
ax = axes[0]
ax.set_title("(a)", x=0.07, y=0.05)

abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_intercepts[0:1],
    method='value',
    ax=ax)
ax.set_ylim(0, 1)
min_y, max_y = ax.get_ylim()
overall_min_x = overall_max_x = None

# Horizontal extent of the region: first and last job start times.
min_x = abcutils.core.pd2epoch(region_df.iloc[0]['_datetime_start'])
max_x = abcutils.core.pd2epoch(region_df.iloc[-1]['_datetime_start'])
region_width = max_x - min_x

# Light red band behind the data, covering the full y-range.
highlight = matplotlib.patches.Rectangle(
    xy=(min_x, min_y),
    width=region_width,
    height=(max_y - min_y),
    facecolor='red',
    linewidth=0,
    alpha=0.10,
    zorder=0)
ax.add_patch(highlight)

# Show two region-widths of context on the left and one on the right.
ax.set_xlim(min_x - region_width * 2, max_x + region_width * 1)
ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

# NOTE(review): assumes the first two lines on the axes are the short and
# long SMA curves, in that order -- confirm against abcutils.plot.sma_overlaps.
sma_lines = ax.get_lines()
sma_lines[0].set_label("$SMA_{%d}$" % SHORT_WINDOW.days)
sma_lines[1].set_label("$SMA_{%d}$" % LONG_WINDOW.days)
ax.legend(loc='lower right')

abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", rotation=0, ha='center')

# Plot #2
ax = axes[1]
ax.set_title("(b)", x=0.07, y=0.8)

# Keep only the rows where both the plotted metric and the correlated metric
# are present.
xval = region_df[plot_metric]
yval = region_df[region['metric']]
nan_filter = numpy.isnan(xval) | numpy.isnan(yval)
keep = ~nan_filter
xval = xval[keep].values
yval = yval[keep].values

ax.scatter(xval, yval, color='C0', alpha=0.75)
ax.set_xlabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(region['metric'], region['metric']).replace(' (', '\n('))
ax.grid()
ax.set_ylim(0, 1)
ax.set_yticks([0, .25, .5, .75, 1.])

# Least-squares fit, drawn as a dashed line through the x tick positions.
fit = scipy.stats.linregress(xval, yval)
# Re-apply the current x-limits so the fit line cannot rescale the axis.
ax.set_xlim(ax.get_xlim())
xticks = ax.get_xticks()
ax.plot(xticks,
        fit.slope * xticks + fit.intercept,
        color='black',
        linestyle='--',
        linewidth=2)