In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the combined dataset that every later cell slices by
# (_test_platform, _benchmark_id).  How/where the helper caches its result is
# not visible from this notebook -- NOTE(review): confirm where `CACHE_FILE`
# (mentioned in the text above) is defined.
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform to examine.  The commented-out lines are alternative values
# that can be swapped in.
# TEST_PLATFORM = 'scratch2@edison'
TEST_PLATFORM = 'cscratch@cori-knl'
#TEST_PLATFORM = 'cscratch@cori-haswell'
#TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark to examine (again, alternatives are left commented out).
# BENCHMARK_ID = 'ior_fpp_write'
# BENCHMARK_ID = 'dbscan_read_shared_read'
# BENCHMARK_ID = 'vpicio_uni_shared_write'
# BENCHMARK_ID = 'ior_shared_write'
BENCHMARK_ID = 'hacc_io_read_fpp_read'
# BENCHMARK_ID = 'hacc_io_write_fpp_write'

# Dataset column that is analyzed and plotted throughout the notebook
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
# One day, expressed in seconds
delta = datetime.timedelta(days=1).total_seconds()

# Columns whose value pairs select a single (platform, benchmark) series
group_by = ['_test_platform', '_benchmark_id']

# .copy() gives example_df its own data so it is independent of filtered_df
example_df = filtered_df.groupby(by=group_by).get_group((TEST_PLATFORM, BENCHMARK_ID)).copy()

# Each message is formatted into a single string and passed to print(...) so
# the cell produces the same output on Python 2 and Python 3 kernels.
# Labels fall back to the raw identifier when no pretty label is configured.
print("test_platform = %s" % TEST_PLATFORM)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % abcutils.sc18paper.DATE_START.isoformat())
print("date_end = %s" % abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Width of simple moving average (SMA) short/long windows.  These are stored
# as attributes on the abcutils.features module, so the values remain set on
# that module after this cell runs.
abcutils.features.SHORT_WINDOW = pandas.Timedelta(days=7)
abcutils.features.LONG_WINDOW = pandas.Timedelta(days=48)

short_window = abcutils.features.SHORT_WINDOW
long_window = abcutils.features.LONG_WINDOW

# Single-argument print(...) behaves identically on Python 2 and Python 3
print("Short window will average over %s at a time" % short_window)
print("Long window will average over %s at a time" % long_window)

## Calculate Simple Moving Averages (SMAs)

Compare a short-window SMA and a long-window SMA and use the places where they cross over to divide the entire year into _regions_ of interesting benchmark behavior.

In [None]:
# Find SMA-based loci for the example series (minima only, maxima disabled)
# and draw the summary plot for them.
loci = abcutils.features.generate_loci_sma(
    example_df,
    plot_metric,
    mins=True,
    maxes=False)
ax = abcutils.plot.locus_summary(example_df, plot_metric, loci)

# Title the figure with the pretty benchmark label when one is configured,
# otherwise with the raw benchmark ID.
benchmark_label = abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID)
ax.get_figure().suptitle("%s on %s" % (benchmark_label, TEST_PLATFORM))

## Calculate regions with a minimum size

In [None]:
# Notebook-level SMA windows used by the region-finding cells that follow.
# SHORT_WINDOW is also passed as the minimum region width (min_width) below.
# These are separate from the abcutils.features.SHORT_WINDOW/LONG_WINDOW
# module attributes assigned earlier in the notebook.
SHORT_WINDOW = pandas.Timedelta(days=7)
LONG_WINDOW = pandas.Timedelta(days=28)

In [None]:
# Window settings for the intercept search; the short window doubles as the
# minimum width passed to the helper.
intercept_options = {
    'short_window': SHORT_WINDOW,
    'long_window': LONG_WINDOW,
    'min_width': SHORT_WINDOW,
}
sma_intercepts = abcutils.features.sma_intercepts(example_df, plot_metric, **intercept_options)

# Leave the result as the cell's last expression so the notebook displays it
sma_intercepts

In [None]:
# Draw the example series together with the SMA intercepts computed above
overlap_options = dict(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_intercepts,
    method='value',
)
ax = abcutils.plot.sma_overlaps(**overlap_options)

# Mean of the plotted metric over the whole example series; the
# "Baseline SMA vs Global Mean" section prints this value later on.
global_mean = example_df[plot_metric].mean()

# Read the current x-limits and set them back explicitly.  The disabled line
# in between would draw the global mean across that same x-range.
xmin, xmax = ax.get_xlim()
#ax.plot([xmin, xmax], [global_mean, global_mean], color='C4')
ax.set_xlim(xmin, xmax)

Count the number of regions in each dataset.  This takes a while, so the loop in the cell below is commented out; uncomment it to run the count.

In [None]:
# Platforms to iterate over when counting regions
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    'mira-fs1@mira'
]

# Every distinct benchmark ID present in the loaded dataset
BENCHMARK_IDS = filtered_df['_benchmark_id'].unique()

# The loop below is disabled.  If it is re-enabled as written, note that it:
#   * rebinds the notebook-level `example_df` and `sma_intercepts`, which
#     other cells also use;
#   * uses a Python 2 print statement; and
#   * calls get_group() for every (platform, benchmark) pair --
#     NOTE(review): confirm every pair actually exists in filtered_df.
#for _test_platform in TEST_PLATFORMS:
#    for _benchmark_id in BENCHMARK_IDS:
#        example_df = filtered_df.groupby(by=group_by).get_group((_test_platform, _benchmark_id))
#        sma_intercepts = abcutils.features.sma_intercepts(example_df,
#                                                      plot_metric,
#                                                      short_window=SHORT_WINDOW,
#                                                      long_window=LONG_WINDOW,
#                                                      min_width=SHORT_WINDOW)
#        print _test_platform, _benchmark_id, len(sma_intercepts.index)

## Baseline SMA vs Global Mean

In [None]:
# Windows for the baseline comparison.  This rebinds the SHORT_WINDOW and
# LONG_WINDOW names used by earlier cells.  The plotting cell that follows
# labels the long-window line "Global mean" whenever LONG_WINDOW is longer
# than two years, which is the case for this three-year value.
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=3*365)
#LONG_WINDOW = pandas.Timedelta(days=56)

# Single-argument print(...) behaves identically on Python 2 and Python 3
print("Using a short window of %s" % SHORT_WINDOW)

In [None]:
# Two stacked panes sharing an x-axis: HACC write on top, HACC read below.
# Each pane shows the short-window SMA against the series' global mean, and
# the finished figure is written to figs/ as a PDF.
fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 6), sharex=True)

# y-axis upper limit and tick spacing shared by both panes
YMAX = 500
YSTEP = 100

# Rectangles with exactly this width are kept visible; all others are hidden.
# NOTE(review): presumably one day in seconds, i.e. the per-day bars drawn by
# abcutils.plot.sma_overlaps -- confirm against that helper.
KEPT_BAR_WIDTH = 86400

# Intercepts found for each pane, in pane order; the histogram cell reads this
sma_intercepts_list = []

labels = ["(a) HACC write", "(b) HACC read"]

# Group once up front rather than regrouping the full dataset per benchmark
grouped_df = filtered_df.groupby(by=group_by)

for index, _benchmark_id in enumerate(['hacc_io_write_fpp_write', 'hacc_io_read_fpp_read']):
    _example_df = grouped_df.get_group((TEST_PLATFORM, _benchmark_id))

    sma_short = abcutils.features.calculate_sma(_example_df,
                                                '_datetime_start',
                                                plot_metric,
                                                window=SHORT_WINDOW)

    # use the global mean rather than rely on a sufficiently long window to calculate it--just to be safe!
    sma_long = pandas.Series(_example_df[plot_metric].mean(), index=sma_short.index)

    sma_intercepts = abcutils.features.find_sma_intercepts(sma_short, sma_long, _example_df['_datetime_start'])
    sma_intercepts_list.append(sma_intercepts)

    ax = axes[index]
    abcutils.plot.sma_overlaps(dataframe=_example_df,
                               plot_metric=plot_metric,
                               short_window=SHORT_WINDOW,
                               long_window=LONG_WINDOW,
                               sma_intercepts=sma_intercepts,
                               ax=ax,
                               method='value')

for index, ax in enumerate(fig.axes):
    # Restyle the SMA lines.  This relies on the order in which
    # abcutils.plot.sma_overlaps adds its lines: [0] is treated as the short
    # SMA and [1] as the long SMA.
    ax.get_lines()[1].set_color('C2')
    if LONG_WINDOW.days > 365*2:
        # a multi-year window is labeled as the global mean
        ax.get_lines()[1].set_label("Global mean")
    else:
        ax.get_lines()[1].set_label("$SMA_{%d}$" % LONG_WINDOW.days)
    ax.get_lines()[0].set_label("$SMA_{%d}$" % SHORT_WINDOW.days)

    # Set x ticks
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d, %Y")

    # Set pane label
    ax.set_title(labels[index], x=0.025, y=0.825, fontsize=16, ha='left')

    # Fix y labels and limits
    ax.set_ylabel("")
    y_min, y_max = ax.set_ylim(0, YMAX)
    if index == 0:
        ax.set_yticks(range(0, YMAX + YSTEP, YSTEP))
    else:
        # The panes touch (hspace=0), so the lower pane omits its top tick
        ax.set_yticks(range(0, YMAX, YSTEP))

    # Hide every rectangle that does not have the kept bar width
    for patch in ax.get_children():
        if isinstance(patch, matplotlib.patches.Rectangle) and patch.get_width() != KEPT_BAR_WIDTH:
            patch.set_visible(False)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
# global_mean and BENCHMARK_ID come from the single-platform cells above.
print("Showing %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("Test platform: %s" % TEST_PLATFORM)
print("Benchmark: %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("Mean performance: %s" % global_mean)
print("SMA window: %s" % SHORT_WINDOW)
output_file = "figs/longterm-%s-%s.pdf" % (TEST_PLATFORM.split('@', 1)[0], BENCHMARK_ID.split('_', 1)[0])
fig.subplots_adjust(hspace=0.0, wspace=0.0)
axes[1].legend(loc='upper right')#, bbox_to_anchor=(1.00, 0.0))


# One y-axis label for both panes, centered vertically on the figure
fig.text(0.02, 0.5,
         "Performance (GiB/s)",
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=16)

# savefig does not create missing directories, so make sure figs/ exists
# (written without exist_ok so it also works on Python 2)
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)

fig.savefig(output_file, bbox_inches='tight')
print("Saved to %s" % output_file)

Plot a histogram of the region widths for fun.

In [None]:
# Histogram of region widths: the time between consecutive SMA intercepts,
# one histogram per pane of the preceding figure.
fig, ax = matplotlib.pyplot.subplots()

# Loop names carry a leading underscore so this cell does not overwrite the
# notebook-level `delta`, `index` or `sma_intercepts`.
for _intindex, _intercepts in enumerate(sma_intercepts_list):
    # Walk the start times once instead of doing two .iloc row lookups per region
    _starts = list(_intercepts['_datetime_start'])
    vals = [(_later - _earlier).total_seconds() / 86400.0
            for _earlier, _later in zip(_starts[:-1], _starts[1:])]

    if not vals:
        # Fewer than two intercepts means there are no regions to summarize
        print("No regions found for %s" % labels[_intindex])
        continue

    ax.hist(vals, alpha=0.5, linewidth=2, edgecolor='black', label=labels[_intindex])

    # Single-argument print(...) behaves identically on Python 2 and Python 3
    print("Sorted widths (days): %s" % sorted([round(x) for x in vals]))
    print("# values: %s" % len(vals))
    print("avg width: %s" % numpy.mean(vals))
    print("median:    %s" % numpy.median(vals))

#ax.set_xscale("log")
ax.grid()
ax.legend()
ax.set_ylabel("Number of regions")
ax.set_xlabel("Region width (days)")

## Difference between SMAs

In [None]:
# sma_short and sma_long are whatever the last iteration of the two-pane
# figure loop left behind (the 'hacc_io_read_fpp_read' series), and sma_long
# there is the constant global mean, so this is the short SMA's offset from
# that mean.
sma_difference = sma_short - sma_long

fig, ax = matplotlib.pyplot.subplots(figsize=(8, 4))
ax.grid()
ax.plot(sma_difference)