In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
import matplotlib.patches
import matplotlib.pyplot
matplotlib.rcParams.update({'font.size': 18})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset through abcutils' sc18paper helper; the cells below slice
# this frame by (test platform, benchmark ID)
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform examined by the cells below; used as the first element of the
# group key when looking up a (platform, benchmark) group
TEST_PLATFORM = 'cscratch@cori-knl'

# Column plotted as the dependent variable
plot_metric = 'darshan_normalized_perf_by_max'

# Columns the dataset is grouped by; groups are fetched as (TEST_PLATFORM, benchmark ID)
group_by = ['_test_platform', '_benchmark_id']

# Echo the configuration; the metric falls back to its raw column name when
# abcutils.CONFIG['metric_labels'] has no entry for it
print("test_platform =", TEST_PLATFORM)
print("plot_metric =", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start =", abcutils.sc18paper.DATE_START.isoformat())
print("date_end =", abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Width of simple moving average (SMA) short/long windows
# NOTE: when LONG_WINDOW is longer than two years (365*2 days), the cells below
# substitute the global mean of the metric for the long SMA
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=3*365)

# Report the configured window widths
print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Baseline SMA vs Global Mean

In [None]:
def _draw_fig5_region_bounds(ax, sma_intercepts, min_seconds):
    """Mark wide regions between consecutive SMA intercepts with dashed lines

    Args:
        ax (matplotlib.axes.Axes): Axes to draw the boundary lines on
        sma_intercepts (pandas.DataFrame): Intercepts as returned by
            abcutils.features.find_sma_intercepts.  The first column of each
            row is converted with abcutils.core.pd2epoch.
        min_seconds (int): Only gaps between consecutive intercepts that are
            strictly larger than this value get their two endpoints marked.
    """
    epochs = [abcutils.core.pd2epoch(row[1]) for row in sma_intercepts.itertuples()]

    # Collect both endpoints of every sufficiently wide gap; a set avoids
    # drawing a shared endpoint twice
    boundaries = set()
    for region_start, region_end in zip(epochs, epochs[1:]):
        if (region_end - region_start) > min_seconds:
            boundaries.update((region_start, region_end))

    # Span the y range the axes has at this point (before any later restyling)
    y_min, y_max = ax.get_ylim()
    for boundary_x in sorted(boundaries):
        ax.plot([boundary_x, boundary_x], [y_min, y_max], color='black', linestyle='--')


def draw_fig5(filtered_df, plot_metric, benchmark_ids, labels, draw_region_bounds=0):
    """Draw Figure 5 from the Year in the Life paper

    One pane is drawn per entry of benchmark_ids, stacked vertically with a
    shared x axis.  Besides its arguments, this function reads the
    notebook-level settings ``group_by``, ``TEST_PLATFORM``, ``SHORT_WINDOW``,
    and ``LONG_WINDOW``.  When ``LONG_WINDOW`` is longer than two years, the
    global mean of plot_metric is used in place of the long SMA.

    Args:
        filtered_df (pandas.DataFrame): Feature vectors to be plotted
        plot_metric (str): Column name of dependent variable (usually performance)
        benchmark_ids (list): Benchmark ID strings for benchmarks/systems to draw
        labels (list): Logical names to be used to label each benchmark_ids' pane
        draw_region_bounds (int): Time in seconds below which region boundaries
            should not be drawn.  If zero, do not draw region boundaries.

    Returns:
        matplotlib.figure.Figure: Figure containing one or more axes with the
            requested plots

    Raises:
        ValueError: If fewer labels than benchmark_ids are provided
    """
    Y_MAX = 1
    Y_STEP = 0.2
    SECONDS_PER_DAY = 86400

    nrows = len(benchmark_ids)
    if len(labels) < nrows:
        # Fail before doing any plotting rather than part-way through styling
        raise ValueError("need one label per benchmark ID (got %d labels for %d benchmark IDs)"
                         % (len(labels), nrows))

    # squeeze=False always returns a 2D array of axes, so single-pane and
    # multi-pane figures are handled identically
    fig, axes = matplotlib.pyplot.subplots(nrows=nrows,
                                           ncols=1,
                                           figsize=(8, 3*nrows),
                                           sharex=True,
                                           squeeze=False)
    panes = axes[:, 0]

    use_global_mean = LONG_WINDOW.days > 365*2
    grouped_df = filtered_df.groupby(by=group_by)

    for ax, _benchmark_id in zip(panes, benchmark_ids):
        _example_df = grouped_df.get_group((TEST_PLATFORM, _benchmark_id))

        sma_short = abcutils.features.calculate_sma(_example_df,
                                                    '_datetime_start',
                                                    plot_metric,
                                                    window=SHORT_WINDOW)
        if use_global_mean:
            # use the global mean rather than rely on a sufficiently long
            # window to calculate it--just to be safe!
            sma_long = pandas.Series(_example_df[plot_metric].mean(),
                                     index=sma_short.index)
        else:
            sma_long = abcutils.features.calculate_sma(_example_df,
                                                       '_datetime_start',
                                                       plot_metric,
                                                       window=LONG_WINDOW)

        sma_intercepts = abcutils.features.find_sma_intercepts(sma_short,
                                                               sma_long,
                                                               _example_df['_datetime_start'])

        abcutils.plot.sma_overlaps(dataframe=_example_df,
                                   plot_metric=plot_metric,
                                   short_window=SHORT_WINDOW,
                                   long_window=LONG_WINDOW,
                                   sma_intercepts=sma_intercepts,
                                   ax=ax,
                                   method='value')

        if draw_region_bounds:
            _draw_fig5_region_bounds(ax, sma_intercepts, draw_region_bounds)

    for index, ax in enumerate(panes):
        # Restyle the SMA lines.  NOTE(review): this relies on sma_overlaps
        # adding the short SMA as the first line and the long SMA as the
        # second line of each axes -- confirm against abcutils.plot.
        pane_lines = ax.get_lines()
        pane_lines[1].set_color('C2')
        if use_global_mean:
            pane_lines[1].set_label("Global mean")
            pane_lines[0].set_label("SMA")
        else:
            pane_lines[1].set_label("$SMA_{long}$")
            pane_lines[0].set_label("$SMA_{short}$")

        # Set x ticks: label the first day of every even-numbered month
        abcutils.plot.fix_xticks_timeseries(ax,
                                            format="%b %Y",
                                            criteria=lambda x: x.day == 1 and x.month % 2 == 0)

        # Set pane label in the lower-left corner of the pane
        ax.set_title(labels[index],
                     x=0.025,
                     y=(0.025),
                     fontsize=matplotlib.rcParams.get('font.size'),
                     ha='left',
                     backgroundcolor='#FFFFFFDD')

        # Fix y labels and limits; multi-pane figures get one shared y label
        # via fig.text() below instead of per-pane labels
        if nrows > 1:
            ax.set_ylabel("")
        ax.set_ylim(0, Y_MAX)
        if index == 0:
            ax.set_yticks(numpy.arange(0, Y_MAX + Y_STEP, Y_STEP))
        else:
            # Lower panes omit the topmost tick -- presumably so it does not
            # collide with the pane above, since hspace is zero
            ax.set_yticks(numpy.arange(0, Y_MAX, Y_STEP))

        # Keep only rectangles that are exactly one day wide and hide every
        # other Rectangle child of the axes
        for rect in ax.get_children():
            if isinstance(rect, matplotlib.patches.Rectangle) and rect.get_width() != SECONDS_PER_DAY:
                rect.set_visible(False)

    print("Showing", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
    print("Test platform:", TEST_PLATFORM)
    print("SMA window:", SHORT_WINDOW)
    fig.subplots_adjust(hspace=0.0, wspace=0.0)

    # Draw the legend on the bottom pane only
    panes[-1].legend(loc='lower right', bbox_to_anchor=(1.01, 0.0))

    if nrows > 1:
        # Single y label shared by all panes
        fig.text(0.02, 0.5,
                 "Fraction Peak Performance",
                 verticalalignment='center',
                 horizontalalignment='center',
                 rotation='vertical',
                 fontsize=matplotlib.rcParams.get('font.size'))

    return fig

In [None]:
# Two-pane figure: HACC write above HACC read
fig = draw_fig5(filtered_df=filtered_df,
                plot_metric=plot_metric,
                benchmark_ids=['hacc_io_write_fpp_write', 'hacc_io_read_fpp_read'],
                labels=["(a) HACC write", "(b) HACC read"])

# File name is keyed on the part of TEST_PLATFORM before the '@'
output_file = "figs/longterm-%s-hacc.pdf" % (TEST_PLATFORM.split('@', 1)[0])
# savefig() does not create missing directories, so make sure figs/ exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, bbox_inches='tight')
print("Saved to", output_file)

In [None]:
# Single-pane figure for HACC write, marking regions between SMA intercepts
# that are wider than 30 days
fig = draw_fig5(filtered_df=filtered_df,
                plot_metric=plot_metric,
                benchmark_ids=['hacc_io_write_fpp_write'],
                labels=["HACC write on Cori"],
                draw_region_bounds=30*86400)
fig.set_size_inches(8, 4)
ax = fig.axes[0]
# Put the y label on a single line
ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

# File name is keyed on the part of TEST_PLATFORM before the '@'
output_file = "figs/longterm-%s-hacc-write.pdf" % (TEST_PLATFORM.split('@', 1)[0])
# savefig() does not create missing directories, so make sure figs/ exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, bbox_inches='tight')
print("Saved to", output_file)

## Difference between SMAs

In [None]:
# Group once; every benchmark below is looked up from the same grouping
_grouped_df = filtered_df.groupby(by=group_by)

for _benchmark_id in ['hacc_io_write_fpp_write', 'hacc_io_read_fpp_read']:
    _example_df = _grouped_df.get_group((TEST_PLATFORM, _benchmark_id))

    sma_short = abcutils.features.calculate_sma(_example_df,
                                                '_datetime_start',
                                                plot_metric,
                                                window=SHORT_WINDOW)
    if LONG_WINDOW.days > 365*2:
        # use the global mean rather than rely on a sufficiently long window to calculate it--just to be safe!
        sma_long = pandas.Series(_example_df[plot_metric].mean(),
                                 index=sma_short.index)
    else:
        sma_long = abcutils.features.calculate_sma(_example_df,
                                                   '_datetime_start',
                                                   plot_metric,
                                                   window=LONG_WINDOW)

    fig, ax = matplotlib.pyplot.subplots(figsize=(8,4))

    # Convert each index timestamp to epoch seconds (via time.mktime, i.e.
    # interpreted as local time) truncated down to a multiple of 86400
    x = sma_short.index.map(lambda timestamp: int(time.mktime(timestamp.timetuple()) / 86400) * 86400)

    ax.grid()
    ax.plot(x, sma_short - sma_long)
    # Title each figure so the per-benchmark plots can be told apart
    ax.set_title(_benchmark_id)
    ax.set_ylabel("Fraction Peak Performance\n$SMA_{short} - SMA_{long}$")
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d, %Y")