# Performance Variation over Time

This notebook contains a series of plots that show how performance varies over long periods of time.  The analysis uses the same method that generated Figure 5 in the SC'18 paper, but this notebook is a little more flexible in how it ingests datasets.

In [None]:
%matplotlib inline

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 18})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# Summary CSV file to ingest, keyed by the name of the system it describes.
_input_datasets = {
    'cori': 'summaries/cori-summaries_2017-02-14-2019-03-31.csv',
}

# Load the per-system summary CSVs into a single dataframe.
dataframe = abcutils.sc18paper.load_raw_datasets(input_datasets=_input_datasets)

# Cleaned copy of the dataset; every cell below works from this dataframe.
filtered_df = abcutils.sc18paper.clean_sc18_dataframe(dataframe)

## Define Input Parameters

Select one combination of file system and compute system (`TEST_PLATFORM`) and one performance metric to show what this analysis can do.

In [None]:
# Platform whose measurements are analyzed in the cells below.
# NOTE(review): the value looks like "<file system>@<compute system>" --
# confirm against the values in the '_test_platform' column.
TEST_PLATFORM = 'cscratch@cori-knl'

# Column used as the dependent variable by default
plot_metric = 'darshan_normalized_perf_by_max'

# Columns that together select one time series out of the dataset
group_by = ['_test_platform', '_benchmark_id']

# Echo the chosen parameters so they are recorded alongside the figures
print("test_platform =", TEST_PLATFORM)
_metric_label = abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric)
print("plot_metric =", _metric_label)
_date_start = abcutils.sc18paper.DATE_START
print("date_start =", _date_start.isoformat())
_date_end = abcutils.sc18paper.DATE_END
print("date_end =", _date_end.isoformat())

In [None]:
# Widths of the short and long simple moving average (SMA) windows.
# The short window is a fixed two weeks.
SHORT_WINDOW = pandas.Timedelta(days=14)

# The long window is three times the span between the earliest and latest
# start times present in the dataset.
_start_times = filtered_df['_datetime_start']
_dataset_span = _start_times.max() - _start_times.min()
LONG_WINDOW = _dataset_span * 3

print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Baseline SMA vs Global Mean

In [None]:
def draw_fig5(filtered_df, plot_metric, benchmark_ids, labels, draw_region_bounds=0):
    """Draw Figure 5 from the Year in the Life paper

    Creates one vertically stacked pane per entry in ``benchmark_ids``.  Each
    pane is drawn by ``abcutils.plot.sma_overlaps`` and then restyled (line
    color and labels, x ticks, pane title, y limits and ticks).

    In addition to its arguments, this function reads the notebook-level
    globals ``TEST_PLATFORM``, ``group_by``, ``SHORT_WINDOW``, and
    ``LONG_WINDOW``, and it prints a short summary of what was plotted.

    Args:
        filtered_df (pandas.DataFrame): Feature vectors to be plotted
        plot_metric (str): Column name of dependent variable (usually performance)
        benchmark_ids (list): Benchmark ID strings for benchmarks/systems to draw
        labels (list): Logical names to be used to label each benchmark_ids' pane
        draw_region_bounds (int): Minimum time in seconds that must separate
            two consecutive SMA intercepts for that pair to be marked with
            dashed vertical lines.  If zero, do not draw region boundaries.

    Returns:
        matplotlib.figure.Figure: Figure containing one or more axes with the
            requested plots
    """
    nrows = len(benchmark_ids)
    fig, axes = matplotlib.pyplot.subplots(nrows=nrows, ncols=1, figsize=(8, 3*nrows), sharex=True)

    # If the long window exceeds two years, the long SMA is replaced by the
    # global mean of each benchmark's measurements.
    use_global_mean = LONG_WINDOW.days > 365*2

    # Grouping does not depend on the benchmark, so do it once up front
    grouped = filtered_df.groupby(by=group_by)

    # Largest upper y limit seen across all panes
    abs_ymax = None

    for index, _benchmark_id in enumerate(benchmark_ids):
        _example_df = grouped.get_group((TEST_PLATFORM, _benchmark_id))

        sma_short = abcutils.features.calculate_sma(_example_df,
                                                    '_datetime_start',
                                                    plot_metric,
                                                    window=SHORT_WINDOW)
        if use_global_mean:
            # use the global mean rather than rely on a sufficiently long window to calculate it--just to be safe!
            sma_long = pandas.Series(_example_df[plot_metric].mean(),
                                     index=sma_short.index)
        else:
            sma_long = abcutils.features.calculate_sma(_example_df,
                                                       '_datetime_start',
                                                       plot_metric,
                                                       window=LONG_WINDOW)

        sma_intercepts = abcutils.features.find_sma_intercepts(sma_short, sma_long, _example_df['_datetime_start'])

        # subplots() returns a bare Axes (not an array) when there is one row
        ax = axes[index] if nrows > 1 else axes

        abcutils.plot.sma_overlaps(dataframe=_example_df,
                                   plot_metric=plot_metric,
                                   short_window=SHORT_WINDOW,
                                   long_window=LONG_WINDOW,
                                   sma_intercepts=sma_intercepts,
                                   ax=ax,
                                   raw_data_interval=86400)

        # Remember the tallest pane so non-normalized plots can share a y range
        y_min, y_max = ax.get_ylim()
        abs_ymax = y_max if abs_ymax is None else max(abs_ymax, y_max)

        if draw_region_bounds:
            intercept_xs = set()
            last_x_val = None
            for row in sma_intercepts.itertuples():
                # row[0] is the index; row[1] is the first column of sma_intercepts
                # NOTE(review): presumably pd2epoch converts a timestamp to
                # epoch seconds -- confirm against abcutils.core
                x_val = abcutils.core.pd2epoch(row[1])
                # only mark pairs of consecutive intercepts that are separated
                # by more than draw_region_bounds
                if last_x_val is not None and (x_val - last_x_val) > draw_region_bounds:
                    intercept_xs.add(last_x_val)
                    intercept_xs.add(x_val)
                last_x_val = x_val
            for intercept_x in intercept_xs:
                ax.plot([intercept_x, intercept_x], [y_min, y_max], color='black', linestyle='--')

    for index, ax in enumerate(fig.axes):
        # Restyle the SMA lines
        # NOTE(review): assumes sma_overlaps adds the short SMA as line 0 and
        # the long SMA as line 1 -- confirm against abcutils.plot
        sma_lines = ax.get_lines()
        sma_lines[1].set_color('C2')
        if use_global_mean:
            # raw string: "\i" is not a valid escape sequence in a normal string
            sma_lines[1].set_label(r"$SMA_\infty$")
            sma_lines[0].set_label("SMA")
        else:
            sma_lines[1].set_label("$SMA_{long}$")
            sma_lines[0].set_label("$SMA_{short}$")

        # Set x ticks
        abcutils.plot.fix_xticks_timeseries(ax,
                                            format="%b %Y",
                                            criteria=lambda x: x.day == 1 and x.month % 2 == 0)

        # Set pane label
        ax.set_title(labels[index],
                     x=0.025,
                     y=(0.025),
                     fontsize=matplotlib.rcParams.get('font.size'),
                     ha='left',
                     backgroundcolor='#FFFFFFDD')

        # Fix y labels and limits
        if nrows > 1:
            # a single shared y label is added to the figure below instead
            ax.set_ylabel("")

        if 'normalized' in plot_metric:
            ymax = 1
            ystep = 0.2
            ax.set_ylim(0, ymax)
            if index == 0:
                ax.set_yticks(numpy.arange(0, ymax + ystep, ystep))
            else:
                # omit the topmost tick on lower panes; the panes are stacked
                # with no vertical gap (hspace=0 below)
                ax.set_yticks(numpy.arange(0, ymax, ystep))
        else:
            ax.set_ylim(0, abs_ymax)
            ax.yaxis.set_major_locator(matplotlib.ticker.AutoLocator())
            print("Setting ymax to %f" % abs_ymax)

        # Hide every Rectangle that is not exactly 86400 wide (the
        # raw_data_interval passed to sma_overlaps above)
        for child in ax.get_children():
            if isinstance(child, matplotlib.patches.Rectangle) and child.get_width() != 86400:
                child.set_visible(False)

    print("Showing", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
    print("Test platform:", TEST_PLATFORM)
    print("SMA window:", SHORT_WINDOW)
    fig.subplots_adjust(hspace=0.0, wspace=0.0)

    # Draw the legend
    fig.axes[-1].legend()

    if nrows > 1:
        # NOTE(review): this shared label is used even when plot_metric is not
        # a normalized metric -- confirm that is intended
        fig.text(0.02, 0.5,
                 "Fraction Peak Performance",
                 verticalalignment='center',
                 horizontalalignment='center',
                 rotation='vertical',
                 fontsize=matplotlib.rcParams.get('font.size'))

    return fig

In [None]:
# Each entry: [pane (a) benchmark ID, pane (b) benchmark ID, pane (a) label, pane (b) label]
benchmark_ids_list = [
    ['ior_shared_write', 'ior_shared_read', '(a) IOR/s write', '(b) IOR/s read'],
    ['ior_fpp_write', 'ior_fpp_read', '(a) IOR/fpp write', '(b) IOR/fpp read'],
    ['vpicio_uni_shared_write', 'dbscan_read_shared_read', '(a) VPIC write', '(b) BDCATS read'],
    ['hacc_io_write_fpp_write', 'hacc_io_read_fpp_read', '(a) HACC write', '(b) HACC read'],
]

# Leading '_'-delimited token of each entry's first benchmark ID.  Used below
# to detect entries that would otherwise be saved under the same file name.
_first_tokens = [entry[0].split('_')[0] for entry in benchmark_ids_list]

for benchmark_ids in benchmark_ids_list:
    fig = draw_fig5(filtered_df=filtered_df,
                    plot_metric='darshan_agg_perf_by_slowest_posix_gibs',
                    benchmark_ids=benchmark_ids[0:2],
                    labels=benchmark_ids[2:4])

    # Name the file after the first token of the benchmark ID.  Both IOR
    # entries start with 'ior', so when a token is shared, add the second
    # token too ('ior-shared', 'ior-fpp'); otherwise the later figure would
    # silently overwrite the earlier one.
    _tokens = benchmark_ids[0].split('_')
    _file_tag = _tokens[0]
    if _first_tokens.count(_file_tag) > 1:
        _file_tag = '-'.join(_tokens[:2])

    output_file = "figs/longterm-%s-%s.pdf" % (TEST_PLATFORM.split('@', 1)[0], _file_tag)
    fig.savefig(output_file, bbox_inches='tight')
    print("Saved to", output_file)

In [None]:
# Single-pane figure for HACC write.  Region boundaries are requested with a
# threshold of 30 days, expressed in seconds.
fig = draw_fig5(
    filtered_df=filtered_df,
    plot_metric=plot_metric,
    benchmark_ids=['hacc_io_write_fpp_write'],
    labels=["HACC write on Cori"],
    draw_region_bounds=30*86400,
)
fig.set_size_inches(8, 4)

# Collapse the y label onto one line by turning its line breaks into spaces
ax = fig.axes[0]
_ylabel_parts = ax.get_ylabel().split("\n")
ax.set_ylabel(" ".join(_ylabel_parts))

# The file name carries the part of TEST_PLATFORM that precedes the first '@'
_platform_prefix = TEST_PLATFORM.partition('@')[0]
output_file = "figs/longterm-%s-hacc-write.pdf" % (_platform_prefix)
fig.savefig(output_file, bbox_inches='tight')
print("Saved to", output_file)

## Difference between SMAs

In [None]:
# Plot (short SMA - long SMA) over time, one line per benchmark, for TEST_PLATFORM.
fig, ax = matplotlib.pyplot.subplots(figsize=(12,4))

# Loop-invariant setup: group once, and decide once whether the long SMA is
# replaced by the global mean (long window longer than two years).
_grouped = filtered_df.groupby(by=group_by)
_use_global_mean = LONG_WINDOW.days > 365*2

# To plot only a subset, iterate over an explicit list instead, e.g.
# ['hacc_io_write_fpp_write', 'hacc_io_read_fpp_read']
for _benchmark_id in filtered_df['_benchmark_id'].unique():
    _example_df = _grouped.get_group((TEST_PLATFORM, _benchmark_id))

    sma_short = abcutils.features.calculate_sma(_example_df,
                                                '_datetime_start',
                                                plot_metric,
                                                window=SHORT_WINDOW)
    if _use_global_mean:
        # use the global mean rather than rely on a sufficiently long window to calculate it--just to be safe!
        sma_long = pandas.Series(_example_df[plot_metric].mean(),
                                 index=sma_short.index)
    else:
        sma_long = abcutils.features.calculate_sma(_example_df,
                                                   '_datetime_start',
                                                   plot_metric,
                                                   window=LONG_WINDOW)

    # x: each timestamp as local-time epoch seconds, rounded down to a
    # multiple of 86400
    x = sma_short.index.map(lambda stamp: int(time.mktime(stamp.timetuple()) / 86400) * 86400)
    y = sma_short - sma_long

    ax.plot(x, y, label=abcutils.config.CONFIG['benchmark_labels_short'].get(_benchmark_id, _benchmark_id))
    # Kept inside the loop as in the original; NOTE(review): this can probably
    # be called once after the loop -- confirm against abcutils.plot
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d, %Y")

ax.set_ylabel("Fraction Peak Performance\n$SMA_{short} - SMA_{long}$")
ax.legend(bbox_to_anchor=(1.05, 1.00), loc='upper left')

# Turn the grid on explicitly.  A bare ax.grid() *toggles* visibility, so
# calling it once per benchmark plus once more made the result depend on
# whether the number of benchmarks was odd or even.
ax.grid(True)

## Calculate Simple Moving Deviation

The following plot answers the question of whether performance _variation_ changes over time.

In [None]:
# Plot a sliding standard deviation of plot_metric over time, one line per
# benchmark, relative to the standard deviation over the final SHORT_WINDOW
# of that benchmark's data.
fig, ax = matplotlib.pyplot.subplots(figsize=(12,4))

# Loop-invariant setup
_grouped = filtered_df.groupby(by=group_by)
x_column = '_datetime_start'
y_column = plot_metric
window = SHORT_WINDOW
half_window = window / 2.0

for _benchmark_id in filtered_df['_benchmark_id'].unique():
    _example_df = _grouped.get_group((TEST_PLATFORM, _benchmark_id))

    # calculate the simple moving standard deviation: for every measurement,
    # take the stdev of all measurements whose start time falls within
    # [start - half_window, start + half_window)
    indices = []
    values = []
    for start in _example_df[x_column]:
        window_start = start - half_window
        window_end = start + half_window
        in_window = (_example_df[x_column] >= window_start) & (_example_df[x_column] < window_end)
        indices.append(start)
        values.append(_example_df.loc[in_window, y_column].std())
    smsd_short = pandas.Series(values, index=indices, name=y_column).sort_index()

    # x: each timestamp as local-time epoch seconds, rounded down to a
    # multiple of 86400
    x = smsd_short.index.map(lambda stamp: int(time.mktime(stamp.timetuple()) / 86400) * 86400)

    # Reference value: stdev over the last SHORT_WINDOW of this benchmark's data
    y_filt = _example_df['_datetime_start'] > _example_df['_datetime_start'].max() - SHORT_WINDOW
    y = smsd_short - _example_df[y_filt][plot_metric].std()
    # To plot the absolute sliding stdev instead, use: y = smsd_short

    ax.plot(x, y, label=abcutils.config.CONFIG['benchmark_labels_short'].get(_benchmark_id, _benchmark_id))
    # Kept inside the loop as in the original; NOTE(review): this can probably
    # be called once after the loop -- confirm against abcutils.plot
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d, %Y")

# The original label ended with a stray, unmatched '$' that was drawn literally
ax.set_ylabel("Sliding stdev relative to\nstdev over last %s-day window" % SHORT_WINDOW.days)
ax.legend(bbox_to_anchor=(1.05, 1.00), loc='upper left')

# Turn the grid on explicitly.  A bare ax.grid() *toggles* visibility, so
# calling it once per benchmark plus once more made the result depend on
# whether the number of benchmarks was odd or even.
ax.grid(True)