In [None]:
%matplotlib inline

In [None]:
import copy
import datetime
import os
import time

import matplotlib
import numpy
import pandas
import scipy.stats
import seaborn
import tokio

import abcutils
matplotlib.rcParams.update({'font.size': 16})

## Load and Synthesize Data from CSV

In [None]:
# Load the dataset that every later cell filters and plots, using the
# loader provided by abcutils.sc18paper.
filtered_df = abcutils.sc18paper.load_dataset()

In [None]:
# Test platforms to include.  Used below to build the heatmap's columns,
# to filter the dataset, and to scale the height of the overview figure.
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    'mira-fs1@mira'
]
# Benchmarks to include.  Combined with TEST_PLATFORMS to form one heatmap
# column per (platform, benchmark) pair.
BENCHMARK_IDS = [
    'ior_shared_write',
    'ior_fpp_write',
    'hacc_io_write_fpp_write',
    'vpicio_uni_shared_write',
    'ior_shared_read',
    'ior_fpp_read',
    'hacc_io_read_fpp_read',
    'dbscan_read_shared_read'
]

# Dataset column whose values are stored in the heatmap and labelled on the colorbar
plot_metric = 'darshan_normalized_perf_by_max'

# NOTE(review): not referenced by any other cell visible in this notebook;
# confirm it is unused before removing.
group_by = ['_test_platform', '_benchmark_id']

# Echo the settings in effect so they are recorded alongside the figures
print("plot_metric =", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start =", abcutils.sc18paper.DATE_START.isoformat())
print("date_end =", abcutils.sc18paper.DATE_END.isoformat())

## Matplotlib Heatmap

In [None]:
# Define regions of the heatmap to zoom in on, highlight in
# the overview, and draw accompanying graphics.
#
# Each region is a dict with:
#   x_start:  start of the time window (compared inclusively against the
#             time series' timestamps)
#   x_end:    end of the time window (compared exclusively)
#   y_filter: function called with each column label; returns True for the
#             columns (rows of the heatmap) that belong to the region
#
# time.mktime() interprets the given datetime in the local time zone and
# returns seconds since the epoch.
zoom_areas_new = [
    {
        'x_start': time.mktime(datetime.datetime(2017, 9, 14).timetuple()),
        'x_end': time.mktime(datetime.datetime(2017, 10, 4).timetuple()),
        'y_filter': lambda x: x.startswith('mira-fs1@mira'),
    },
    {
        'x_start': time.mktime(datetime.datetime(2017, 5, 25).timetuple()),
        'x_end': time.mktime(datetime.datetime(2017, 6, 14).timetuple()),
        'y_filter': lambda x: x.startswith('cscratch@cori-knl'),
    },
]

## Helper Functions

In [None]:
def value2label(test_platform, benchmark_id):
    """Build the "<test_platform>, <benchmark_id>" label that names a heatmap column."""
    return ", ".join((str(test_platform), str(benchmark_id)))

In [None]:
def generate_x_y_filters(timeseries, x_start, x_end, y_filter):
    """Generate boolean arrays for subselecting timeseries.timestamps and timeseries.columns

    Args:
        timeseries: Object exposing `timestamps` (a numpy array whose values
            can be compared with `x_start` and `x_end`) and `columns` (a
            sequence of column labels), e.g. a tokio.timeseries.TimeSeries
        x_start: Lower bound on the timestamps, inclusive.  The callers in
            this notebook pass seconds since the epoch.
        x_end: Upper bound on the timestamps, exclusive
        y_filter (callable): Called with each column label; a truthy return
            value selects that column

    Returns:
        tuple: (x mask, y mask), boolean numpy.ndarrays with one element per
        timestamp and one element per column, respectively
    """
    x_mask = (timeseries.timestamps >= x_start) & (timeseries.timestamps < x_end)
    # Force a boolean dtype so the result is always a mask, even when there are
    # no columns or the filter returns truthy non-bool values
    y_mask = numpy.array([bool(y_filter(label)) for label in timeseries.columns], dtype=bool)
    return x_mask, y_mask

def subselect_timeseries(timeseries, x_start, x_end, y_filter):
    """Convert a timeseries and y axis filtering criteria into a subselected dataset

    Args:
        timeseries (tokio.timeseries.TimeSeries): Dataset to convert to x, y, z values
        x_start: Starting time, inclusive, in the same units as
            `timeseries.timestamps` (the callers in this notebook pass seconds
            since the epoch)
        x_end: Ending time, exclusive, in the same units as `x_start`
        y_filter (callable): Boolean function to which each value of
            timeseries.columns is passed to determine if that column of
            `timeseries` should be included

    Returns:
        tuple: x, y, z values, all of type numpy.ndarray, corresponding to x labels (1d vector),
        y labels (1d vector), and the data to plot (2d matrix).  Can be passed directly to
        draw_perf_summary()
    """
    x_mask, y_mask = generate_x_y_filters(timeseries, x_start, x_end, y_filter)

    # Set the x and y values based on the boolean arrays
    x = timeseries.timestamps[x_mask]
    y = numpy.array(timeseries.columns)[y_mask]

    # Select the matching rows, then the matching columns.  Everything is read
    # from the `timeseries` argument (never from a notebook-level variable), so
    # the function works for whichever time series is passed in.
    z = timeseries.dataset[x_mask][:, y_mask]

    return x, y, z

In [None]:
def draw_perf_summary(x, y, z, ax=None):
    """Draw a heatmap of z with x along the horizontal axis and one row per entry of y

    Args:
        x (numpy.ndarray): 1d vector of numeric x values, one per row of `z`.
            Each value is used as the left edge of its heatmap cell.  A vector
            that already holds one more value than `z` has rows is used as the
            cell edges unchanged.
        y (sequence of str): Labels for the heatmap rows, one per column of `z`
        z (numpy.ndarray): 2d matrix of shape (len(x), len(y)).  Cells equal
            to 0.0 are masked and drawn in gray.
        ax (matplotlib.axes.Axes or None): Axes to draw into.  If None, a new
            figure holding the heatmap and a colorbar is created.

    Returns:
        matplotlib.axes.Axes: The axes that contain the heatmap
    """
    if ax is None:
        fig = matplotlib.pyplot.figure()
        fig.set_size_inches(16, (2.5 * len(TEST_PLATFORMS)))
        fig.subplots_adjust(hspace=0.0, wspace=0.01)

        gridspec = matplotlib.gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[16, 1])

        heatmap_ax = fig.add_subplot(gridspec[0])
    else:
        gridspec = None
        heatmap_ax = ax
        fig = heatmap_ax.get_figure()

    # Customize a copy of the colormap; calling set_bad() on matplotlib.cm.gist_heat
    # itself would change the colormap for every other plot in the session
    cmap = copy.copy(matplotlib.cm.gist_heat)
    cmap.set_bad(color='#666666', alpha=0.85)

    # pcolormesh needs the cell edges: one more x value than there are cells
    # along x.  Close the final cell by repeating the last spacing; with a
    # single x value fall back to one day (86400 s), the timestep of the
    # TimeSeries built in this notebook.
    x_edges = numpy.asarray(x)
    if len(x_edges) == z.shape[0] and len(x_edges) > 0:
        step = x_edges[-1] - x_edges[-2] if len(x_edges) > 1 else 86400
        x_edges = numpy.append(x_edges, x_edges[-1] + step)

    plotface = heatmap_ax.pcolormesh(x_edges,
                                     numpy.arange(z.shape[1]+1),
                                     numpy.ma.masked_where(z.T == 0.0, z.T),
                                     cmap=cmap,
                                     linewidth=0,
                                     rasterized=True)

    # Set y ticks: one label per row, centered on the row
    column_labels = []
    for label in y:
        for key, val in abcutils.CONFIG['benchmark_labels_short'].items():
            label = label.replace(key, val)
        label = label.replace('scratch1@edison', 'Edison')
        label = label.replace('cscratch@cori-knl', 'Cori')
        label = label.replace('mira-fs1@mira', 'Mira')
        column_labels.append(label)
    heatmap_ax.set_yticks(numpy.arange(len(column_labels)) + 0.5)
    heatmap_ax.set_yticklabels(column_labels, ha='right')

    abcutils.plot.fix_xticks_timeseries(heatmap_ax)

    if gridspec is not None:
        # Set colorbar (only when this function created the figure)
        colorbar_ax = fig.add_subplot(gridspec[1])
        fig.colorbar(plotface, cax=colorbar_ax)
        colorbar_ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))

    # Return the axes that were drawn on, which is not necessarily the first
    # axes of the figure when the caller supplied `ax`
    return heatmap_ax

## Create data object

We use tokio.timeseries.TimeSeries objects since they are a convenient, mutable container for storing a matrix of data with named rows and columns.  We could also have used a pandas DataFrame here, which would be more consistent with the rest of the analysis notebooks.

In [None]:
# One label per (benchmark, platform) pair, benchmarks varying slowest
columns = [
    value2label(test_platform, benchmark_id)
    for benchmark_id in BENCHMARK_IDS
    for test_platform in TEST_PLATFORMS
]

# Container with an 86400-second timestep and one (sorted) column per label,
# spanning the sc18paper date range
ts = tokio.timeseries.TimeSeries(
    dataset_name='blah',
    start=abcutils.sc18paper.DATE_START,
    end=abcutils.sc18paper.DATE_END,
    timestep=86400,
    num_columns=len(columns),
    column_names=sorted(columns),
    sort_hex=False)

In [None]:
example_df = filtered_df.copy()

# Keep only the benchmarks and platforms that have a column in the heatmap
test_filter = example_df['_benchmark_id'].isin(BENCHMARK_IDS)
test_filter &= example_df['_test_platform'].isin(TEST_PLATFORMS)

# Iterate over just the four columns that are needed, as plain tuples.  This
# avoids building a dict of every column for every row, and avoids relying on
# namedtuple attribute access (which cannot represent column names that start
# with an underscore without renaming them first).
needed_columns = ['_datetime_start', '_test_platform', '_benchmark_id', plot_metric]
selected_rows = example_df.loc[test_filter, needed_columns]
for datetime_start, test_platform, benchmark_id, metric_value in \
        selected_rows.itertuples(index=False, name=None):
    # Store each measurement in the cell for its time and its column
    ts.insert_element(datetime_start.to_pydatetime(),
                      value2label(test_platform, benchmark_id),
                      metric_value)

## Draw global heatmap

In [None]:
# Select every column that belongs to one of the platforms of interest, over
# the whole time range
x, y, z = subselect_timeseries(ts, 0, 9999999999, lambda x: x.split(',', 1)[0] in TEST_PLATFORMS)

ax = draw_perf_summary(x, y, z)

XPAD = 0*86400 # expand the box in x so the thick lines don't cover data we want to highlight
for zoom in zoom_areas_new:
    # Find the rows of the *drawn* heatmap (the entries of y) that belong to
    # this region, so the box lines up with what is on screen even if y is
    # only a subset of ts.columns
    row_indices = numpy.nonzero([bool(zoom['y_filter'](label)) for label in y])[0]
    if len(row_indices) == 0:
        # None of the drawn rows belong to this region; nothing to highlight
        continue
    ymin = row_indices.min()
    ymax = row_indices.max() + 1

    # The x axis is in the same units as the region bounds, so they are used
    # directly; the padding is applied to both sides of the box
    xmin = zoom['x_start'] - XPAD
    xmax = zoom['x_end'] + XPAD

    xy = (xmin, ymin)
    width = xmax - xmin
    height = ymax - ymin
    print("Added patch at %s of width=%s and height=%s" % (xy, width, height))

    # Draw a thick black line and a thinner green line over it
    # to make the zoomed-in region highlights stand out amidst
    # the noisy heatmap
    for edgecolor, linewidth in (('#000000FF', 8), ('C2', 4)):
        ax.add_patch(matplotlib.patches.Rectangle(
                     xy=xy,
                     width=width,
                     height=height,
                     facecolor="#00000000",
                     edgecolor=edgecolor,
                     linewidth=linewidth))

In [None]:
# Save the figure that `ax` belongs to as a PDF under figs/.
# NOTE(review): nothing in this notebook creates the figs/ directory -- confirm it exists.
ax.get_figure().savefig('figs/summary-heatmap.pdf', bbox_inches='tight')

## Draw zoomed-in regions of interest

In [None]:
# Old style of rendering based on slices
#for zoom in zoom_areas:
#    x = ts.timestamps[zoom['x']]
#    y = ts.columns[zoom['y']]
#    z = ts.dataset[zoom['x'], zoom['y']]
#    ax = draw_perf_summary(x, y, z)
#    fig = ax.get_figure()
#    fig.set_size_inches(4, (2.5 / 8 * len(y)))
#    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))

#    fig.axes[1].set_visible(False)

In [None]:
# New style based on datetimes
# For each region of interest, draw a standalone heatmap restricted to that
# region's time window and columns.  Note that x, y, z, ax and fig are
# notebook-level names and are rebound on every iteration.
for zoom in zoom_areas_new:
    x, y, z = subselect_timeseries(ts, zoom['x_start'], zoom['x_end'], zoom['y_filter'])
    ax = draw_perf_summary(x, y, z)
    fig = ax.get_figure()
    # 4 inches wide; the height is 2.5 inches per 8 rows
    fig.set_size_inches(4, (2.5 / 8 * len(y)))
    # weekday() == 6 is Sunday; presumably `criteria` selects which dates get
    # a tick -- see abcutils.plot.fix_xticks_timeseries to confirm
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))

    # Hide the figure's second axes (the colorbar that draw_perf_summary adds
    # when it creates the figure itself)
    fig.axes[1].set_visible(False)

### Create a single diagram for the paper

In [None]:
num_regions = len(zoom_areas_new)
fig, axes = matplotlib.pyplot.subplots(nrows=1, ncols=num_regions, figsize=(4*num_regions, 3))
# subplots() returns a bare Axes rather than an array when there is only one
# region; normalize so the code below works for any number of regions
axes = numpy.atleast_1d(axes)

# Draw heatmaps, one panel per region
for ax, zoom in zip(axes, zoom_areas_new):
    x, y, z = subselect_timeseries(ts, zoom['x_start'], zoom['x_end'], zoom['y_filter'])
    draw_perf_summary(x, y, z, ax=ax)

# Add window dressing
alphabet = 'abcdefghijklmnopqrstuvwxyz'
for index, ax in enumerate(axes):
    # Row labels look like "<machine>, <benchmark>".  Keep only the benchmark
    # on the axis and move the machine name into the panel title.
    machine = ''
    yticklabels = []
    for label in ax.get_yticklabels():
        machine, benchmark = label.get_text().split(', ', 1)
        yticklabels.append(benchmark)
    ax.set_yticklabels(yticklabels)
    ax.set_title("(%s) %s" % (alphabet[index], machine), fontsize=16)
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))

# Only the left-most panel keeps its row labels
for ax in axes[1:]:
    ax.set_yticklabels([])
fig.subplots_adjust(hspace=0.0, wspace=0.05)
fig.savefig('figs/regions-heatmap.pdf', bbox_inches='tight')

## Additional Figures for Presentation

In [None]:
# Express start/end time as seconds since the epoch (time.mktime() of a
# datetime interpreted in the local time zone)
x_start = time.mktime(datetime.datetime(2017, 11, 1).timetuple())
x_end = time.mktime(datetime.datetime(2017, 12, 11).timetuple())

# Restrict the data to that window and to the columns whose label starts with
# the Mira platform name
x, y, z = subselect_timeseries(ts, x_start, x_end, lambda x: x.startswith('mira-fs1@mira'))

# Draw the actual heatmap
ax = draw_perf_summary(x, y, z)

# Window dressing
# Drop the "Mira, " prefix from the row labels
ax.set_yticklabels([x.get_text().replace("Mira, ", "") for x in ax.get_yticklabels()])
fig = ax.get_figure()
# 8 inches wide: 4 inches per 20 days, scaled to this 40-day window; the
# height is 2.5 inches per 8 rows
fig.set_size_inches(4.0 / 20 * 40, (2.5 / 8 * len(y)))
# weekday() == 6 is Sunday; presumably `criteria` selects which dates get a
# tick -- see abcutils.plot.fix_xticks_timeseries to confirm
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))

# Hide the figure's second axes (the colorbar added by draw_perf_summary)
fig.axes[1].set_visible(False)
fig.savefig('figs/heatmap-mira-badregion.pdf', bbox_inches='tight', transparent=True)

In [None]:
# Express start/end time as seconds since the epoch (time.mktime() of a
# datetime interpreted in the local time zone)
x_start = time.mktime(datetime.datetime(2017, 5, 7).timetuple())
x_end = time.mktime(datetime.datetime(2017, 6, 21).timetuple())
# Platform and benchmark under study; both are also read by the cells below
_test_platform = 'cscratch@cori-knl'
_benchmark_id = 'ior_shared_read'

# Restrict the data to that window and to the columns whose label starts with
# the platform name.  The resulting x, y, z are reused by the next cell.
x, y, z = subselect_timeseries(ts, x_start, x_end, lambda x: x.startswith(_test_platform))

# Draw the actual heatmap
ax = draw_perf_summary(x, y, z)

# Window dressing
# Drop the "Cori, " prefix from the row labels
ax.set_yticklabels([x.get_text().replace("Cori, ", "") for x in ax.get_yticklabels()])
fig = ax.get_figure()
# NOTE(review): the width is scaled for 40 days (4 inches per 20 days) but this
# window spans 45 days -- confirm whether that is intentional
fig.set_size_inches(4.0 / 20 * 40, (2.5 / 8 * len(y)))
# weekday() == 6 is Sunday; presumably `criteria` selects which dates get a
# tick -- see abcutils.plot.fix_xticks_timeseries to confirm
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))

# Hide the figure's second axes (the colorbar added by draw_perf_summary)
fig.axes[1].set_visible(False)
fig.savefig('figs/heatmap-cori-transients.pdf', bbox_inches='tight', transparent=True)

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 2.5))

# x, y and z still hold the subselection made in the preceding cell.  Find the
# position of the benchmark of interest within y; the same position indexes
# the columns of z.
column_label = value2label(_test_platform, _benchmark_id)
row_index = numpy.where(y == column_label)[0][0]
ax.plot(x, z[:, row_index], 'o')

# Title with the label that was actually looked up.  row_index is a position
# within the subselected labels, so it must not be used to index ts.columns.
ax.set_title(column_label)
ax.set_ylim(0, 1)
ax.set_xlim(x_start, x_end)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))
ax.grid()

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 2.5))

# Add SMAs
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=49)

# Narrow the dataset to the single platform/benchmark pair under study before
# computing anything from it.  Doing this first makes the intercepts below the
# same on a fresh top-to-bottom run as on a re-run of this cell; otherwise the
# first run would compute them from whatever example_df held beforehand.
example_df = filtered_df.groupby(by=['_test_platform', '_benchmark_id']).get_group((_test_platform, _benchmark_id))

# Calculate boundaries
sma_intercepts = abcutils.features.sma_intercepts(example_df,
                                                  plot_metric,
                                                  short_window=SHORT_WINDOW,
                                                  long_window=LONG_WINDOW)

# NOTE(review): sma_long is not read by any later cell visible in this notebook
sma_long = abcutils.features.calculate_sma(example_df,
                                           '_datetime_start',
                                           plot_metric,
                                           window=LONG_WINDOW)

# plot region boundaries by hand
for row in sma_intercepts.itertuples():
    # divide by two because each bar is centered but we want
    # region boundaries to line up with the left edge
    # then rewind by one day because the beginning of each region is
    # the first data point of the _new_ region but we want to highlight
    # the end of the region
    boundary_x = abcutils.core.pd2epoch(row[1]) - 3*86400/2
    ax.plot([boundary_x, boundary_x],
            [0.0, 1.0],
            '--',
            color='black')

ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                ax=ax,
                                sma_intercepts=None,
                                raw_data_interval=86400)

# Thicken up the lines
for line in ax.get_lines():
    line.set_linewidth(4.0)

ax.set_axisbelow(True)

# Restyle the bars that are exactly one day (86400 s) wide
for bar in [rect for rect in ax.get_children() if isinstance(rect, matplotlib.patches.Rectangle)]:
    if bar.get_width() == 86400:
        bar.set_alpha(1.0)
        bar.set_facecolor('#7FADD0')

ax.set_xlim(x_start, x_end)

abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))
ax.get_figure().set_size_inches(8, 2.5)
ax.set_ylim(0.0, 1.0)
ax.get_figure().savefig('figs/heatmap-as-region_cori-ior-shared-read.pdf', bbox_inches='tight')

In [None]:
# Two stacked panels sharing one x axis: the platform's heatmap on top and the
# SMA/bar plot of a single benchmark underneath.  This cell reads x_start,
# x_end, _test_platform, _benchmark_id, sma_intercepts, example_df,
# SHORT_WINDOW and LONG_WINDOW, all of which are defined by earlier cells.
fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 5), sharex=True)

x, y, z = subselect_timeseries(ts, x_start, x_end, lambda x: x.startswith(_test_platform))

# Draw the actual heatmap
ax = axes[0]
draw_perf_summary(x, y, z, ax=ax)

# Window dressing
# Drop the "Cori, " prefix from the row labels
ax.set_yticklabels([x.get_text().replace("Cori, ", "") for x in ax.get_yticklabels()])
# Place the platform name inside the panel, near its lower-left corner, on a
# mostly opaque white background
ax.set_title("%s" % abcutils.config.CONFIG['platform_labels_public'].get(_test_platform, _test_platform),
             x=0.025,
             y=0.025,
             fontsize=matplotlib.rcParams.get('font.size'),
             ha='left',
             backgroundcolor="#FFFFFFDD")

##############################################################
# Bottom panel
ax = axes[1]

# plot region boundaries by hand
# (note: this loop rebinds the notebook-level name `x` to a scalar)
for row in sma_intercepts.itertuples():
    # divide by two because each bar is centered but we want
    # region boundaries to line up with the left edge
    # then rewind by one day because the beginning of each region is
    # the first data point of the _new_ region but we want to highlight
    # the end of the region
    x = abcutils.core.pd2epoch(row[1]) - 3*86400/2
    ax.plot([x, x],
            [0.0, 1.0],
            '--',
            color='black')

ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                           plot_metric=plot_metric,
                           short_window=SHORT_WINDOW,
                           long_window=LONG_WINDOW,
                           ax=ax,
                           sma_intercepts=None,
                           raw_data_interval=86400)

# Thicken up the lines
for line in ax.get_lines():
    line.set_linewidth(4.0)
    
ax.set_axisbelow(True)

# Restyle the bars that are exactly one day (86400 s) wide
for bar in [rect for rect in ax.get_children() if isinstance(rect, matplotlib.patches.Rectangle)]:
    if bar.get_width() == 86400:
        bar.set_alpha(1.0)
        bar.set_facecolor('#7FADD0')
#       bar.set_edgecolor('black')
        pass
    
# The x axis is shared, so these limits apply to both panels.
# NOTE(review): the upper limit stops one day short of x_end -- confirm why.
ax.set_xlim(x_start, x_end - 86400)
ax.set_ylim(0.0, 1.0)

##############################################################
fig.subplots_adjust(hspace=0.05, wspace=0.0)

# Place the benchmark name inside the bottom panel, styled like the top panel's title
ax.set_title("%s" % (abcutils.config.CONFIG['benchmark_labels_short'].get(_benchmark_id, _benchmark_id)),
             x=0.025,
             y=(0.025),
             fontsize=matplotlib.rcParams.get('font.size'),
             ha='left',
             backgroundcolor='#FFFFFFDD')

abcutils.plot.fix_xticks_timeseries(ax, format="%b %d", criteria=(lambda x: x.weekday() == 6))
fig.savefig('figs/heatmap-sma-cori.pdf', bbox_inches='tight', transparent=True)

In [None]:
# Boolean mask over sma_intercepts selecting the rows whose _datetime_start
# falls in [x_start, x_end).  x_start and x_end are seconds since the epoch;
# fromtimestamp() converts them to naive local-time datetimes for the comparison.
sma_date_filter = (sma_intercepts['_datetime_start'] >= datetime.datetime.fromtimestamp(x_start)) & \
              (sma_intercepts['_datetime_start'] < datetime.datetime.fromtimestamp(x_end))
# Display the selected rows
sma_intercepts[sma_date_filter]

In [None]:
# Metrics to show in the UMAMI diagram.  This hand-picked list is what is
# plotted; the commented-out entries are available but not shown.
UMAMI_ROWS = [
    'darshan_normalized_perf_by_max',
#   'contention_ops',
    'contention_bw',
    'contention_opens',
    'contention_stats',
#   'coverage_factor_stats',
#   'fs_ave_oss_cpu',
#   'fs_ave_mds_cpu',
#   'fshealth_ost_most_full_pct',
    'topology_job_max_radius',
]

# Start times of the SMA intercepts that fall inside the plotted window
region_starts = sma_intercepts.loc[sma_date_filter, '_datetime_start']

# The region of interest runs from two days before the second of those
# intercepts up to (but excluding) the third
x_region_start = region_starts.iloc[1] - pandas.Timedelta('2 days')
x_region_end = region_starts.iloc[2]
date_filter = ((example_df['_datetime_start'] >= x_region_start) &
               (example_df['_datetime_start'] < x_region_end))

axes = abcutils.plot.generate_umami(
        dataframe=example_df[date_filter],
        plot_metrics=UMAMI_ROWS,
        highlight_index=1)

for index, ax in enumerate(axes):
    if index % 2 == 1:
        # Hide every odd-numbered axes
        ax.set_visible(False)
    elif index == 6:
        # Zoom the y axis of the axes at position 6 in to a narrow band just
        # below 1.0.  NOTE(review): presumably this is the 'contention_stats'
        # row -- confirm against generate_umami's axes ordering.
        ax.set_ylim(0.9995, 1.0001)

axes[0].get_figure().savefig('figs/umami-cori-heatmapped-region.pdf', bbox_inches='tight', transparent=True)

In [None]:
# Show the contention_stats values of the rows inside the region of interest
example_df.loc[date_filter, 'contention_stats']

In [None]:
# Load the table of contributors if it is available.  The file is optional:
# when it cannot be read, all_contributors is left as None and the reason is
# reported instead of being silently discarded.
try:
    all_contributors = pandas.read_hdf('contributors.hdf5', 'contributors')
except IOError as error:
    all_contributors = None
    print("Could not load contributors.hdf5 (%s); skipping contributors" % error)

In [None]:
# Contributors identified for this platform/benchmark whose region starts
# inside the region of interest.  all_contributors is None when the optional
# contributors file could not be loaded, in which case there is nothing to show.
contributors_in_region = None
if all_contributors is not None:
    contrib_filter = ((all_contributors['region_start'] >= x_region_start) &
                      (all_contributors['region_start'] < x_region_end) &
                      (all_contributors['_benchmark_id'] == _benchmark_id) &
                      (all_contributors['_test_platform'] == _test_platform) &
                      (all_contributors['target_metric_matches']))
    contributors_in_region = all_contributors[contrib_filter]

contributors_in_region