# Validate Mira CSV

In [None]:
%matplotlib inline

In [None]:
import os
import time
import datetime
import pandas
import numpy
import scipy.stats
import tokio
import abcutils
import seaborn
import matplotlib
matplotlib.rcParams.update({'font.size': 16})

## Load and Synthesize Data from CSV

In [None]:
# Platform identifiers to include: rows are kept only when their
# `_test_platform` value appears in this list.
TEST_PLATFORMS = [
    'mira-fs1@mira'
]
# Benchmark identifiers to include: rows are kept only when their
# `_benchmark_id` value appears in this list.
BENCHMARK_IDS = [
    'ior_shared_write',
    'ior_fpp_write',
    'hacc_io_write_fpp_write',
    'vpicio_uni_shared_write',
    'ior_shared_read',
    'ior_fpp_read',
    'hacc_io_read_fpp_read',
    'dbscan_read_shared_read'
]

# Name of the per-job column whose value is inserted into the heatmap; also
# used as the key (and fallback text) for the colorbar label.
plot_metric = 'darshan_normalized_perf_by_max'

# Column names that together identify a (platform, benchmark) pair.
# NOTE(review): not referenced in the visible part of this notebook -- confirm it is still needed.
group_by = ['_test_platform', '_benchmark_id']

In [None]:
# Define regions of the heatmap to zoom in on, highlight in
# the overview, and draw accompanying graphics.
# Each entry is a dict with the keys x_start, x_end and y_filter, which are
# passed to subselect_timeseries() when the zoomed figures are drawn.
datetime_start = datetime.datetime(2017, 9, 12)
datetime_end = datetime.datetime(2017, 10, 2)
zoom_areas_new = [
    {
        # time.mktime() treats the naive datetime as local time and returns
        # seconds since the epoch.
        # NOTE(review): assumes the timeseries timestamps use the same epoch-second
        # convention -- confirm.
        'x_start': time.mktime(datetime_start.timetuple()),
        'x_end': time.mktime(datetime_end.timetuple()),
        # Keep only the columns whose label begins with this platform name
        'y_filter': lambda x: x.startswith('mira-fs1@mira'),
    },
]

## Helper Functions

In [None]:
def value2label(test_platform, benchmark_id):
    """Build the "<platform>, <benchmark>" label that names a timeseries column."""
    parts = (str(test_platform), str(benchmark_id))
    return ", ".join(parts)

In [None]:
def generate_x_y_filters(timeseries, x_start, x_end, y_filter):
    """Build the boolean masks used to subselect a timeseries.

    Returns a pair: a mask over `timeseries.timestamps` that is true inside the
    half-open window [x_start, x_end), and a mask over `timeseries.columns`
    holding the result of calling `y_filter` on each column name.
    """
    stamps = timeseries.timestamps
    after_start = stamps >= x_start
    before_end = stamps < x_end

    column_names = numpy.array(timeseries.columns)
    column_mask = numpy.array([y_filter(name) for name in column_names])

    return after_start & before_end, column_mask
    
def subselect_timeseries(timeseries, x_start, x_end, y_filter):
    """Convert a timeseries and y axis filtering criteria into a subselected dataset

    Args:
        timeseries (tokio.timeseries.TimeSeries): Dataset to convert to x, y, z values
        x_start: Start of the window, inclusive.  Compared directly against
            `timeseries.timestamps`, so it must be expressed in the same units
        x_end: End of the window, exclusive.  Same units as `x_start`
        y_filter (callable): Boolean function to which each value of
            `timeseries.columns` is passed to determine if that column of
            `timeseries` should be included
    Returns:
        tuple: x, y, z values, all of type numpy.ndarray, corresponding to x labels (1d vector),
        y labels (1d vector), and the data to plot (2d matrix).  Can be passed directly to
        draw_perf_summary()
    """
    x_filter, y_filter = generate_x_y_filters(timeseries, x_start, x_end, y_filter)

    # Outer AND of the row mask and the column mask.  It is built from the masks
    # alone so the result depends only on the `timeseries` argument and never on
    # a notebook-level variable.
    xy_filter = x_filter[:, None] & y_filter[None, :]

    # Set the x and y values based on the boolean arrays
    x = timeseries.timestamps[x_filter]
    y = numpy.array(timeseries.columns)[y_filter]
    # Slicing a matrix with a boolean matrix returns a flat vector; reshape it
    # back to (selected rows, selected columns)
    z = timeseries.dataset[xy_filter].reshape((x_filter.sum(), y_filter.sum()))

    return x, y, z

In [None]:
def draw_perf_summary(x, y, z, ax=None):
    """Draw a heatmap of `z` with `x` along the horizontal axis and one row per entry of `y`.

    Args:
        x (numpy.ndarray): Horizontal coordinates handed to pcolormesh, one per
            row of `z`
        y (iterable of str): Label for each column of `z`; benchmark and platform
            names are shortened before being used as y tick labels
        z (numpy.ndarray): 2d matrix of values to plot.  Entries equal to 0.0 are
            masked and drawn in the colormap's "bad" color
        ax (matplotlib.axes.Axes or None): Axes to draw on.  If None, a new
            figure is created with a heatmap axes and a colorbar axes
    Returns:
        matplotlib.axes.Axes: The first axes of the figure that was drawn on
    """
    import copy

    if ax is None:
        fig = matplotlib.pyplot.figure()
        fig.set_size_inches(16, (2.5 * len(TEST_PLATFORMS)))
        fig.subplots_adjust(hspace=0.0, wspace=0.01)

        gridspec = matplotlib.gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[16, 1])

        ax = fig.add_subplot(gridspec[0])
    else:
        # Caller supplied the axes: no colorbar axes is created below
        gridspec = None
        fig = ax.get_figure()

    # Work on a private copy so set_bad() does not alter the shared gist_heat
    # colormap object used by every other plot in the session
    cmap = copy.copy(matplotlib.cm.gist_heat)
    cmap.set_bad(color='#666666', alpha=0.85)
    # NOTE(review): x supplies one coordinate per row of z while the vertical
    # coordinates are len+1 cell edges; newer Matplotlib releases may reject
    # this mixed layout -- confirm against the installed version.
    plotface = ax.pcolormesh(x,
                             numpy.arange(z.shape[1]+1),
                             numpy.ma.masked_where(z.T == 0.0, z.T),
                             cmap=cmap,
                             linewidth=0,
                             rasterized=True)

    # Set y ticks: one label per column of z, centered on its row of cells
    column_labels = []
    for label in y:
        for key, val in abcutils.CONFIG['benchmark_labels_short'].items():
            label = label.replace(key, val)
        label = label.replace('scratch1@edison', 'Edison')
        label = label.replace('cscratch@cori-knl', 'Cori')
        label = label.replace('mira-fs1@mira', 'Mira')
        column_labels.append(label)
    ax.set_yticks(numpy.arange(len(column_labels)) + 0.5)
    ax.set_yticklabels(column_labels, ha='right')

    abcutils.plot.fix_xticks_timeseries(ax)

    if gridspec is not None:
        # Set colorbar in the narrow second grid cell
        cax = fig.add_subplot(gridspec[1])
        matplotlib.pyplot.colorbar(plotface, cax=cax)
        cax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))

    # NOTE(review): this is the figure's first axes, which is not necessarily
    # the `ax` passed in by the caller -- confirm that is intended.
    return fig.axes[0]

In [None]:
# One label per (benchmark, platform) pair, with benchmarks varying slowest
columns = [
    value2label(test_platform, benchmark_id)
    for benchmark_id in BENCHMARK_IDS
    for test_platform in TEST_PLATFORMS
]

## Draw new dataset figure

In [None]:
# Load the new Mira summary data
filtered_df = abcutils.sc18paper.load_dataset(
    input_datasets={'mira': 'summaries/mira-summaries_2017-02-14_2018-02-15.csv.gz'},
    cache_file='summaries/mira-summaries_2017-02-14_2018-02-15.hdf5')

# Empty timeseries with one row per 86400-second step and one column per label
ts = tokio.timeseries.TimeSeries(
    dataset_name='blah',
    start=abcutils.sc18paper.DATE_START,
    end=abcutils.sc18paper.DATE_END,
    timestep=86400,
    num_columns=len(columns),
    column_names=sorted(columns),
    sort_hex=False)
example_df = filtered_df.copy()

# Keep only the benchmarks and platforms of interest
test_filter = (example_df['_benchmark_id'].isin(BENCHMARK_IDS)
               & example_df['_test_platform'].isin(TEST_PLATFORMS))

# itertuples() replaces column names that start with an underscore with
# positional names, so strip the underscore from the columns read by attribute
rename_filter = {
    '_datetime_start': 'datetime_start',
    '_benchmark_id': 'benchmark_id',
    '_test_platform': 'test_platform',
}
for row in example_df[test_filter].rename(columns=rename_filter).itertuples():
    ts.insert_element(
        row.datetime_start.to_pydatetime(),
        value2label(row.test_platform, row.benchmark_id),
        getattr(row, plot_metric))

In [None]:
# New style based on datetimes: draw one zoomed heatmap per configured region
for zoom in zoom_areas_new:
    x, y, z = subselect_timeseries(ts,
                                   x_start=zoom['x_start'],
                                   x_end=zoom['x_end'],
                                   y_filter=zoom['y_filter'])
    ax = draw_perf_summary(x, y, z)
    fig = ax.get_figure()
    # 4 inches wide; height scaled so that 8 rows occupy 2.5 inches
    fig.set_size_inches(4, 2.5 / 8 * len(y))
    # The criteria callable matches Sundays (weekday() == 6)
    abcutils.plot.fix_xticks_timeseries(ax,
                                        format="%b %d",
                                        criteria=(lambda when: when.weekday() == 6))

    # Hide the colorbar axes that draw_perf_summary() added as the second axes
    fig.axes[1].set_visible(False)

## Generate the SC18 figure

In [None]:
# Load the summary data used for the SC18 figure
sc18_df = abcutils.sc18paper.load_dataset(
    input_datasets={'mira': 'summaries/alcf-tokio-results-2_14_17-2_15_18.csv.gz'},
    cache_file='summaries/alcf-tokio-results-2_14_17-2_15_18.hdf5')

# Empty timeseries with one row per 86400-second step and one column per label
ts = tokio.timeseries.TimeSeries(
    dataset_name='blah',
    start=abcutils.sc18paper.DATE_START,
    end=abcutils.sc18paper.DATE_END,
    timestep=86400,
    num_columns=len(columns),
    column_names=sorted(columns),
    sort_hex=False)
example_df = sc18_df.copy()

# Keep only the benchmarks and platforms of interest
test_filter = (example_df['_benchmark_id'].isin(BENCHMARK_IDS)
               & example_df['_test_platform'].isin(TEST_PLATFORMS))

# itertuples() replaces column names that start with an underscore with
# positional names, so strip the underscore from the columns read by attribute
rename_filter = {
    '_datetime_start': 'datetime_start',
    '_benchmark_id': 'benchmark_id',
    '_test_platform': 'test_platform',
}
for row in example_df[test_filter].rename(columns=rename_filter).itertuples():
    ts.insert_element(
        row.datetime_start.to_pydatetime(),
        value2label(row.test_platform, row.benchmark_id),
        getattr(row, plot_metric))

In [None]:
# New style based on datetimes: draw one zoomed heatmap per configured region
for zoom in zoom_areas_new:
    x, y, z = subselect_timeseries(ts,
                                   x_start=zoom['x_start'],
                                   x_end=zoom['x_end'],
                                   y_filter=zoom['y_filter'])
    ax = draw_perf_summary(x, y, z)
    fig = ax.get_figure()
    # 4 inches wide; height scaled so that 8 rows occupy 2.5 inches
    fig.set_size_inches(4, 2.5 / 8 * len(y))
    # The criteria callable matches Sundays (weekday() == 6)
    abcutils.plot.fix_xticks_timeseries(ax,
                                        format="%b %d",
                                        criteria=(lambda when: when.weekday() == 6))

    # Hide the colorbar axes that draw_perf_summary() added as the second axes
    fig.axes[1].set_visible(False)

## Inspect values

In [None]:
# List the start times of one benchmark on one platform inside the zoom window,
# including two days of lead-in before datetime_start
region_filter = (filtered_df['_test_platform'] == 'mira-fs1@mira')
# Combine with &= so the platform condition above is kept, not overwritten
region_filter &= (filtered_df['_benchmark_id'] == 'vpicio_uni_shared_write')
region_filter &= (filtered_df['_datetime_start'] >= datetime_start - datetime.timedelta(days=2))
region_filter &= (filtered_df['_datetime_start'] < datetime_end)
filtered_df.loc[region_filter, '_datetime_start']

In [None]:
# List the start times of one benchmark on one platform inside the zoom window,
# including two days of lead-in before datetime_start
region_filter = (sc18_df['_test_platform'] == 'mira-fs1@mira')
# Combine with &= so the platform condition above is kept, not overwritten
region_filter &= (sc18_df['_benchmark_id'] == 'vpicio_uni_shared_write')
region_filter &= (sc18_df['_datetime_start'] >= datetime_start - datetime.timedelta(days=2))
region_filter &= (sc18_df['_datetime_start'] < datetime_end)
sc18_df.loc[region_filter, '_datetime_start']