In [None]:
# Render matplotlib figures inline in the notebook's cell output.
%matplotlib inline
# Load IPython's autoreload extension so the %autoreload magic used in the next cell is available.
%load_ext autoreload

In [None]:
# Autoreload mode 2: re-import modules before running each cell (per the IPython autoreload docs),
# so edits to imported modules such as abcutils are picked up without restarting the kernel.
%autoreload 2

In [None]:
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import time
import datetime
import warnings
import pandas
import numpy
import scipy.stats
import abcutils

In [None]:
# Platform identifiers, each written as "<name>@<name>" (e.g. 'scratch1@edison').
# The '_test_platform' value selected later in this notebook ('mira-fs1@mira') uses the same form.
# NOTE(review): this list is not referenced anywhere in the visible part of the notebook --
# confirm whether a later cell uses it.
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    'cscratch@cori-haswell',
    'mira-fs1@mira'
]

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# Load each system's summary CSV through abcutils (tagging it with its system name) and
# stack the resulting frames row-wise, in the order edison, cori, mira.
df = pandas.concat(
    (abcutils.load_and_synthesize_csv(csv_path, system=system_name)
     for csv_path, system_name in (
         ('summaries/edison-summaries_2017-02-14-2018-02-28.csv', 'edison'),
         ('summaries/cori-summaries_2017-02-14-2018-02-28.csv', 'cori'),
         ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira'),
     )),
    axis='rows')

# Each per-file frame carries its own index, so replace it with 0..len(df)-1 to guarantee
# that no index value is repeated in the merged dataframe.
df.index = pandas.Index(numpy.arange(len(df)), dtype='int64')

## Calculate the signed pair correlation function

In [None]:
def pd2epoch(x):
    """Convert a pandas Timestamp into seconds since the epoch.

    The timestamp is converted to a standard-library datetime, broken down
    into a time tuple, and handed to time.mktime, which interprets the tuple
    as local time. Time tuples carry whole seconds only, so any sub-second
    component of `x` is discarded.

    Args:
        x: object exposing `to_pydatetime()` (e.g. pandas.Timestamp)
    Returns:
        float: seconds since the epoch
    """
    as_datetime = x.to_pydatetime()
    return time.mktime(as_datetime.timetuple())

In [None]:
# Select one (platform, benchmark) pair to analyze; alternative selection kept for convenience:
#test_platform = 'scratch2@edison'
#benchmark_id = 'dbscan_read_shared_read'
test_platform = 'mira-fs1@mira'
benchmark_id = 'hacc_io_write_fpp_write'
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)
group_by = ['_test_platform', '_benchmark_id']

# Rows for the chosen platform/benchmark...
filtered_df = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
# ...keeping only rows whose 'darshan_total_gibs_posix' exceeds 1.0...
filtered_df = filtered_df[filtered_df['darshan_total_gibs_posix'] > 1.0]
# ...and whose start time falls in the half-open window [date_start, date_end)
filtered_df = filtered_df[filtered_df['_datetime_start'] < date_end]
filtered_df = filtered_df[filtered_df['_datetime_start'] >= date_start]

# Echo the selection. The metric label is looked up by plot_metric itself (falling back to
# the raw column name) so that the printed label always describes the metric being analyzed.
# Single-argument print(...) produces identical output under Python 2 and Python 3.
print("test_platform = %s" % test_platform)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % date_start.isoformat())
print("date_end = %s" % date_end.isoformat())

In [None]:
# `loci` refers to the filtered frame itself, while `full_dataframe` is an independent
# copy of it (so changes made to one are not reflected in the other).
loci = filtered_df
full_dataframe = filtered_df.copy()

In [None]:
def pdf_old(dataset, loci, xmin, xmax, delta, rcutoff=None, norm=False):
    """Arithmetic-mean y value as a function of signed distance from each locus

    For every (x, y) observation and every locus, the signed distance
    dx = x - locus is converted to a bin index by truncating dx / delta toward
    zero, and y is accumulated into that bin. Because of the truncation, bin 0
    collects every pair with |dx| < delta, positive bins collect dx >= delta
    and negative bins collect dx <= -delta.

    Args:
        dataset (list of tuples): list of (x, y) tuples
        loci (list): x values around which distances are measured
        xmin: lower edge used to derive the default half-width (xmax - xmin)
        xmax: upper edge used to derive the default half-width (xmax - xmin)
        delta: bin width, in the same units as x
        rcutoff: if truthy, used as the half-width instead of xmax - xmin;
            pairs farther apart than this are skipped silently
        norm: if True, divide the binned means by their overall mean
    Returns:
        (xbins, ybins, binct) where
        xbins are the signed bin offsets, ordered from most negative to most positive
        ybins are the mean y per bin (0 for bins that received no pairs)
        binct are the number of pairs that fell into each bin
    """
    width = rcutoff if rcutoff else xmax - xmin
    num_bins = int(width / delta)
    tot_bins = num_bins * 2 - 1

    # Bins are stored as [0 .. N-1, -(N-1) .. -1] so that a negative bin index can be used
    # directly as a (wrap-around) array index while accumulating.
    xbins = numpy.hstack((numpy.arange(0, num_bins),
                          numpy.arange(- num_bins + 1, 0))) * delta
    ybins = numpy.zeros(tot_bins, dtype='float64')
    binct = numpy.zeros(tot_bins, dtype='int64')

    # convert to dataframe so rows can be walked as (index, x, y) tuples
    dataset_df = pandas.DataFrame(dataset, columns=['x', 'y'])
    for row in dataset_df.itertuples():
        x, y = row[1], row[2]
        for locus in loci:
            # drop self correlation terms
            if locus == x:
                continue

            dx = x - locus

            if rcutoff and abs(dx) > rcutoff:
                continue

            if abs(dx) > width:
                # both values must be passed as one tuple to the format operator
                warnings.warn("%s, %s: bin out of range" % (x, y))
                continue

            x_bin = int(dx / delta)
            # A pair whose distance truncates to +/-num_bins has no bin of its own; indexing
            # with it would wrap into a bin on the opposite side (or run off the array).
            if abs(x_bin) >= num_bins:
                continue

            ybins[x_bin] += y # sum of performance
            binct[x_bin] += 1 # sum of measurement count

    # convert results from [0 to N, -N to -1] to [-N to +N]
    xbins = numpy.hstack((xbins[num_bins:], xbins[0:num_bins]))
    ybins = numpy.hstack((ybins[num_bins:], ybins[0:num_bins]))
    binct = numpy.hstack((binct[num_bins:], binct[0:num_bins]))

    # mean perf per bin; empty bins are 0/0, which nan_to_num maps to 0
    with numpy.errstate(divide='ignore', invalid='ignore'):
        ybins = numpy.nan_to_num(numpy.divide(ybins, binct))
    if norm:    # normalize signal
        ybins /= numpy.nanmean(ybins)

    return xbins, ybins, binct

In [None]:
def pdf(dataset, loci, xmin, xmax, delta, norm=False):
    """Calculate the pair distribution function for a dataset

    For every locus and every observation (x, y) other than the locus itself,
    the signed distance dx = x - locus is assigned to the bin
    floor((dx - xmin) / delta), and the geometric mean of the y values that
    land in each bin is returned. Bin k therefore covers
    xmin + k*delta <= dx < xmin + (k+1)*delta, matching the value reported in
    xbins[k]; pairs whose dx falls outside the binned range are ignored.

    Args:
        dataset (list of tuples): list of (x, y) tuples over which the PDF should be calculated
        loci (list): x values of interest around which the PDF should be calculated
        xmin: minimum value of x in the resulting PDF
        xmax: maximum value of x in the resulting PDF
        delta: resolution of PDF function expressed in the same units of x
        norm: express PDF in terms of fraction performance relative to each locus;
            when True every locus must be an x value present in dataset
    Returns:
        (xbins, ybins, nbins) where
        xbins are the x values (lower edges) of the pair distribution function bins
        ybins are the y values of the pair distribution function; bins that
            received no pairs are reported as 1.0
        nbins are the number of y values that fell into each bin
    Raises:
        KeyError: if norm is True and a locus is not an x value in dataset
    """
    num_bins = int((xmax - xmin) / delta)
    xbins = numpy.arange(0, num_bins, dtype='float64') * delta + xmin
    ybins = numpy.ones(num_bins, dtype='float64')    # running product per bin
    nbins = numpy.zeros(num_bins, dtype='int64')

    # convert to dataframe to normalize the input, then work on plain float arrays
    dataset_df = pandas.DataFrame(dataset, columns=['x', 'y'])
    x_vals = dataset_df['x'].values.astype('float64')
    y_vals = dataset_df['y'].values.astype('float64')

    # y value observed at each x; if an x is repeated the first observation is used
    y_at_x = {}
    if norm:
        for x, y in zip(x_vals, y_vals):
            y_at_x.setdefault(x, y)

    for locus in loci:
        # floor (not truncation toward zero) keeps lags just below zero out of the first
        # non-negative bin and keeps every bin aligned with its xbins label
        bin_idx = numpy.floor(((x_vals - locus) - xmin) / delta).astype('int64')

        # drop self correlation terms and anything outside the binned range
        keep = (x_vals != locus) & (bin_idx >= 0) & (bin_idx < num_bins)
        if not keep.any():
            continue

        contrib = y_vals[keep]
        if norm:    # normalize signal relative to the value at the locus
            contrib = contrib / y_at_x[locus]

        # ufunc.at applies the update once per pair, even when a bin index repeats
        numpy.multiply.at(ybins, bin_idx[keep], contrib) # product of performance
        numpy.add.at(nbins, bin_idx[keep], 1)            # sum of measurement count

    # geometric mean perf per bin: n-th root of the product of n values. Empty bins give
    # 1.0 ** inf == 1.0; the divide-by-zero this involves is expected, so it is silenced.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        ybins = numpy.nan_to_num(numpy.power(ybins, 1.0 / nbins.astype('float64')))

    return xbins, ybins, nbins

In [None]:
# Bin width: one day, expressed in seconds
delta = datetime.timedelta(days=1).total_seconds()

# (epoch seconds, metric value) pairs for the filtered runs, ordered by time
dataset = sorted(
    ((pd2epoch(row[1]), row[2])
     for row in filtered_df[['_datetime_start', plot_metric]].itertuples()),
    key=lambda pair: pair[0])

# Every observation acts as a locus
loci = [pair[0] for pair in dataset]

# Normalized pair distribution over lags from 0 up to 300 bin widths
xv, yv, ct = pdf(dataset=dataset,
                 loci=loci,
                 xmin=0 * delta,
                 xmax=300 * delta,
                 delta=delta,
                 norm=True)

In [None]:
fig, axes = matplotlib.pyplot.subplots(nrows=3, ncols=1)
fig.set_size_inches(12,8)
fig.suptitle("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))

# Raw measurements: start time (epoch seconds) against the plotted metric
x_raw = filtered_df['_datetime_start'].apply(lambda stamp: time.mktime(stamp.timetuple()))
y_raw = filtered_df[plot_metric]

# Top panel: the raw time series
ax = axes[0]
ax.plot(x_raw,
        y_raw,
        linestyle='-',
        marker='.')
ax.grid()
ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
# Relabel the ticks chosen for the plotted data as "Mon DD" dates (must follow the plot call)
ax.set_xticklabels([datetime.datetime.fromtimestamp(tick).strftime("%b %d") for tick in ax.get_xticks()])

# Middle panel: pair distribution function, with lag converted from seconds to days
ax = axes[1]
ax.grid()
ax.plot(xv / 86400, yv, linestyle='-')
ax.set_xlabel("Days")
ax.set_ylabel("Relative\nPerformance")
ax.set_ylim(0.5, 1.5)

# Bottom panel: number of pairs contributing to each lag bin
ax = axes[2]
ax.grid()
ax.plot(xv / 86400, ct)
ax.set_ylabel("# Samples")

In [None]:
# Global plot parameters
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)
group_by = ['_test_platform', '_benchmark_id']

# Determine which plots to generate
test_platforms = sorted(df['_test_platform'].unique())
benchmark_ids = sorted(df['_benchmark_id'].unique())
# benchmark_ids = ['ior_shared_write']

# Loop invariants: one-day bin width (in seconds) and the (platform, benchmark) grouping
delta = datetime.timedelta(days=1).total_seconds()
grouped = df.groupby(by=group_by)

# Generate plots: one figure per platform, one row per benchmark
for test_platform in test_platforms:
    fig, axes = matplotlib.pyplot.subplots(nrows=len(benchmark_ids),
                                           ncols=1,
                                           sharex=True,
#                                          sharey=True,
                                          )
    # subplots() hands back a bare Axes (not an array) when only one row is requested,
    # e.g. when benchmark_ids is narrowed to a single benchmark; make it indexable either way
    axes = numpy.atleast_1d(axes)
    fig.set_size_inches(20, 4 * len(benchmark_ids))
    axes[-1].set_xlabel("Days")

    for index, benchmark_id in enumerate(benchmark_ids):
        ax = axes[index]
        ax.grid()
        ax.set_ylabel("Relative Performance")
        ax.set_title("%s %s" % (test_platform, benchmark_id),
                    **{'x': 0.01, 'y': 0.02, 'horizontalalignment': 'left'})
        try:
            filtered_df = grouped.get_group((test_platform, benchmark_id))
        except KeyError:
            # this benchmark has no rows on this platform; leave its (titled) axes empty
            continue
        # restrict to the half-open window [date_start, date_end)
        filtered_df = filtered_df[filtered_df['_datetime_start'] < date_end]
        filtered_df = filtered_df[filtered_df['_datetime_start'] >= date_start]

        # (epoch seconds, metric value) pairs ordered by time; every observation is a locus
        dataset = sorted([(pd2epoch(x[1]), x[2]) for x in filtered_df[['_datetime_start', plot_metric]].itertuples()], key=lambda x: x[0])

        loci = [x[0] for x in dataset]

        xv, yv, ct = pdf(dataset=dataset,
                         loci=loci,
                         xmin=-300*delta,
                         xmax=+300*delta,
                         delta=delta,
                         norm=True)

        # lag converted from seconds to days
        ax.plot(xv / 86400, yv, linestyle='-')