In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import datetime
import time
import warnings

import matplotlib
import matplotlib.pyplot
import matplotlib.ticker
import numpy
import pandas
import scipy.stats

import abcutils

matplotlib.rcParams.update({'font.size': 16})

In [None]:
# Platform identifiers of interest.  Entries use the same 'name@system' pattern
# as the `_test_platform` values that later cells select on.
# NOTE(review): no cell visible in this notebook reads TEST_PLATFORMS -- confirm
# whether it is still needed.
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
#   'cscratch@cori-haswell',
    'mira-fs1@mira'
]

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# (summary CSV, system label) for every per-system file merged into `df`
summary_sources = [
    ('summaries/edison-summaries_2017-02-14-2018-02-28.csv', 'edison'),
    ('summaries/cori-summaries_2017-02-14-2018-02-28.csv', 'cori'),
    ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira'),
]

per_system_frames = [
    abcutils.load_and_synthesize_csv(csv_path, system=system_name)
    for csv_path, system_name in summary_sources
]
df = pandas.concat(per_system_frames, axis='rows')

# Each per-system frame carries its own row labels, so the concatenated frame
# would contain repeated index values; replace them with a fresh 0..N-1 index.
df.index = pandas.Index(data=numpy.arange(len(df)), dtype='int64')

## Calculate the signed pair correlation function

In [None]:
def pdf(dataset, loci, xmin, xmax, delta, norm=False, geom=True):
    """Calculate the pair distribution function for a dataset

    For every locus, each other point of ``dataset`` is assigned to a bin by
    its signed distance in x from that locus, rounded to the nearest multiple
    of ``delta``.  The y values that land in each bin (over all loci) are then
    averaged.  Points at the locus itself and points whose rounded distance is
    zero are never counted.

    Args:
        dataset (list of tuples): list of (x, y) tuples over which the PDF should be calculated
        loci (list): x values of interest around which the PDF should be
            calculated; each must equal the x of at least one dataset point
        xmin: minimum value of x in the resulting PDF
        xmax: maximum value of x in the resulting PDF
        delta: resolution of PDF function expressed in the same units of x
        norm: express PDF in terms of fraction performance relative to each
            locus (each y is divided by the y found at the locus; if several
            points share the locus' x, the first one in ``dataset`` is used)
        geom: use geometric mean instead of arithmetic mean
    Returns:
        (xbins, ybins, nbins) where
        xbins are the x values of the pair distribution function
        ybins are the y values of the pair distribution function; bins that
            received no samples are 1.0 when ``geom`` is true, 0.0 otherwise
        nbins are the number of y values that fell into each bin
    Raises:
        KeyError: if a locus does not match the x value of any dataset point
    """
    delta = float(delta)  # guarantees true division for integer inputs on Python 2
    width = xmax - xmin
    num_bins = int(width / delta)
    xbins = numpy.arange(0, num_bins, dtype='float64') * delta + xmin
    nbins = numpy.zeros(num_bins, dtype='int64')

    # The accumulator must start at the identity of the operation applied to
    # it: 1 for the running product (geometric mean), 0 for the running sum.
    if geom:
        ybins = numpy.ones(num_bins, dtype='float64')
    else:
        ybins = numpy.zeros(num_bins, dtype='float64')

    # number of delta-sized steps between zero distance and the first bin
    bin_offset = int(round(xmin / delta))

    # normalize the input through a DataFrame so any (x, y) table-like works
    dataset_df = pandas.DataFrame(dataset, columns=['x', 'y'])
    x_values = dataset_df['x'].tolist()
    y_values = dataset_df['y'].tolist()

    # x -> y lookup for the loci; the first occurrence wins for duplicated x
    y_at_x = {}
    for x, y in zip(x_values, y_values):
        y_at_x.setdefault(x, y)

    for locus in loci:
        # numpy scalar so that a zero-valued locus yields inf/nan rather than
        # raising ZeroDivisionError when normalizing
        locus_y = numpy.float64(y_at_x[locus])

        for x, y in zip(x_values, y_values):
            # drop self correlation terms
            if locus == x:
                continue

            dx = x - locus
            if abs(dx) > width:
                continue

            # how many bins away from zero - we round to the nearest bin to
            # avoid jobs from the first day finishing a few minutes under
            # 24 hours and thereby falling in the 0th day's bin
            lag = int(round(dx / delta))

            # we drop everything at zero distance (e.g., if multiple jobs ran
            # on the same day) because they cause artifacts in autocorrelation.
            # This is tested before the xmin offset is applied so that it is
            # the zero-distance bin that is dropped even when xmin != 0.
            if lag == 0:
                continue

            x_bin = lag - bin_offset
            if x_bin < 0 or x_bin >= num_bins:
                continue

            # normalize signal
            y_val = y / locus_y if norm else y

            if geom:
                ybins[x_bin] *= y_val # product of performance
            else:
                ybins[x_bin] += y_val # sum of performance
            nbins[x_bin] += 1 # sum of measurement count

    # Empty bins divide by zero here; that is expected, so silence the warning
    # and let nan_to_num clean up the result.
    with numpy.errstate(divide='ignore', invalid='ignore'):
        if geom:
            ybins = numpy.nan_to_num(numpy.power(ybins, 1.0 / nbins)) # geometric mean perf per bin
        else:
            ybins = numpy.nan_to_num(numpy.divide(ybins, nbins)) # arithmetic mean perf per bin

    return xbins, ybins, nbins

In [None]:
def pd2epoch(x):
    """Convert a pandas timestamp to seconds since the epoch.

    The timestamp is turned into a plain ``datetime`` and its time tuple is
    handed to ``time.mktime``, which interprets it as local time.
    """
    as_datetime = x.to_pydatetime()
    return time.mktime(as_datetime.timetuple())

In [None]:
# Summary statistics of every column, computed separately for each
# (platform, benchmark) combination; later cells read percentiles from it.
group_by = ['_test_platform', '_benchmark_id']
runs_by_platform_and_benchmark = df.groupby(by=group_by)
metric_distributions = runs_by_platform_and_benchmark.describe()

In [None]:
# Which (platform, benchmark) pair, metric and time window to analyze
test_platform = 'scratch2@edison'
#benchmark_id = 'dbscan_read_shared_read'

#test_platform = 'mira-fs1@mira'
benchmark_id = 'ior_fpp_write'
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)
group_by = ['_test_platform', '_benchmark_id']

# Rows for the chosen pair, restricted to runs that moved more than 1 GiB
# (per darshan_total_gibs_posix) and started within [date_start, date_end)
filtered_df = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
filtered_df = filtered_df[
    (filtered_df['darshan_total_gibs_posix'] > 1.0)
    & (filtered_df['_datetime_start'] >= date_start)
    & (filtered_df['_datetime_start'] < date_end)
]

# Single-argument print(...) produces the same output on Python 2 and Python 3
print("test_platform = %s" % test_platform)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % date_start.isoformat())
print("date_end = %s" % date_end.isoformat())

In [None]:
# use bottom 25% as loci: runs whose metric is below the 25th percentile of
# their (platform, benchmark) group as recorded in metric_distributions
threshold = metric_distributions.loc[test_platform, benchmark_id][plot_metric]['25%']
loci_df = filtered_df[filtered_df[plot_metric] < threshold]
loci = [pd2epoch(x) for x in loci_df['_datetime_start']]

# use top 25% (above the 75th percentile) as loci
# threshold = metric_distributions.loc[test_platform, benchmark_id][plot_metric]['75%']
# loci_df = filtered_df[filtered_df[plot_metric] > threshold]
# loci = [pd2epoch(x) for x in loci_df['_datetime_start']]

# use all data as loci
# loci = [pd2epoch(x) for x in filtered_df['_datetime_start']]

# Single-argument print(...) produces the same output on Python 2 and Python 3
print("Using %d loci" % len(loci))

In [None]:
# (epoch seconds, metric value) for every selected run, in chronological order
start_and_metric = filtered_df[['_datetime_start', plot_metric]]
dataset = sorted(
    ((pd2epoch(row[1]), row[2]) for row in start_and_metric.itertuples()),
    key=lambda pair: pair[0])

# bin width: one day, in seconds
delta = datetime.timedelta(days=1).total_seconds()

# pair distribution over a two-week window after each locus, expressed
# relative to the performance at the locus
xv, yv, ct = pdf(
    dataset=dataset,
    loci=loci,
    xmin=-0 * delta,
    xmax=+14 * delta,
    delta=delta,
    norm=True,
)

In [None]:
seconds_per_day = datetime.timedelta(days=1).total_seconds()

fig, axes = matplotlib.pyplot.subplots(nrows=3, ncols=1)
fig.set_size_inches(12, 8)
fig.suptitle("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))

x_raw = filtered_df['_datetime_start'].apply(pd2epoch)
y_raw = filtered_df[plot_metric]

### plot the raw data
ax = axes[0]
ax.grid()
ax.plot(x_raw,
        y_raw,
        linestyle='-',
        marker='.')
ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
# Format ticks through a formatter rather than set_xticklabels() so the date
# labels always correspond to wherever the tick locator places the ticks.
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda epoch, _pos: datetime.datetime.fromtimestamp(epoch).strftime("%b %d")))

### plot the autocorrelation function
ax = axes[1]
ax.plot(xv / seconds_per_day, yv, linestyle='-', marker='o')
ax.grid()
ax.set_ylabel("Relative\nPerformance")
ax.set_xlabel("Days")
# ax.set_ylim(0.5, 1.5)

### plot the number of samples in each bin
ax = axes[2]
ax.plot(xv / seconds_per_day, ct)
ax.set_ylabel("# Samples")
ax.set_xlabel("Days")
ax.grid()

In [None]:
# Global plot parameters
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)
group_by = ['_test_platform', '_benchmark_id']
num_days = 14
delta = datetime.timedelta(days=1).total_seconds()  # bin width: one day

# Determine which plots to generate
test_platforms = sorted(df['_test_platform'].unique())
benchmark_ids = sorted(df['_benchmark_id'].unique())

#test_platforms = ['cscratch@cori-knl']
#benchmark_ids = ['dbscan_read_shared_read', 'vpicio_uni_shared_write']

# The grouping and its summary statistics do not depend on the loop
# variables, so compute them once instead of once per subplot.
grouped = df.groupby(by=group_by)
metric_distributions = grouped.describe()

# Generate plots: one figure per platform, one row per benchmark
for test_platform in test_platforms:
    # squeeze=False always returns a 2-D array of axes, so indexing also
    # works when there is only a single benchmark
    fig, axes = matplotlib.pyplot.subplots(nrows=len(benchmark_ids),
                                           ncols=1,
                                           sharex=True,
#                                          sharey=True,
                                           squeeze=False)
    axes = axes[:, 0]
    fig.set_size_inches(20, 4 * len(benchmark_ids))
    axes[-1].set_xlabel("Days")

    for ax, benchmark_id in zip(axes, benchmark_ids):
        ax.set_xlim(0, num_days - 1)
        ax.set_ylim(1.0, 2.5)
        ax.set_xticks(range(num_days))
        ax.grid()
        ax.set_ylabel("Relative Performance")
        ax.set_title("%s %s" % (test_platform, benchmark_id),
                     **{'x': 0.01, 'y': 0.02, 'horizontalalignment': 'left'})

        # Not every benchmark ran on every platform; leave those panels empty
        try:
            filtered_df = grouped.get_group((test_platform, benchmark_id))
        except KeyError:
            continue
        in_window = ((filtered_df['_datetime_start'] >= date_start)
                     & (filtered_df['_datetime_start'] < date_end))
        filtered_df = filtered_df[in_window]

        # (epoch seconds, metric value) pairs in chronological order
        dataset = sorted(
            ((pd2epoch(row[1]), row[2])
             for row in filtered_df[['_datetime_start', plot_metric]].itertuples()),
            key=lambda pair: pair[0])

        # loci: runs below the group's 25th percentile of the metric
#       loci = [x[0] for x in dataset]
        threshold = metric_distributions.loc[test_platform, benchmark_id][plot_metric]['25%']
        loci_df = filtered_df[filtered_df[plot_metric] < threshold]
        loci = [pd2epoch(x) for x in loci_df['_datetime_start']]

        xv, yv, ct = pdf(dataset=dataset,
                         loci=loci,
                         xmin=0 * delta,
                         xmax=num_days * delta,
                         delta=delta,
                         norm=True)

        ax.plot(xv / delta, yv, linestyle='-', marker='o')