In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import time
import datetime
import warnings
import pandas
import numpy
import scipy.stats
import abcutils

In [None]:
# Test platforms, each written as "<file system>@<system>".
# The commented-out entry is deliberately excluded from the list.
TEST_PLATFORMS = [
    'scratch1@edison', 'scratch2@edison', 'scratch3@edison',
    'cscratch@cori-knl',
    # 'cscratch@cori-haswell',
    'mira-fs1@mira',
]

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# Load the dataset that the cells below group and plot.  All of the loading
# logic lives in abcutils.sc18paper.load_dataset; this notebook only keeps the
# returned object (used below via .groupby() and column indexing).
filtered_df = abcutils.sc18paper.load_dataset()

## Calculate the signed pair correlation function

Configure basic input parameters

In [None]:
# Columns used as the groupby key below; groups are looked up with
# (test_platform, benchmark_id) tuples in this order.
group_by = ['_test_platform', '_benchmark_id']

In [None]:
# Test conditions for the single-series example in the following cells.
# Leave exactly one alternative uncommented in each group.
#test_platform = 'scratch2@edison'
test_platform = 'cscratch@cori-knl'
#test_platform = 'mira-fs1@mira'

benchmark_id = 'ior_fpp_write'
# benchmark_id = 'dbscan_read_shared_read'
# benchmark_id = 'ior_shared_write'

# Column of filtered_df that is averaged, correlated, and plotted below
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
# Re-stated here so this cell can be re-run on its own
group_by = ['_test_platform', '_benchmark_id']
# One day in seconds; passed as `delta` to the autocorrelation call and used as
# the bar width in the plot below
delta = datetime.timedelta(days=1).total_seconds()

# Rows for the selected (platform, benchmark) pair.  get_group raises KeyError
# when the dataset holds no rows for that pair.
example_df = filtered_df.groupby(by=group_by).get_group((test_platform, benchmark_id))

# Each line is formatted into a single string first so that the parenthesized
# print works, and emits the same text, under both Python 2 and Python 3.
print("test_platform = %s" % test_platform)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % abcutils.sc18paper.DATE_START.isoformat())
print("date_end = %s" % abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Short-window simple moving average of plot_metric, plus the x/y arrays used
# to draw it (x converted with the same pd2epoch helper used for the loci).
sma_short = abcutils.features.calculate_sma(
    example_df,
    '_datetime_start',
    plot_metric,
    abcutils.features.SHORT_WINDOW)
x_sma_short = list(map(abcutils.core.pd2epoch, sma_short.index))
y_sma_short = sma_short.values

# Same again for the long window.
sma_long = abcutils.features.calculate_sma(
    example_df,
    '_datetime_start',
    plot_metric,
    abcutils.features.LONG_WINDOW)
x_sma_long = list(map(abcutils.core.pd2epoch, sma_long.index))
y_sma_long = sma_long.values

### Calculate the Autocorrelation Function

Generate the loci of interest with `generate_loci_sma` (here with `mins=True, maxes=False`), then calculate the autocorrelation function (ACF) around those loci for the test conditions selected above and generate the ACF plot.

In [None]:
# The plotted metric as a Series indexed by each row's '_datetime_start'
# converted through pd2epoch.
dataset = pandas.Series(
    example_df[plot_metric].values,
    index=list(map(abcutils.core.pd2epoch, example_df['_datetime_start'])),
    name=plot_metric)

# Loci reported by generate_loci_sma (mins requested, maxes not), converted
# with the same helper so they can be used as labels into `dataset`.
loci_df = abcutils.features.generate_loci_sma(example_df,
                                              plot_metric,
                                              mins=True,
                                              maxes=False)
loci = list(map(abcutils.core.pd2epoch, loci_df['_datetime_start']))

# Autocorrelation from 0 to 28*delta in steps of delta, aggregated with the
# median and with the sample (ddof=1) standard deviation as the spread.
xv, yv, ct, std = abcutils.correlation.autocorrelation(
    dataset=dataset,
    loci=loci,
    xmin=0,
    xmax=28 * delta,
    delta=delta,
    norm=True,
    norm_by_locus=False,
    agg_func=numpy.median,
    stdev_func=lambda x: numpy.std(x, ddof=1))

In [None]:
# Three stacked panels: raw data with loci and SMAs, the ACF, and the ACF with
# its standard deviation drawn as error bars.
fig, axes = matplotlib.pyplot.subplots(nrows=3, ncols=1)
fig.set_size_inches(16, 8)
fig.suptitle("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))

# Convert start times with the same helper used to build `loci`, the index of
# `dataset`, and the SMA x-values, so every series on the top panel is placed
# by one and the same conversion.
x_raw = [abcutils.core.pd2epoch(x) for x in example_df['_datetime_start']]
y_raw = example_df[plot_metric]

# Loci are highlighted at the metric value recorded at each locus
x_low = loci
y_low = dataset.loc[loci].values

### plot the raw data
ax = axes[0]
ax.grid()
ax.bar(x_raw, y_raw, width=delta, alpha=0.5)
ax.bar(x_low, y_low, width=delta, color='red')
ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric).replace(" ", "\n"))

### plot the simple moving averages
ax.plot(x_sma_short, y_sma_short, color='C1', linewidth=2)
ax.plot(x_sma_long, y_sma_long, color='C2', linewidth=2)

# Render epoch-second ticks as dates through a formatter instead of freezing
# label strings with set_xticklabels(): frozen labels stay put while the tick
# locations are still chosen automatically, so they can end up describing the
# wrong positions once more artists are added or the view changes.
# (matplotlib.ticker is loaded as a side effect of matplotlib.pyplot, which
# this cell already relies on.)
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda epoch, _pos: datetime.datetime.fromtimestamp(epoch).strftime("%b %d")))

### plot the autocorrelation function
ax = axes[1]
ax.plot(xv / delta, yv, linestyle='-', marker='o')
ax.grid()
ax.set_ylabel("Performance\nACF")

### plot the autocorrelation function again, with its standard deviation
ax = axes[2]
ax.plot(xv / delta, yv, linestyle='-', marker='o')
ax.errorbar(xv / delta, yv, xerr=None, yerr=std, color='grey', capsize=4)
ax.grid()
ax.set_ylabel("ACF w/ stdev")
ax.set_xlabel("Days")
# Trailing no-op keeps the cell from echoing the repr of the last call
pass

## Generate and plot ACFs for ALL benchmark data

This takes a _very_ long time to compute.

In [None]:
# Global plot parameters
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)
group_by = ['_test_platform', '_benchmark_id']

# Determine which plots to generate
test_platforms = sorted(filtered_df['_test_platform'].unique())
benchmark_ids = sorted(filtered_df['_benchmark_id'].unique())

#test_platforms = ['cscratch@cori-knl']
#benchmark_ids = ['dbscan_read_shared_read', 'vpicio_uni_shared_write']

# Number of one-day steps covered by each ACF, and the step size in seconds
num_days = 14
delta = datetime.timedelta(days=1).total_seconds()
grouped_df = filtered_df.groupby(by=group_by)

# Summary statistics for every (platform, benchmark) group.  This does not
# depend on the loop variables, so it is computed once here instead of once per
# panel.  NOTE(review): nothing in the plotting loop below reads this; drop it
# if no later cell needs it.
metric_distributions = grouped_df.describe()

# Generate plots: one figure per platform, one panel per benchmark
for test_platform in test_platforms:
    # squeeze=False always returns a 2-D array of Axes, so indexing also works
    # when only a single benchmark is selected.
    fig, axes = matplotlib.pyplot.subplots(nrows=len(benchmark_ids),
                                           ncols=1,
                                           sharex=True,
                                           squeeze=False)
    axes = axes[:, 0]
    fig.set_size_inches(20, 4 * len(benchmark_ids))
    axes[-1].set_xlabel("Days")

    for index, benchmark_id in enumerate(benchmark_ids):
        # Style the panel first so that panels skipped below still render
        # consistently, just without a curve.
        ax = axes[index]
        ax.set_xlim(0, num_days - 1)
        ax.set_ylim(0.4, 1.2)
        ax.set_xticks(range(num_days))
        ax.grid()
        ax.set_ylabel("Relative Performance")
        ax.set_title("%s %s" % (test_platform, benchmark_id),
                     x=0.01, y=0.02, horizontalalignment='left')

        try:
            example_df = grouped_df.get_group((test_platform, benchmark_id))
        except KeyError:
            # No rows for this (platform, benchmark) pair; leave the panel empty
            continue

        dataset = pandas.Series(data=example_df[plot_metric].values,
                                name=plot_metric,
                                index=[abcutils.core.pd2epoch(x) for x in example_df['_datetime_start']])
        loci = [abcutils.core.pd2epoch(x) for x in abcutils.features.generate_loci_sma(example_df,
                                                                                       plot_metric,
                                                                                       mins=True,
                                                                                       maxes=False)['_datetime_start']]

        # Nothing to correlate around if no loci were found
        if not loci:
            continue

        xv, yv, ct, std = abcutils.correlation.autocorrelation(dataset=dataset,
                                                               loci=loci,
                                                               xmin=0*delta,
                                                               xmax=num_days*delta,
                                                               delta=delta,
                                                               norm=True)

        # x-axis in days
        ax.plot(xv / delta, yv, linestyle='-', marker='o')