In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
# matplotlib.rcParams.update({'font.family': 'serif'})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset analyzed by every cell below.  How load_dataset() builds
# (and caches) the frame is internal to abcutils and not visible from here;
# later cells rely on it having at least the '_test_platform',
# '_benchmark_id' and '_datetime_start' columns.
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Platform of interest, written as file_system@compute_system.  Exactly one
# assignment should be active.  Within this notebook the value is only echoed
# below: the correlation loop walks every platform in the dataset and the
# bimodality section assigns its own `test_platform`.
# TEST_PLATFORM = 'scratch2@edison'
# TEST_PLATFORM = 'cscratch@cori-knl'
# TEST_PLATFORM = 'cscratch@cori-haswell'
TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark selection is currently disabled (no BENCHMARK_ID is defined), so
# the analysis pools every benchmark that ran on a platform.
# BENCHMARK_ID = 'ior_fpp_write'
# BENCHMARK_ID = 'ior_fpp_read'
# BENCHMARK_ID = 'dbscan_read_shared_read'
# BENCHMARK_ID = 'vpicio_uni_shared_write'
# BENCHMARK_ID = 'ior_shared_write'
# BENCHMARK_ID = 'ior_shared_read'
# BENCHMARK_ID = 'hacc_io_read_fpp_read'
# BENCHMARK_ID = 'hacc_io_write_fpp_write'

# Column that every other metric is correlated against and whose moving
# averages define the performance regions.
plot_metric = 'darshan_normalized_perf_by_max'
# One day expressed in seconds.
delta = datetime.timedelta(days=1).total_seconds()

# example_df = filtered_df.groupby(by=['_test_platform', '_benchmark_id']).get_group((TEST_PLATFORM, BENCHMARK_ID)).copy()

# Single-argument print(...) calls emit identical text under Python 2 and 3,
# whereas the former print statements were a SyntaxError on Python 3.
print("test_platform = %s" % TEST_PLATFORM)
# print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % abcutils.sc18paper.DATE_START.isoformat())
print("date_end = %s" % abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Metrics to include in UMAMI renderings and analysis.  Anything that
# _might_ affect performance should be included here.
#
# Entries that are commented out are deliberately excluded from the analysis.
# The first entry is the same column as plot_metric; the correlation loop
# skips any row equal to plot_metric, so it is never correlated with itself.
umami_rows = [
    'darshan_normalized_perf_by_max',
    'coverage_factor_bw',
#   'coverage_factor_nodehrs',
    'coverage_factor_opens',
    'coverage_factor_stats',
    'coverage_factor_ops',
    'fs_ave_mds_cpu',
#   'fs_tot_metadata_ops',
    'fs_ave_oss_cpu',
#   'fs_tot_open_ops',
    'fshealth_ost_most_full_pct',
    'fshealth_ost_overloaded_oss_count',
#   'jobsdb_concurrent_nodes',
    'topology_job_max_radius',
]

## Region-defined Correlation

In [None]:
# Widths of the short and long simple moving average (SMA) windows.  Both are
# passed to abcutils.features.sma_intercepts and abcutils.plot.sma_overlaps
# by later cells.
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=49)

# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3.
print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Calculate intercepts and centroids from SMAs

* **Intercepts** are the points where two SMAs cross each other
* **Performance regions** are the data bounded by two intercepts
* **Centroids** are the centermost data points of their performance regions (one per region)

With this nomenclature, it is possible to also define **centroid regions** which are bounded by two centroids.  These regions capture the transition between two performance regions.

In [None]:
# Significance threshold: the correlation loop records a (region, metric)
# pair only when its p-value is strictly below this cutoff.
# NOTE: the two scatter-plot cells filter on a literal 1e-5 rather than on
# this variable, so loosening the cutoff here alone will not add points to
# those figures.
pcutoff = 1.0e-5
#pcutoff = 0.05

In [None]:
# One entry is appended to every list for each (region, metric) pair whose
# correlation with plot_metric passes the significance test.  A dict of lists
# is used so that results_df still has every column when nothing is found.
results = {
    'test_platform': [],
    'region_start': [],
    'region_end': [],
    'region_start_index': [],
    'region_end_index': [],
    'metric': [],
    'coeff': [],
    'pvalue': [],
    'region_points': []
}
# Regions in which at least one metric correlated significantly.
identified_regions = []

# Fewest observations accepted when computing a correlation.  Two points
# always lie on a line, so they yield a meaningless "perfect" correlation.
MIN_REGION_POINTS = 3

# Group once instead of rebuilding the groupby for every platform; a scalar
# grouper with a scalar key is accepted by every pandas version.
platform_groups = filtered_df.groupby('_test_platform')

# Iterate in order of first appearance in the dataset.
for test_platform in filtered_df['_test_platform'].unique():
    print("Processing %s" % test_platform)
    example_df = platform_groups.get_group(test_platform)

    sma_centroids = abcutils.features.sma_intercepts(example_df,
                                                    plot_metric,
                                                    short_window=SHORT_WINDOW,
                                                    long_window=LONG_WINDOW)

    for region in list(abcutils.features.intercepts_to_region(example_df, sma_centroids)):
        x = region[plot_metric].values
        base_nan_filter = numpy.isnan(x)
        region_points = int((~base_nan_filter).sum())

        # Checked before touching region.iloc[...] so that an empty region is
        # skipped instead of raising IndexError.
        if region_points < MIN_REGION_POINTS:
            continue

        region_start = region.iloc[0]['_datetime_start']
        region_end = region.iloc[-1]['_datetime_start']
        title = "%s - %s (%d points)" % (region_start, region_end, region_points)

        identified = False
        for y_label in umami_rows:
            # Never correlate the performance metric with itself.
            if y_label == plot_metric:
                continue
            y = example_df.loc[region.index, y_label].values
            try:
                nan_filter = base_nan_filter | numpy.isnan(y)
            except TypeError:
                # non-numeric column; numpy.isnan cannot evaluate it
                continue
            this_x = x[~nan_filter]
            this_y = y[~nan_filter]

            # The minimum-size rule must hold for the *paired* data: dropping
            # rows where y is NaN can leave only one or two usable pairs even
            # though x alone had enough points.
            if len(this_y) < MIN_REGION_POINTS:
                continue

            coeff, pval = scipy.stats.pearsonr(this_x, this_y)
            # NaN results (e.g. constant input) fail the p-value comparison
            # and are dropped here as well.
            # NOTE(review): only near-perfect *positive* correlations are
            # rejected; a coefficient of -1 would still be accepted.  Confirm
            # whether abs(coeff) was intended.
            if not (pval < pcutoff and coeff < 0.9999):
                continue

            if not identified:
                print("new region for %s: %s" % (test_platform, title))
            results['test_platform'].append(test_platform)
            results['region_start'].append(region_start)
            results['region_end'].append(region_end)
            results['region_start_index'].append(region.index[0])
            results['region_end_index'].append(region.index[-1])
            results['metric'].append(y_label)
            results['coeff'].append(coeff)
            results['pvalue'].append(pval)
            results['region_points'].append(region_points)
            print("    Fit for %20s: R = %8.4f, p = %12.4e" % (y_label, coeff, pval))
            print("")
            identified = True

        # Keep track of regions that have known root causes
        if identified:
            identified_regions.append(region)

results_df = pandas.DataFrame.from_dict(results)
results_df

## Gather all correlations

In [None]:
# Not referenced by any other cell visible in this notebook; kept so the
# name stays defined for anything that may still expect it.
plot_bars = []

# Per (platform, metric): how many significant regions correlated positively
# and negatively, the mean coefficient of each sign, and the mean p-value.
# Iterating the groupby objects directly yields groups in sorted key order
# and avoids a second get_group lookup for every key.
for test_platform, platform_results in results_df.groupby('test_platform'):
    for metric, metric_results in platform_results.groupby('metric'):
        coeffs = metric_results['coeff']
        pvals = metric_results['pvalue']
        positive = coeffs[coeffs > 0]
        negative = coeffs[coeffs < 0]
        # The mean of an empty selection is NaN, which prints as "nan".
        print("%20s %30s pos: %2d (R=%8.4f), neg: %2d (R=%8.4f), p: %12.4e" % (
            test_platform,
            metric,
            positive.count(),
            positive.mean(),
            negative.count(),
            negative.mean(),
            pvals.mean()))
        

In [None]:
# Show the table of significant correlations again, one row per
# (platform, region, metric) combination recorded by the correlation loop.
results_df

In [None]:
def newxlabel(oldlabel):
    """Turn a raw group label into a display label for the plot axes.

    A label containing '@' is read as ``file_system@compute_system``; a
    leading '(' on the file system and trailing ')' / ',' characters on the
    system are stripped.  The result is two lines: the system name on top and
    the file system underneath.  Any other label is looked up in
    ``abcutils.CONFIG['umami_rows']`` and returned unchanged when it has no
    entry there.
    """
    if '@' not in oldlabel:
        return abcutils.CONFIG['umami_rows'].get(oldlabel, oldlabel)

    # Split on the first '@' only; any further '@' stays in the system part.
    fs_part, _sep, sys_part = oldlabel.partition('@')
    file_system = fs_part.lstrip('(')
    system = sys_part.rstrip('),')
    # 'cori-knl' is shown simply as "Cori"; other systems are title-cased.
    system = 'Cori' if system == 'cori-knl' else system.title()
    return "%s\n%s" % (system, file_system)

In [None]:
# Scatter of correlation coefficients laid out by platform: one x position
# per (platform, metric) pair, a vertical rule between platforms, and the
# platform name printed above each section.  Marker area grows as the
# p-value shrinks.
XPAD = 0.5
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 4))

significant_df = results_df[results_df['pvalue'] < 1e-5]
grouped_df = significant_df.groupby(['test_platform', 'metric'])

last_sys = None
x_offsets = [0.5]       # tick positions; the first one is an unlabeled pad
x_labels = ['']         # tick labels, kept parallel to x_offsets
x_regions = [0.0]       # x coordinates of the section boundaries
x_region_names = []     # platform shown above each section
ymin, ymax = ax.set_ylim(-1.1, 1.1)
for (test_platform, metric), group_data in grouped_df:
    step = 1.0
    if last_sys is not None and test_platform != last_sys:
        # First metric of a new platform: close the previous section with a
        # vertical rule and leave extra horizontal room before this point.
        region_end = x_offsets[-1] + 0.5 * (1.0 + XPAD)
        ax.plot([region_end, region_end], [ymin, ymax], linestyle='-', color='black')
        x_regions.append(region_end)
        x_region_names.append(last_sys)
        step = 1.0 + XPAD
    x_offsets.append(x_offsets[-1] + step)

    column_x = [x_offsets[-1]] * len(group_data)
    marker_sizes = -20.0 * numpy.log10(group_data['pvalue'])
    ax.scatter(column_x,
               group_data['coeff'].values,
               marker='o',
               s=marker_sizes,
               facecolors='#00000044')

    x_labels.append(newxlabel(metric))
    last_sys = test_platform

# Close the final section.
x_regions.append(x_offsets[-1] + 0.5 * (1.0 + XPAD))
x_region_names.append(last_sys)
ax.set_xticks(x_offsets)
ax.set_xticklabels(x_labels, rotation=45, ha='right')

# Pin the x limits so the zero line drawn next cannot rescale the axis.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.plot([xmin, xmax], [0, 0], linestyle='-', color='black', linewidth=1)
ax.set_yticks(numpy.arange(-1.0, 1.1, 0.2))
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Correlation Coefficient")

xmin, xmax = ax.set_xlim(xmin, xmax - XPAD)
# Centre each platform name above its section, just outside the y range.
for left, right, region_name in zip(x_regions[:-1], x_regions[1:], x_region_names):
    ax.text(left + (right - left) / 2.0,
            1.2,
            newxlabel(region_name),
            fontsize=16,
            ha='center')



In [None]:
# Scatter of correlation coefficients laid out by metric: one x position per
# (metric, platform) pair, a vertical rule between metrics, and the metric
# label printed above each section.  Marker area grows as the p-value
# shrinks.  The figure is written to figs/trend-correlations.pdf.
XPAD = 1.5
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 4))

grouped_df = results_df[results_df['pvalue'] < 1e-5].groupby(['metric', 'test_platform'])

last_metric = None
x_offsets = [0.0]       # tick positions; the first one is an unlabeled pad
x_labels = ['']         # tick labels, kept parallel to x_offsets
x_regions = [0.0]       # x coordinates of the section boundaries
x_region_names = []     # metric shown above each section
ymin, ymax = ax.set_ylim(-1.1, 1.1)
# Group keys arrive in groupby order, i.e. (metric, test_platform).  They
# used to be unpacked into swapped names, which made the rest of the loop
# misleading to read even though it produced the intended layout.
for (metric, test_platform), group_data in grouped_df:
    if len(group_data) == 0:
        continue

    if metric == last_metric or last_metric is None:
        x_offsets.append(x_offsets[-1] + 1.0)
    else:
        # First platform of a new metric: close the previous section with a
        # vertical rule and leave extra horizontal room before this point.
        region_end = x_offsets[-1] + 0.5 * (1.0 + XPAD)
        ax.plot([region_end, region_end], [ymin, ymax], linestyle='-', color='black')
        x_regions.append(region_end)
        x_region_names.append(last_metric)
        x_offsets.append(x_offsets[-1] + (1.0 + XPAD))

    ax.scatter([x_offsets[-1]] * len(group_data),
            group_data['coeff'].values,
            marker='o',
            s=-20.0 * numpy.log10(group_data['pvalue']),
            facecolors='#00000044')

    last_metric = metric

    x_labels.append(newxlabel(test_platform))

# Close the final section and add one trailing, unlabeled tick beyond it.
x_regions.append(x_offsets[-1] + 1.0 * (XPAD))
x_offsets.append(x_regions[-1] + 1.0)
# The trailing tick needs a (blank) label of its own so that ticks and labels
# have matching lengths; a length mismatch is rejected by recent matplotlib
# releases.
x_labels.append('')
x_region_names.append(last_metric)
ax.set_xticks(x_offsets)
ax.set_xticklabels([label.replace('\n', ' ') for label in x_labels], rotation=45, ha='right')

# Pin the x limits so the zero line drawn next cannot rescale the axis.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.plot([xmin, xmax], [0, 0], linestyle='-', color='black', linewidth=1)
ax.set_yticks(numpy.arange(-1.0, 1.1, 0.2))
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Correlation Coefficient")

xmin, xmax = ax.set_xlim(xmin, xmax - XPAD * 0.9)
# Centre each metric label above its section, one word per line.
for iregion in range(1, len(x_regions)):
    width = x_regions[iregion] - x_regions[iregion-1]
    ax.text(x_regions[iregion-1] + width / 2.0,
            1.2,
            newxlabel(x_region_names[iregion-1]).replace(' ', '\n'),
            fontsize=16,
            ha='center')

# savefig does not create missing directories.
if not os.path.isdir("figs"):
    os.makedirs("figs")
fig.savefig("figs/trend-correlations.pdf", bbox_inches='tight')

## Identify source of bimodality in fs_ave_oss_cpu

In [None]:
# Platform and metric to drill into.  Both values are matched exactly against
# the 'test_platform' and 'metric' columns of results_df, and test_platform
# is also used to select rows of filtered_df and to name the output file.
test_platform = 'cscratch@cori-knl'
interesting_metric = 'fs_ave_oss_cpu'

In [None]:
# Boolean mask over results_df selecting the significant regions found for
# the chosen metric on the chosen platform; reused when shading the figure.
is_chosen_metric = results_df['metric'] == interesting_metric
is_chosen_platform = results_df['test_platform'] == test_platform
filter_criteria = is_chosen_metric & is_chosen_platform
results_df[filter_criteria]

In [None]:
# All measurements taken on the chosen platform.  A scalar grouper paired
# with a scalar key selects the same rows as the former length-1 list grouper
# with a parenthesised (non-tuple) key, which newer pandas releases deprecate.
example_df = filtered_df.groupby('_test_platform').get_group(test_platform)

In [None]:
# Recompute the SMA intercepts for the chosen platform only, using the same
# metric and window widths as the region analysis.  This replaces the value
# left over from the last iteration of the correlation loop.
sma_centroids = abcutils.features.sma_intercepts(example_df,
                                                plot_metric,
                                                short_window=SHORT_WINDOW,
                                                long_window=LONG_WINDOW)

In [None]:
# Restrict the plotted rows to a single benchmark on the chosen platform.
cheat_filter = example_df['_benchmark_id'] == 'hacc_io_write_fpp_write'
# sma_centroids[0:0] is an empty slice, so no intercepts are handed to the
# plotting helper, and both region colors are fully transparent (alpha 00).
# NOTE(review): presumably this leaves only the SMA curves and the raw data
# visible so that custom region shading can be added afterwards; the exact
# drawing behavior and the meaning of method='value' are defined inside
# abcutils.plot.sma_overlaps - confirm there.
ax = abcutils.plot.sma_overlaps(dataframe=example_df[cheat_filter],
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_overlaps=sma_centroids[0:0],
                                regioncolors=['#00000000', '#00000000'],
                                method='value')
fig = ax.get_figure()

In [None]:
# Erase the raw data
for patch in ax.patches:
    if patch.get_width() == 86400:
        patch.set_visible(False)

In [None]:
# Shade every significant region of the chosen metric and platform over the
# full height of the plot: translucent red where the correlation is
# negative, translucent blue otherwise.
ymin, ymax = ax.set_ylim(0, 1)
for row in results_df[filter_criteria].itertuples():
    start = abcutils.core.pd2epoch(row.region_start)
    end = abcutils.core.pd2epoch(row.region_end)
    if row.coeff < 0.0:
        color = '#FF00002A'
    else:
        color = '#0000FF2A'
    patch = matplotlib.patches.Rectangle(xy=(start, 0.0),
                                         width=(end - start),
                                         height=(ymax - ymin),
                                         facecolor=color)
    ax.add_patch(patch)
fig

In [None]:
# Final cosmetic pass before export: resize, tidy the time axis, flatten the
# y label onto one line and label the two moving-average curves.
figure = ax.get_figure()
figure.set_size_inches(8, 3)
abcutils.plot.fix_xticks_timeseries(ax)
single_line_ylabel = ax.get_ylabel().replace('\n', ' ')
ax.set_ylabel(single_line_ylabel)
# assumes the first two lines on the axes are the short and long SMA, in
# that order - TODO confirm against abcutils.plot.sma_overlaps
sma_lines = ax.get_lines()
sma_lines[0].set_label("$SMA_{short}$")
sma_lines[1].set_label("$SMA_{long}$")
ax.legend(loc='lower right', bbox_to_anchor=(1.01, -0.04))

fig

In [None]:
# Name the PDF after the file system (the part of test_platform before '@')
# and the metric with its underscores removed.
output_file = "figs/%s-bimodal-%s.pdf" % (test_platform.split('@', 1)[0], interesting_metric.replace('_', ''))
# savefig does not create missing directories.
output_dir = os.path.dirname(output_file)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)
print("Saving to %s" % output_file)
fig.savefig(output_file, bbox_inches='tight')