In [None]:
%matplotlib inline

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 18})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset via abcutils' SC18-paper helper.  Every analysis cell below
# reads from filtered_df (selected per platform through its '_test_platform' column).
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Metric whose short/long moving averages define the regions analyzed below,
# and against which every other metric is correlated.
plot_metric = 'darshan_normalized_perf_by_max'
# One day expressed in seconds.
delta = datetime.timedelta(days=1).total_seconds()

# Echo the configuration; the raw metric name is printed when no label is configured.
print("plot_metric =", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start =", abcutils.sc18paper.DATE_START.isoformat())
print("date_end =", abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# This cutoff indicates how statistically significant a correlation must
# be before it is recorded and plotted.  Lower p-values are more statistically
# significant; a correlation is kept only when its p-value is strictly below
# this value.
pcutoff = 1.0e-5
print("P-value cutoff is", pcutoff)

In [None]:
# Metrics to include in UMAMI renderings and analysis.  Anything that
# _might_ affect performance should be included here.
# The performance metric itself (plot_metric) is listed too; the correlation
# loop skips it so it is never correlated against itself.  Entries that are
# commented out are excluded from the analysis.
umami_rows = [
    'darshan_normalized_perf_by_max',
    'contention_bw',
#   'contention_nodehrs',
    'contention_opens',
    'contention_stats',
    'contention_ops',
    'fs_ave_mds_cpu',
#   'fs_tot_metadata_ops',
    'fs_ave_oss_cpu',
#   'fs_tot_open_ops',
    'fshealth_ost_most_full_pct',
    'fshealth_ost_overloaded_oss_count',
#   'jobsdb_concurrent_nodes',
    'topology_job_max_radius',
]

## Region-defined Correlation

In [None]:
# Width of simple moving average (SMA) short/long windows.  Both are passed to
# abcutils.features.sma_intercepts() and abcutils.plot.sma_overlaps() below.
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=49)

print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Calculate correlations over each divergence region

In [None]:
# Minimum number of paired, non-NaN observations required before a correlation
# is computed.  With fewer than three points the Pearson coefficient is
# degenerate (two points always give +/-1) and scipy rejects a single point.
MIN_REGION_POINTS = 3

# Column-oriented accumulator: one entry per (region, metric) pair whose
# correlation with plot_metric passes the p-value cutoff.
results = {
    'test_platform': [],
    'region_start': [],
    'region_end': [],
    'region_start_index': [],
    'region_end_index': [],
    'metric': [],
    'coeff': [],
    'pvalue': [],
    'region_points': []
}
# Regions in which at least one metric correlated significantly.
identified_regions = []

for test_platform in filtered_df['_test_platform'].unique():
    print("Processing", test_platform)
    # A boolean mask selects the same rows, in the same order, as
    # groupby(...).get_group(...) without regrouping the whole frame on every
    # iteration and without the non-tuple get_group key that newer pandas
    # deprecates for list-of-one groupings.
    example_df = filtered_df[filtered_df['_test_platform'] == test_platform]

    sma_centroids = abcutils.features.sma_intercepts(example_df,
                                                    plot_metric,
                                                    short_window=SHORT_WINDOW,
                                                    long_window=LONG_WINDOW)

    for region in abcutils.features.intercepts_to_region(example_df, sma_centroids):
        x = region[plot_metric].values
        base_nan_filter = numpy.isnan(x)
        region_points = int(numpy.count_nonzero(~base_nan_filter))

        # Check the size before touching region.iloc[...] so that an empty
        # region is skipped instead of raising IndexError.
        if region_points < MIN_REGION_POINTS:
            continue

        first_start = region.iloc[0]['_datetime_start']
        last_start = region.iloc[-1]['_datetime_start']
        title = "%s - %s (%d points)" % (first_start, last_start, region_points)

        identified = False
        for y_label in umami_rows:
            if y_label == plot_metric:
                continue
            y = example_df.loc[region.index, y_label].values
            try:
                nan_filter = base_nan_filter | numpy.isnan(y)
            except TypeError:
                # non-numeric column; nothing to correlate
                continue
            this_x = x[~nan_filter]
            this_y = y[~nan_filter]
            # The candidate metric may have NaNs of its own, so re-check the
            # sample size after pairing the two series.
            if len(this_y) < MIN_REGION_POINTS:
                continue

            coeff, pval = scipy.stats.pearsonr(this_x, this_y)
            if pval < pcutoff and coeff < 0.9999:
                if not identified:
                    print("new region for %s: %s" % (test_platform, title))
                results['test_platform'].append(test_platform)
                results['region_start'].append(first_start)
                results['region_end'].append(last_start)
                results['region_start_index'].append(region.index[0])
                results['region_end_index'].append(region.index[-1])
                results['metric'].append(y_label)
                results['coeff'].append(coeff)
                results['pvalue'].append(pval)
                results['region_points'].append(region_points)
                identified = True

        # Keep track of regions that have known root causes
        if identified:
            identified_regions.append(region)

results_df = pandas.DataFrame.from_dict(results)
results_df

## Gather all correlations

In [None]:
plot_bars = []

# For every (test platform, metric) pair, report how many significant regions
# correlated positively and negatively, the mean coefficient of each sign,
# and the mean p-value.
for test_platform, platform_results in results_df.groupby('test_platform'):
    for metric, metric_results in platform_results.groupby('metric'):
        coeffs = metric_results['coeff']
        pvals = metric_results['pvalue']
        positive = coeffs[coeffs > 0]
        negative = coeffs[coeffs < 0]
        print("%20s %30s pos: %2d (R=%8.4f), neg: %2d (R=%8.4f), p: %12.4e" % (
            test_platform,
            metric,
            positive.count(),
            positive.mean(),
            negative.count(),
            negative.mean(),
            pvals.mean()))
        

In [None]:
def newxlabel(oldlabel):
    """Translate an internal platform or metric key into a plot label.

    Keys containing '@' are treated as ``fs@host`` platform identifiers: the
    configured public label is returned when one exists, otherwise a
    ``"Host\\nfs"`` label is derived from the key itself.  Any other key is
    treated as a metric: its configured UMAMI row label (or the key itself
    when none is configured) is returned with "CF" expanded to "Contention".
    """
    if '@' not in oldlabel:
        metric_label = abcutils.CONFIG['umami_rows'].get(oldlabel, oldlabel)
        return metric_label.replace("CF", "Contention")

    public_label = abcutils.CONFIG['platform_labels_public'].get(oldlabel)
    if public_label:
        return public_label

    # dated fallback: build a human-readable label from the fs@host key,
    # dropping any wrapping "(" ... ")," left over from a stringified tuple
    file_system, host = oldlabel.split('@', 1)
    file_system = file_system.lstrip('(')
    host = host.rstrip('),')
    host = 'Cori' if host == 'cori-knl' else host.title()
    return "%s\n%s" % (host, file_system)

In [None]:
XPAD = 0.5
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 4))

# Scatter every significant correlation coefficient, one x position per
# (test platform, metric) pair, with a vertical rule between platforms.
significant_df = results_df[results_df['pvalue'] < pcutoff]
grouped_df = significant_df.groupby(['test_platform', 'metric'])

last_sys = None
x_offsets = [0.5]       # tick positions; the first one carries an empty label
x_labels = ['']
x_regions = [0.0]       # x coordinates of the boundaries between platforms
x_region_names = []     # platform drawn between each pair of boundaries
ymin, ymax = ax.set_ylim(-1.1, 1.1)
for (test_platform, metric), group_data in grouped_df:
    if last_sys is not None and test_platform != last_sys:
        # Platform changed: draw a separator and leave extra horizontal padding
        region_end = x_offsets[-1] + 0.5 * (1.0 + XPAD)
        ax.plot([region_end, region_end], [ymin, ymax], linestyle='-', color='black')
        x_offsets.append(x_offsets[-1] + (1.0 + XPAD))
        x_regions.append(region_end)
        x_region_names.append(last_sys)
    else:
        x_offsets.append(x_offsets[-1] + 1.0)

    # Marker area grows as the p-value shrinks
    ax.scatter([x_offsets[-1]] * len(group_data),
               group_data['coeff'].values,
               marker='o',
               s=-20.0 * numpy.log10(group_data['pvalue']),
               facecolors='#00000044')

    last_sys = test_platform
    x_labels.append(newxlabel(metric))

# Close out the final platform's region
x_regions.append(x_offsets[-1] + 0.5 * (1.0 + XPAD))
x_region_names.append(last_sys)
ax.set_xticks(x_offsets)
ax.set_xticklabels(x_labels, rotation=45, ha='right')

# Pin the current x limits, then draw the zero-correlation line across them
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.plot([xmin, xmax], [0, 0], linestyle='-', color='black', linewidth=1)
ax.set_yticks(numpy.arange(-1.0, 1.1, 0.2))
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Correlation Coefficient")

# Trim the right edge and center each platform's name above its region
xmin, xmax = ax.set_xlim(xmin, xmax - XPAD)
for left, right, region_name in zip(x_regions[:-1], x_regions[1:], x_region_names):
    width = right - left
    ax.text(left + width / 2.0,
            1.2,
            newxlabel(region_name),
            fontsize=16,
            ha='center')

In [None]:
XPAD = 1.5
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 4))

# Scatter every significant correlation coefficient, one x position per
# (metric, test platform) pair.  Metrics form the labeled regions along the
# top of the plot; test platforms are the x tick labels.
grouped_df = results_df[results_df['pvalue'] < pcutoff].groupby(['metric', 'test_platform'])

last_metric = None
x_offsets = [0.0]       # tick positions; the first one carries an empty label
x_labels = ['']
x_regions = [0.0]       # x coordinates of the boundaries between metrics
x_region_names = []     # metric drawn between each pair of boundaries
ymin, ymax = ax.set_ylim(-1.0, 1.0)
# Group keys follow the groupby column order: (metric, test_platform).
for (metric, test_platform), group_data in grouped_df:
    if metric == last_metric or last_metric is None:
        x_offsets.append(x_offsets[-1] + 1.0)
    else:
        # Metric changed: draw a separator and leave extra horizontal padding
        region_end = x_offsets[-1] + 0.5 * (1.0 + XPAD)
        ax.plot([region_end, region_end], [ymin, ymax], linestyle='-', color='black')
        x_regions.append(region_end)
        x_region_names.append(last_metric)
        x_offsets.append(x_offsets[-1] + (1.0 + XPAD))

    # Marker area grows as the p-value shrinks
    ax.scatter([x_offsets[-1]] * len(group_data),
               group_data['coeff'].values,
               marker='o',
               s=-40.0 * numpy.log10(group_data['pvalue']),
               facecolors='#00000044')

    last_metric = metric
    x_labels.append(newxlabel(test_platform))

# Close out the final region.  Skipped when nothing was plotted so that
# newxlabel() is never called with None.
if last_metric is not None:
    x_regions.append(x_offsets[-1] + 1.0 * XPAD)
    # Trailing tick past the last region (presumably to widen the autoscaled
    # x range before it is trimmed below).  It needs a matching empty label:
    # recent matplotlib raises ValueError when the number of fixed tick
    # locations and the number of tick labels differ.
    x_offsets.append(x_regions[-1] + 1.0)
    x_labels.append('')
    x_region_names.append(last_metric)
ax.set_xticks(x_offsets)
ax.set_xticklabels([text.replace('\n', ' ') for text in x_labels], rotation=30, ha='right')

# Pin the current x limits, then draw the zero-correlation line across them
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.plot([xmin, xmax], [0, 0], linestyle='-', color='black', linewidth=1)
ax.set_yticks(numpy.arange(-1.0, 1.1, 0.2))
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Correlation Coefficient")

# Trim the right edge and center each metric's name above its region,
# breaking the label at spaces so it fits within the region's width.
xmin, xmax = ax.set_xlim(xmin, xmax - XPAD * 0.9)
for iregion in range(1, len(x_regions)):
    width = x_regions[iregion] - x_regions[iregion-1]
    label = newxlabel(x_region_names[iregion-1]).replace(' ', '\n')
    if label.startswith("Data"):
        label = label.replace("\n", " ", 1)
    label = label.replace("File\nSystem", "File System")
    ax.text(x_regions[iregion-1] + width / 2.0,
            1.0,
            label,
            fontsize=18,
            ha='center',
            va='bottom')

# Make sure the output directory exists so a fresh checkout can run this cell
output_file = "figs/trend-correlations.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, bbox_inches='tight')

In [None]:
XPAD = 1.5
fig, ax = matplotlib.pyplot.subplots(figsize=(4, 4))

# Same layout as the all-metrics figure, restricted to a single metric.
ONLY_METRIC = "fs_ave_oss_cpu"

grouped_df = results_df[results_df['pvalue'] < pcutoff].groupby(['metric', 'test_platform'])

last_metric = None
x_offsets = [0.0]       # tick positions; the first one carries an empty label
x_labels = ['']
x_regions = [0.0]       # x coordinates of the boundaries between metrics
x_region_names = []     # metric drawn between each pair of boundaries
ymin, ymax = ax.set_ylim(-1.0, 1.0)
# Group keys follow the groupby column order: (metric, test_platform).
for (metric, test_platform), group_data in grouped_df:
    if metric != ONLY_METRIC:
        continue
    if metric == last_metric or last_metric is None:
        x_offsets.append(x_offsets[-1] + 1.0)
    else:
        # Metric changed: draw a separator and leave extra horizontal padding
        region_end = x_offsets[-1] + 0.5 * (1.0 + XPAD)
        ax.plot([region_end, region_end], [ymin, ymax], linestyle='-', color='black')
        x_regions.append(region_end)
        x_region_names.append(last_metric)
        x_offsets.append(x_offsets[-1] + (1.0 + XPAD))

    # Marker area grows as the p-value shrinks
    ax.scatter([x_offsets[-1]] * len(group_data),
               group_data['coeff'].values,
               marker='o',
               s=-40.0 * numpy.log10(group_data['pvalue']),
               facecolors='#00000044')

    last_metric = metric
    x_labels.append(newxlabel(test_platform))

# Close out the final region.  Skipped when the metric had no significant
# correlations so that newxlabel() is never called with None.
if last_metric is not None:
    x_regions.append(x_offsets[-1] + 1.0 * XPAD)
    # Trailing tick past the last region (presumably to widen the autoscaled
    # x range before it is trimmed below).  It needs a matching empty label:
    # recent matplotlib raises ValueError when the number of fixed tick
    # locations and the number of tick labels differ.
    x_offsets.append(x_regions[-1] + 1.0)
    x_labels.append('')
    x_region_names.append(last_metric)
ax.set_xticks(x_offsets)
ax.set_xticklabels([text.replace('\n', ' ') for text in x_labels], rotation=30, ha='right')

# Pin the current x limits, then draw the zero-correlation line across them
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.plot([xmin, xmax], [0, 0], linestyle='-', color='black', linewidth=1)
ax.set_yticks(numpy.arange(-1.0, 1.1, 0.2))
ax.grid()
ax.set_axisbelow(True)
ax.set_ylabel("Correlation Coefficient")

# Trim the right edge and center the metric's name above its region
xmin, xmax = ax.set_xlim(xmin, xmax - XPAD * 0.9)
for iregion in range(1, len(x_regions)):
    width = x_regions[iregion] - x_regions[iregion-1]
    label = newxlabel(x_region_names[iregion-1])
    ax.text(x_regions[iregion-1] + width / 2.0,
            1.0,
            label,
            fontsize=18,
            ha='center',
            va='bottom')

# Make sure the output directory exists so a fresh checkout can run this cell
output_file = "figs/trend-correlations-only-cpuload.pdf"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
fig.savefig(output_file, bbox_inches='tight')

### Also calculate the overall correlation

This demonstrates that correlating within each divergence region is more informative than correlating over the entire dataset with no time-dependent partitioning.

In [None]:
# For every (metric, test platform) pair that produced a significant
# region-level correlation, compute the correlation over that platform's
# entire dataset for comparison.
grouped_results = results_df[results_df['pvalue'] < pcutoff].groupby(['metric', 'test_platform'])
for (metric, test_platform), group_data in grouped_results:
    # A boolean mask selects the same rows as groupby(...).get_group(...)
    # without regrouping the whole frame on every iteration and without the
    # non-tuple get_group key that newer pandas deprecates for list-of-one
    # groupings.
    group_raw_data = filtered_df[filtered_df['_test_platform'] == test_platform]

    print("\n===== %s ======\n" % test_platform)
    for correlated_metric in group_data['metric'].unique():
        x = group_raw_data[plot_metric].values
        y = group_raw_data[correlated_metric].values
        # Correlate only the observations where both metrics are present
        nan_filter = (numpy.isnan(x) | numpy.isnan(y))
        x = x[~nan_filter]
        y = y[~nan_filter]
        coeff, pval = scipy.stats.pearsonr(x, y)
        print("Global correlation between %s and %s:\n  coeff:   %12.4f\n  p-value: %12.4e" % (
            plot_metric,
            correlated_metric,
            coeff,
            pval))

## Identify source of bimodality in fs_ave_oss_cpu

In [None]:
# Test platform and metric examined in the remaining cells of this section.
test_platform = 'cscratch@cori-knl'
interesting_metric = 'fs_ave_oss_cpu'

In [None]:
# Boolean mask over results_df selecting the chosen metric on the chosen platform
filter_criteria = (
    (results_df['metric'] == interesting_metric)
    & (results_df['test_platform'] == test_platform)
)
results_df[filter_criteria]

In [None]:
# Select the chosen platform's rows with a boolean mask.  This returns the
# same rows as groupby(...).get_group(...) but avoids passing a non-tuple key
# to a list-of-one grouping, which newer pandas deprecates.
example_df = filtered_df[filtered_df['_test_platform'] == test_platform]

In [None]:
# Recompute the short/long SMA intercepts of plot_metric for the selected
# platform's data, using the same window widths as the per-platform loop above.
sma_centroids = abcutils.features.sma_intercepts(example_df,
                                                plot_metric,
                                                short_window=SHORT_WINDOW,
                                                long_window=LONG_WINDOW)

In [None]:
# Colors for the shaded regions of positive and negative correlation
POS_CORRELATION_COLOR = 'C0'
NEG_CORRELATION_COLOR = 'C3'

# rc_context applies the presentation font size only while the figure is
# built and saved, and restores the previous rcParams even if a call fails.
with matplotlib.rc_context({'font.size': 18}):
    # Plot the moving averages of a single benchmark only
    cheat_filter = example_df['_benchmark_id'] == 'hacc_io_write_fpp_write'
    # sma_centroids[0:0] is an empty slice, so no overlap regions are handed
    # to the plotting helper; the correlated regions are drawn manually below.
    ax = abcutils.plot.sma_overlaps(dataframe=example_df[cheat_filter],
                                    plot_metric=plot_metric,
                                    short_window=SHORT_WINDOW,
                                    long_window=LONG_WINDOW,
                                    sma_overlaps=sma_centroids[0:0],
                                    regioncolors=['#00000000', '#00000000'],
                                    plotraw=False)

    # Thicken the lines for presentation mode
    for line in ax.get_lines():
        line.set_linewidth(4.0)

    # Shade each significantly correlated region for the chosen metric and
    # platform, colored by the sign of its correlation coefficient
    ymin, ymax = ax.set_ylim(0, 1)
    for row in results_df[filter_criteria].itertuples():
        start = abcutils.core.pd2epoch(row.region_start)
        end = abcutils.core.pd2epoch(row.region_end)
        color = NEG_CORRELATION_COLOR if row.coeff < 0.0 else POS_CORRELATION_COLOR
        ax.add_patch(matplotlib.patches.Rectangle(
            xy=(start, ymin),
            width=(end - start),
            height=(ymax - ymin),
            facecolor=color))

    # Set figure size and tick formatting
    ax.get_figure().set_size_inches(8, 5)
    abcutils.plot.fix_xticks_timeseries(ax,
                                        format="%b %Y",
                                        criteria=lambda x: x.day == 1 and x.month % 2 == 0)

    ax.set_ylabel(ax.get_ylabel().replace('\n', ' '))
    ax.get_lines()[0].set_label("$SMA_{short}$")
    ax.get_lines()[1].set_label("$SMA_{long}$")

    # Draw the legend from explicit proxy artists
    # NOTE(review): 'C1'/'C2' are assumed to match the colors the plotting
    # helper used for the two SMA lines -- confirm.
    legend_handlers = [
        matplotlib.lines.Line2D([0], [0], color='C1', lw=4),
        matplotlib.lines.Line2D([0], [0], color='C2', lw=4),
        matplotlib.patches.Patch(facecolor=NEG_CORRELATION_COLOR),
        matplotlib.patches.Patch(facecolor=POS_CORRELATION_COLOR),
    ]
    legend_labels = [
        "${SMA}_{short}$",
        "${SMA}_{long}$",
        "Correlation < 0",
        "Correlation > 0"
    ]
    ax.legend(legend_handlers, legend_labels, labelspacing=0, loc="lower right", framealpha=1.0)

    output_file = "figs/%s-bimodal-%s.pdf" % (test_platform.split('@', 1)[0], interesting_metric.replace('_', ''))
    # Make sure the output directory exists so a fresh checkout can run this cell
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    print("Saving to", output_file)
    ax.get_figure().savefig(output_file, bbox_inches='tight')