In [None]:
%matplotlib inline

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 18})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the global dataset through abcutils' SC18-paper helper.  The CSV merge and
# HDF5 caching described above are presumably handled inside load_dataset()
# (not visible in this notebook -- TODO confirm).
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform to analyze (presumably "<file system>@<compute system>" -- TODO confirm)
TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark of interest.  NOTE(review): in this cell it only affects the label
# printed below, because the benchmark-filtered example_df is overwritten (see below).
BENCHMARK_ID = 'hacc_io_write_fpp_write'

# Column of the dataset that is plotted and correlated against the other metrics
plot_metric = 'darshan_normalized_perf_by_max'
# One day expressed in seconds; not referenced again in the visible part of this notebook
delta = datetime.timedelta(days=1).total_seconds()

DRAW_REGION_ILOC = -4 # just for highlighting a specific region in the paper

# NOTE(review): the first assignment selects rows for (platform, benchmark) but is
# immediately replaced by the second, which selects on platform alone.  As written,
# example_df therefore contains every benchmark run on TEST_PLATFORM -- confirm
# this is intended, since "benchmark_id" is still printed below.
example_df = filtered_df.groupby(by=['_test_platform', '_benchmark_id']).get_group((TEST_PLATFORM, BENCHMARK_ID)).copy()
example_df = filtered_df.groupby(by=['_test_platform']).get_group((TEST_PLATFORM)).copy()

# Summarize the selection; labels fall back to the raw identifier when
# abcutils.CONFIG has no human-readable entry for it
print("test_platform =", TEST_PLATFORM)
print("benchmark_id =", abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric =", abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start =", abcutils.sc18paper.DATE_START.isoformat())
print("date_end =", abcutils.sc18paper.DATE_END.isoformat())
print("observations =", len(example_df))

In [None]:
# Metrics to include in UMAMI renderings and analysis.  Anything that
# _might_ affect performance should be included here.
# Each entry is a column name of the dataset; the correlation loop later in this
# notebook iterates over this list and skips the entry equal to plot_metric.
# Commented-out entries are metrics that are currently excluded.
umami_rows = [
    'darshan_normalized_perf_by_max',
    'contention_bw',
#   'coverage_factor_nodehrs',
    'contention_opens',
    'contention_stats',
    'contention_ops',
    'fs_ave_mds_cpu',
#   'fs_tot_metadata_ops',
    'fs_ave_oss_cpu',
#   'fs_tot_open_ops',
    'fshealth_ost_most_full_pct',
    'fshealth_ost_overloaded_oss_count',
#   'jobsdb_concurrent_nodes',
    'topology_job_max_radius',
]

## Region-defined Correlation

In [None]:
# Width of simple moving average (SMA) short/long windows
# Both are passed as short_window/long_window to abcutils.features.sma_intercepts
# and abcutils.plot.sma_overlaps in the cells below.
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=49)

# Echo the chosen windows so they are recorded in the notebook output
print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Calculate intercepts from SMAs

In [None]:
# Short-window SMA of plot_metric, computed by abcutils over the '_datetime_start' column
sma_short = abcutils.features.calculate_sma(example_df,
                                            '_datetime_start',
                                            plot_metric,
                                            window=SHORT_WINDOW)
# NOTE: despite its name, sma_long is not a moving average here -- it is a constant
# series holding the global mean of plot_metric, aligned to sma_short's index and
# carrying the same name.  LONG_WINDOW is not used in this cell.
# NOTE(review): neither sma_short nor sma_long is referenced by the later cells
# visible in this notebook; confirm whether this cell is still needed.
sma_long = pandas.Series([example_df[plot_metric].mean()] * len(sma_short),
                         index=sma_short.index,
                         name=sma_short.name)

In [None]:
# Let abcutils compute the intercepts between the short- and long-window SMAs of
# plot_metric.  The result is consumed by intercepts_to_region() and by the
# plotting calls below (which slice it, e.g. sma_intercepts[0:1]).
sma_intercepts = abcutils.features.sma_intercepts(example_df,
                                                  plot_metric,
                                                  short_window=SHORT_WINDOW,
                                                  long_window=LONG_WINDOW)
# Bare last expression: display the intercepts as the cell output
sma_intercepts

### Filter intercepts

Note that we ultimately abandoned delta-based filtering of intercepts.  It also doesn't make as much sense in the context of divergence regions, so we just pass the regions through unfiltered.

In [None]:
# Materialize the regions produced by abcutils from the intercepts into a list so
# they can be counted here and iterated again in later cells.  Each region is
# later indexed like a DataFrame (region[plot_metric], region.iloc, region.index).
intercept_regions = list(abcutils.features.intercepts_to_region(example_df, sma_intercepts))
print("Using %d divergence regions"  % len(intercept_regions))

## Filter regions based on p-value

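Regions that contain too few data points cannot produce a statistically meaningful correlation, so we discard any region with fewer than three valid measurements.  For the remaining regions, we keep only those correlations whose p-value falls below the cutoff.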

In [None]:
# Correlations with a p-value at or above this cutoff are not reported
pcutoff = 1.0e-5
# Minimum number of valid points needed before a correlation is attempted.
# Two points always produce a degenerate (perfect) correlation, and a single
# point makes scipy.stats.pearsonr raise, so the same minimum is applied both
# to the region as a whole and to every (plot_metric, metric) pairing.
min_points = 3

# Column-oriented accumulator: one entry per (region, metric) pair that passes
# the filters; converted to results_df at the end of the cell
results = {
    'region_start': [],
    'region_end': [],
    'region_start_index': [],
    'region_end_index': [],
    'metric': [],
    'coeff': [],
    'pvalue': [],
    'region_points': []
}
# Regions for which at least one metric passed the filters
identified_regions = []
# Number of regions skipped for having too few valid plot_metric values
not_pass_filter = 0

for region in intercept_regions:
    x = region[plot_metric].values
    base_nan_filter = numpy.isnan(x)
    region_points = len(x[~base_nan_filter])
    title = "%s - %s (%d points)" % (
        region.iloc[0]['_datetime_start'],
        region.iloc[-1]['_datetime_start'],
        region_points
    )
    if region_points < min_points:
        # two points will create a correlation with p-value = 0
        print("skipping region:", title)
        not_pass_filter += 1
        continue

    print("new region:", title)
    identified = False
    for y_label in umami_rows:
        if y_label == plot_metric:
            continue
        y = example_df.loc[region.index][y_label].values
        try:
            nan_filter = base_nan_filter | numpy.isnan(y)
        except TypeError:
            # non-numeric; pass
            continue
        this_x = x[~nan_filter]
        this_y = y[~nan_filter]
        # The metric may be NaN where plot_metric is not, so the paired sample
        # can be smaller than the region; re-check the minimum before correlating
        if len(this_y) < min_points:
            continue
        coeff, pval = scipy.stats.pearsonr(this_x, this_y)
        # coeff < 0.9999 drops (near-)perfect positive correlations
        if pval < pcutoff and coeff < 0.9999:
            results['region_start'].append(region.iloc[0]['_datetime_start'])
            results['region_end'].append(region.iloc[-1]['_datetime_start'])
            results['region_start_index'].append(region.index[0])
            results['region_end_index'].append(region.index[-1])
            results['metric'].append(y_label)
            results['coeff'].append(coeff)
            results['pvalue'].append(pval)
            results['region_points'].append(region_points)
            print("    Fit for", y_label, title)
            print("    Coefficient: %12.4f" % coeff)
            print("    p-value:     %12.4e" % pval)
            print()
            identified = True

    # Keep track of regions that have known root causes
    if identified:
        identified_regions.append(region)
    print()

print("Kept %d regions" % (len(intercept_regions) - not_pass_filter))
print("Classified %d regions" % len(identified_regions))
print("Discarded %d regions" % not_pass_filter)

# One row per (region, metric) pair that passed the filters
results_df = pandas.DataFrame.from_dict(results)

In [None]:
# Display the significant (region, metric) correlations ordered by coefficient,
# most negative first; sort_values returns a new frame, results_df is unchanged
results_df.sort_values('coeff')

In [None]:
# For every metric that correlated significantly inside at least one region,
# report how it correlates with plot_metric over the whole of example_df.
for correlated_metric in results_df['metric'].unique():
    x = example_df[plot_metric].values
    y = example_df[correlated_metric].values
    # Keep only the observations where both values are present
    keep = ~numpy.isnan(x) & ~numpy.isnan(y)
    x, y = x[keep], y[keep]
    coeff, pval = scipy.stats.pearsonr(x, y)
    report = (plot_metric, correlated_metric, coeff, pval)
    print("Global correlation between %s and %s:\n  coeff:   %12.4f\n  p-value: %12.4e" % report)

## Plot identified region(s)

### Build up the partitioning

In [None]:
# First build-up frame: raw measurements plus the short SMA.  A 999-day "long"
# window is passed instead of LONG_WINDOW; the legend below labels the second
# line "Global Average".
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=pandas.Timedelta(days=999),
                                sma_intercepts=sma_intercepts[0:1],
                                method='value')
ymin, ymax = ax.set_ylim(0, 1)

# Recolor bars and remove patches
# Rectangles exactly 86400 wide are treated as the raw measurement bars
# (presumably one day on an epoch-seconds x axis -- TODO confirm); every other
# Rectangle child of the axes is hidden.
for bar in [rect for rect in ax.get_children() if isinstance(rect, matplotlib.patches.Rectangle)]:
    if bar.get_width() == 86400:
        bar.set_color("#b3d1e5")
        bar.set_alpha(1.0)
    else:
        bar.set_visible(False)

ax.get_figure().set_size_inches((8, 6))
ax.set_axisbelow(False)

# Draw the legend
# Proxy artists are used so the legend does not depend on the plotted artists.
# NOTE(review): the legend patch colour (#91BCD9) differs from the colour the
# bars are recoloured to above (#b3d1e5) -- confirm whether they should match.
legend_handlers = [
    matplotlib.lines.Line2D([0], [0], color='C1', lw=2),
    matplotlib.lines.Line2D([0], [0], color='C2', lw=2),
    (matplotlib.patches.Patch(facecolor='#91BCD9')),
]
legend_labels = [
    "${SMA}_{short}$",
    "Global Average",
    "Performance Measurement",
]
ax.legend(legend_handlers, legend_labels, loc="lower left")


ax.set_yticks(numpy.arange(0, 1.2, 0.25))

# Thicken up the lines
for line in ax.get_lines():
    line.set_linewidth(4.0)

# Restrict the x axis to 2017-09-01 .. 2018-02-01.  time.mktime interprets the
# naive datetimes in the machine's local time zone.
ax.set_xlim(
    time.mktime(datetime.datetime(2017, 9, 1).timetuple()),
    time.mktime(datetime.datetime(2018, 2, 1).timetuple()))
abcutils.plot.fix_xticks_timeseries(ax, format="%b %Y")
# Collapse the multi-line y label onto a single line
ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

# NOTE: assumes the figs/ directory already exists; nothing in this notebook creates it
ax.get_figure().savefig("figs/mira-regions-overview-0.pdf", bbox_inches='tight')

In [None]:
# Second build-up frame: same figure as the previous cell with the bars hidden.
# This cell mutates `ax` and reuses `legend_labels` from the previous cell, so it
# must run directly after it (before `ax` is reassigned further down).
# Remove all bars
for bar in [rect for rect in ax.get_children() if isinstance(rect, matplotlib.patches.Rectangle)]:
    bar.set_visible(False)
# Only the first two labels (the two lines) remain relevant once the bars are gone
ax.legend(legend_labels[:2], loc="lower left")
ax.get_figure().savefig("figs/mira-regions-overview-1.pdf", bbox_inches='tight')
# Bare last expression: re-display the modified figure as the cell output
ax.get_figure()

In [None]:
# Third build-up frame: a fresh plot with the real LONG_WINDOW and without the
# raw measurements (plotraw=False).  `ax` is rebound here and extended by the
# next cell.
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_intercepts[0:1],
                                plotraw=False,
                                method='value')
ax.get_figure().set_size_inches((8, 6))
ymin, ymax = ax.set_ylim(0, 1)
ax.set_axisbelow(True)

# Labels are applied to the existing line artists in the order they were plotted
legend_labels = [
    "${SMA}_{short}$",
    "${SMA}_{long}$",
]
ax.legend(legend_labels, loc="lower left")
ax.set_yticks(numpy.arange(0, 1.2, 0.25))

# Thicken up the lines
for line in ax.get_lines():
    line.set_linewidth(4.0)

# Same fixed date range as the earlier frames; time.mktime interprets the
# naive datetimes in the machine's local time zone
ax.set_xlim(
    time.mktime(datetime.datetime(2017, 9, 1).timetuple()),
    time.mktime(datetime.datetime(2018, 2, 1).timetuple()))
abcutils.plot.fix_xticks_timeseries(ax, format="%b %Y")
# Collapse the multi-line y label onto a single line
ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

ax.get_figure().savefig("figs/mira-regions-overview-2.pdf", bbox_inches='tight')

In [None]:
# Fourth build-up frame: adds region boundaries to the `ax` created in the
# previous cell (this cell mutates that figure in place).
# Draw dotted lines denoting the boundaries between regions for clarity
ymin, ymax = ax.get_ylim()
# results_df has one row per (region, metric) pair, so a region with several
# correlated metrics gets its boundary lines drawn once per row; regions with
# no significant correlation have no row and therefore no boundary lines.
for row in results_df.itertuples():
    # Convert the region's start/end timestamps to x-axis coordinates
    xmin = abcutils.core.pd2epoch(row.region_start)
    xmax = abcutils.core.pd2epoch(row.region_end)
    color='black'
    ax.plot([xmin, xmin],
            [ymin, ymax],
            linestyle='--',
            linewidth=2,
            color=color)
    ax.plot([xmax, xmax],
            [ymin, ymax],
            linestyle='--',
            linewidth=2,
            color=color)

# Draw the legend
# Proxy artists: one per SMA line plus one for the dashed boundary lines
legend_handlers = [
    matplotlib.lines.Line2D([0], [0], color='C1', lw=2),
    matplotlib.lines.Line2D([0], [0], color='C2', lw=2),
    matplotlib.lines.Line2D([0], [0], color='black', linestyle='--', lw=2),
]
legend_labels = [
    "${SMA}_{short}$",
    "${SMA}_{long}$",
    "Crossover point",
]
# Opaque white legend box so the dashed lines do not show through it
ax.legend(legend_handlers,
          legend_labels,
          labelspacing=0,
          loc="lower left",
          facecolor='white',
          framealpha=1.0)

ax.get_figure().savefig("figs/mira-regions-overview-3.pdf", bbox_inches='tight')
# Bare last expression: re-display the modified figure as the cell output
ax.get_figure()

### Re-draw the final chart

In [None]:
def draw_figure6(*args, **kwargs):
    """Draw the SMA overview figure with the divergence regions marked.

    Wrapper around ``abcutils.plot.sma_overlaps``: all positional and keyword
    arguments are forwarded to it unchanged, and the returned axes are then
    decorated with shaded regions, dashed region boundaries, a legend and a
    fixed date range.

    Reads the notebook-level names ``identified_regions``, ``results_df`` and
    ``DRAW_REGION_ILOC``.  When ``DRAW_REGION_ILOC`` is not None it is a
    positional index into ``results_df``; the region that row belongs to is
    shaded red and its boundaries are drawn in red.  The highlighted region is
    resolved once from ``results_df`` and used for both the shading and the
    boundaries, so the two always agree even when a region contributes more
    than one row to ``results_df``.

    Returns:
        The matplotlib axes returned by ``abcutils.plot.sma_overlaps``.

    Raises:
        IndexError: if ``DRAW_REGION_ILOC`` is out of bounds for a non-empty
            ``results_df``.
    """
    ax = abcutils.plot.sma_overlaps(*args, **kwargs)
    ax.get_figure().set_size_inches((8, 6))

    # Thicken up the lines
    for line in ax.get_lines():
        line.set_linewidth(4.0)

    # Identify the highlighted region by the index label of its first row.
    # results_df is the reference because the per-region correlation plot is
    # also selected with results_df.iloc[DRAW_REGION_ILOC].
    highlight_start_index = None
    if DRAW_REGION_ILOC is not None and len(results_df) > 0:
        highlight_start_index = results_df.iloc[DRAW_REGION_ILOC]['region_start_index']

    # Shade in the regions identified and highlight the region of interest in red
    ymin, ymax = ax.set_ylim(0, 1)
    for region in identified_regions:
        if highlight_start_index is not None \
        and region.index[0] == highlight_start_index:
            abcutils.plot.draw_region(ax, region, facecolor='red', alpha=0.20)
        else:
            abcutils.plot.draw_region(ax, region)

    # Draw dotted lines denoting the boundaries between regions for clarity
    for row in results_df.itertuples():
        xmin = abcutils.core.pd2epoch(row.region_start)
        xmax = abcutils.core.pd2epoch(row.region_end)
        if highlight_start_index is not None \
        and row.region_start_index == highlight_start_index:
            color = 'red'
        else:
            color = 'black'
        for xpos in (xmin, xmax):
            ax.plot([xpos, xpos],
                    [ymin, ymax],
                    linestyle='--',
                    linewidth=2,
                    color=color)

    ax.set_axisbelow(True)

    # Draw the legend using proxy artists; the last entry is a tuple of two
    # patches handed to the legend as a single handle
    legend_handlers = [
        matplotlib.lines.Line2D([0], [0], color='C1', lw=4),
        matplotlib.lines.Line2D([0], [0], color='C2', lw=4),
        matplotlib.lines.Line2D([0], [0], color='black', linestyle='--', lw=2),
        (matplotlib.patches.Patch(facecolor='#00000033', lw=2, linestyle='--', edgecolor='black'),
        matplotlib.patches.Patch(facecolor='#FFFFFF00', lw=2, linestyle='--', edgecolor='black')),
    ]
    legend_labels = [
        "${SMA}_{short}$",
        "${SMA}_{long}$",
        "Crossover point",
        "Divergence region"
    ]
    ax.legend(legend_handlers, legend_labels, labelspacing=0, loc="lower left", framealpha=1.0)
    ax.set_yticks(numpy.arange(0, 1.2, 0.25))

    # Fixed date range for the paper figure; time.mktime interprets the naive
    # datetimes in the machine's local time zone
    ax.set_xlim(
        time.mktime(datetime.datetime(2017, 9, 1).timetuple()),
        time.mktime(datetime.datetime(2018, 2, 1).timetuple()))
    abcutils.plot.fix_xticks_timeseries(ax, format="%b %Y")
    # Collapse the multi-line y label onto a single line
    ax.set_ylabel(ax.get_ylabel().replace("\n", " "))

    return ax

In [None]:
# Render the figure without any highlighted region.  draw_figure6 reads the
# global DRAW_REGION_ILOC, so it is temporarily set to None.  The original value
# is restored in a `finally` clause so that a failure while plotting cannot
# leave DRAW_REGION_ILOC stuck at None for the cells that follow.
tmp = DRAW_REGION_ILOC
DRAW_REGION_ILOC = None
try:
    ax = draw_figure6(dataframe=example_df,
                      plot_metric=plot_metric,
                      short_window=SHORT_WINDOW,
                      long_window=LONG_WINDOW,
                      sma_intercepts=sma_intercepts[0:1],
                      plotraw=False,
                      linewidth=4.0)
finally:
    DRAW_REGION_ILOC = tmp
    del tmp

ax.get_figure().savefig("figs/mira-regions-overview.pdf", bbox_inches='tight')

In [None]:
# Same figure as the previous cell, but with DRAW_REGION_ILOC left at its
# configured value so draw_figure6 highlights the selected region in red
ax = draw_figure6(dataframe=example_df,
                  plot_metric=plot_metric,
                  short_window=SHORT_WINDOW,
                  long_window=LONG_WINDOW,
                  sma_intercepts=sma_intercepts[0:1],
                  plotraw=False,
                  linewidth=4.0)

ax.get_figure().savefig("figs/mira-regions-overview-highlighted.pdf", bbox_inches='tight')

## Plot for paper

In [None]:
def plot_region_correlation(dataframe, region, plot_metric):
    """Plot one divergence region and the correlation found inside it.

    Creates a two-row figure.  The top panel shows the short/long SMAs of
    ``plot_metric`` with the region shaded and outlined in red; the bottom
    panel is a scatter plot of ``plot_metric`` against the correlated metric
    for the rows inside the region, with a linear-regression line overlaid.

    Reads the notebook-level names ``SHORT_WINDOW``, ``LONG_WINDOW`` and
    ``sma_intercepts``.

    Args:
        dataframe: frame containing ``plot_metric`` and the correlated metric;
            it is sliced by index label between the region's start and end.
        region: mapping-like row (e.g. one row of ``results_df``) providing
            ``region_start_index``, ``region_end_index``, ``region_start``,
            ``region_end`` and ``metric``.
        plot_metric: name of the performance column to plot.

    Returns:
        The array of two matplotlib axes (top panel first).
    """
    fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 7))

    # Label-based slice: both the start and the end index label are included
    region_df = dataframe.loc[region['region_start_index']:region['region_end_index']]

    # Plot #1: SMAs with the region of interest marked
    ax = axes[0]
    abcutils.plot.sma_overlaps(dataframe=dataframe,
                               plot_metric=plot_metric,
                               short_window=SHORT_WINDOW,
                               long_window=LONG_WINDOW,
                               sma_intercepts=sma_intercepts[0:1],
                               plotraw=False,
                               ax=ax)
    # Thicken up the lines
    for line in ax.get_lines():
        line.set_linewidth(4.0)

    # Draw the region of interest
    min_y, max_y = ax.set_ylim(0, 1)
    min_x = abcutils.core.pd2epoch(region['region_start'])
    max_x = abcutils.core.pd2epoch(region['region_end'])
    ax.add_patch(matplotlib.patches.Rectangle(xy=(min_x, min_y),
                 width=(max_x - min_x),
                 height=(max_y - min_y),
                 facecolor='red',
                 linewidth=0,
                 alpha=0.20,
                 zorder=0))
    ax.plot([min_x, min_x], [min_y, max_y], linestyle='--', color='red')
    ax.plot([max_x, max_x], [min_y, max_y], linestyle='--', color='red')

    # Show 30 * 86400 x-units of context on either side of the region
    # (presumably 30 days on an epoch-seconds axis -- TODO confirm)
    context = 86400 * 30
    ax.set_xlim(min_x - context, max_x + context)
    # Only the first newline of the y label is replaced
    ax.set_ylabel(ax.get_ylabel().replace("\n", " ", 1))

    # Draw the legend.  The first two lines on the axes are labelled as the
    # short/long SMAs; the red boundary lines added afterwards carry no label
    # of their own.
    ax.get_lines()[0].set_label("$SMA_{short}$")
    ax.get_lines()[1].set_label("$SMA_{long}$")

    ax.legend(loc='lower right', bbox_to_anchor=(1.01, -0.04))

    # Clean up the x tick labeling
    abcutils.plot.fix_xticks_timeseries(ax,
                                        format="%b %d",
                                        criteria=lambda x: x.toordinal() % 14 == 0,
                                        rotation=0,
                                        ha='center')

    # Plot #2: scatter of the two metrics inside the region
    ax = axes[1]

    # Drop NaNs--they break scipy's Pearson correlation
    xval = region_df[plot_metric]
    yval = region_df[region['metric']]
    nan_filter = numpy.isnan(xval) | numpy.isnan(yval)
    xval = xval[~nan_filter]
    yval = yval[~nan_filter]
    fit = scipy.stats.linregress(xval, yval)

    ax.scatter(xval, yval, c='C0')
    ax.set_xlabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
    ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(region['metric'], region['metric']).replace(' (', '\n('))
    ax.grid()
    # Both axes are pinned to [0, 1]; pinning x also keeps the fit line below
    # from rescaling the axis
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_yticks([0, .25, .5, .75, 1.])
    # Evaluate the fitted line at the current x tick positions
    ax.plot(ax.get_xticks(),
            [fit.slope * xi + fit.intercept for xi in ax.get_xticks()],
            color='black',
            linestyle='--',
            linewidth=2)
    return axes

In [None]:
# Plot the (region, metric) row of results_df selected by DRAW_REGION_ILOC and
# save the figure; axes[0] is used only to reach the shared Figure object.
# Assumes the figs/ directory already exists.
axes = plot_region_correlation(example_df, results_df.iloc[DRAW_REGION_ILOC], plot_metric)
axes[0].get_figure().savefig("figs/mira-correlation-region.pdf", bbox_inches='tight')