# Segments and Centroids

This notebook creates a graphic to explain how _divergence regions_ and _trend regions_ are defined.

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension; its mode is selected in the next cell.
%load_ext autoreload

In [None]:
# Autoreload mode 2 re-imports modules before each cell is executed --
# presumably so that edits to local packages such as abcutils are picked up
# without restarting the kernel.
%autoreload 2

In [None]:
import os
import time
import datetime
import warnings

import matplotlib
import matplotlib.pyplot
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.  We are now caching the processed CSV in HDF5 format to speed up initial data ingest at the beginning of each analysis.  Delete the `CACHE_FILE` to re-generate this cache (e.g., when the contents of the CSV are updated).

In [None]:
# Load the dataset through the abcutils SC18-paper helper.  The resulting
# frame is grouped by `_test_platform` and `_benchmark_id` further down.
filtered_df = abcutils.sc18paper.load_dataset()

## Demonstrate a Single Test Platform

Look at one combination of (compute system, file system, benchmark) to show what this UMAMI analysis can do.

### Define Input Parameters

In [None]:
# Test platform to examine.  The commented-out assignments are alternative
# values that can be swapped in.
TEST_PLATFORM = 'scratch2@edison'
# TEST_PLATFORM = 'cscratch@cori-knl'
# TEST_PLATFORM = 'cscratch@cori-haswell'
# TEST_PLATFORM = 'mira-fs1@mira'

# Benchmark to examine; again, the alternatives are left commented out.
BENCHMARK_ID = 'ior_fpp_write'
# BENCHMARK_ID = 'ior_fpp_read'
# BENCHMARK_ID = 'dbscan_read_shared_read'
# BENCHMARK_ID = 'vpicio_uni_shared_write'
# BENCHMARK_ID = 'ior_shared_write'
# BENCHMARK_ID = 'ior_shared_read'
# BENCHMARK_ID = 'hacc_io_read_fpp_read'
# BENCHMARK_ID = 'hacc_io_write_fpp_write'

# x-axis limits applied to every plot below, as whole seconds since the epoch.
# time.mktime() interprets the struct_time as local time.  int() replaces the
# Python-2-only long(); on Python 2, int() still promotes to long on its own
# when the value does not fit in a machine integer.
XLIMS = (int(time.mktime(datetime.datetime(2017, 8, 15).timetuple())),
         int(time.mktime(datetime.datetime(2017, 12, 15).timetuple())))

# Column of the dataset that is plotted and from which the SMAs are computed.
plot_metric = 'darshan_normalized_perf_by_max'

# Keep only the rows belonging to the selected (platform, benchmark) pair.
# The copy() detaches the selection from filtered_df.
example_df = filtered_df.groupby(by=['_test_platform', '_benchmark_id']).get_group((TEST_PLATFORM, BENCHMARK_ID)).copy()

# Echo the configuration.  A parenthesized, single-argument print produces the
# same output under Python 2 and Python 3.
print("test_platform = %s" % TEST_PLATFORM)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(BENCHMARK_ID, BENCHMARK_ID))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % abcutils.sc18paper.DATE_START.isoformat())
print("date_end = %s" % abcutils.sc18paper.DATE_END.isoformat())

In [None]:
# Metrics to include in UMAMI renderings and analysis.  Anything that
# _might_ affect performance should be included here.
# Metrics to include in UMAMI renderings and analysis: list anything that
# _might_ affect performance.  Entries that are commented out are currently
# disabled but kept so they can be switched back on.
umami_rows = [
    # --- performance ---
    'darshan_normalized_perf_by_max',

    # --- coverage factors ---
    'coverage_factor_bw',
    # 'coverage_factor_nodehrs',
    'coverage_factor_opens',
    'coverage_factor_stats',
    'coverage_factor_ops',

    # --- fs_* metrics ---
    'fs_ave_mds_cpu',
    # 'fs_tot_metadata_ops',
    'fs_ave_oss_cpu',
    # 'fs_tot_open_ops',

    # --- fshealth_* metrics ---
    'fshealth_ost_most_full_pct',
    'fshealth_ost_overloaded_oss_count',

    # --- job-level metrics ---
    # 'jobsdb_concurrent_nodes',
    'topology_job_max_radius',
]

## Region-defined Correlation

In [None]:
# Width of simple moving average (SMA) short/long windows
# Widths of the short and long simple moving average (SMA) windows.
SHORT_WINDOW = pandas.Timedelta(days=14)
LONG_WINDOW = pandas.Timedelta(days=49)

# A parenthesized, single-argument print produces the same output under
# Python 2 and Python 3 (the bare print statement is Python-2-only).
print("Short window will average over %s at a time" % SHORT_WINDOW)
print("Long window will average over %s at a time" % LONG_WINDOW)

## Calculate intercepts and centroids from SMAs

* **Intercepts** are the points where the short and long SMAs cross each other
* **Divergence regions** are the spans of data bounded by two intercepts
* **Centroids** are the centermost data points, one in each divergence region

With this nomenclature, it is also possible to define **trend regions**, which are bounded by two centroids.  These regions capture the transition between two divergence regions.

In [None]:
# Both moving averages are computed from the same frame, x column and metric;
# only the averaging window differs between the two calls.
_sma_args = (example_df, '_datetime_start', plot_metric)

sma_short = abcutils.features.calculate_sma(*_sma_args, window=SHORT_WINDOW)
sma_long = abcutils.features.calculate_sma(*_sma_args, window=LONG_WINDOW)

In [None]:
# Find where the short and long SMAs cross, passing the job start times as
# the third argument.  The bare name on the last line echoes the result as
# the cell output.
sma_intercepts = abcutils.features.find_sma_intercepts(
    sma_short,
    sma_long,
    example_df['_datetime_start'])
sma_intercepts

In [None]:
# Derive the centroids from the data, both SMAs and their intercepts.
# NOTE(review): min_width=None presumably turns off any minimum-width
# filtering of regions -- confirm against abcutils.features.
sma_centroids = abcutils.features.find_sma_centroids(
    example_df,
    sma_short,
    sma_long,
    sma_intercepts,
    x_column='_datetime_start',
    min_width=None)

### Visualize intercepts and centroids
Plot the SMAs, divergence regions, and centroids as a visual sanity check.

In [None]:
# Parenthesized single-argument print: same output on Python 2 and Python 3.
print("First print divergence regions")

# Draw the metric and its SMAs with regions delimited by the SMA intercepts.
# regioncolors supplies the two fill colours: black at low alpha (0x1A) and
# fully transparent white.
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_intercepts,
                                regioncolors=["#0000001A", "#FFFFFF00"],
                                method='value')

# Re-apply the current x limits explicitly so they are fixed before the
# marker lines are added.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)

# Mark every SMA intercept with a thin dashed black vertical line that spans
# the y range the axes had before the markers were drawn.
ymin, ymax = ax.get_ylim()
for intercept_time in sma_intercepts['_datetime_start']:
    # Convert once and reuse for both endpoints of the vertical segment.
    intercept_epoch = abcutils.core.pd2epoch(intercept_time)
    ax.plot([intercept_epoch, intercept_epoch], [ymin, ymax], color='black', linestyle='--', linewidth=1)

ax.grid(False)

# Final framing: fixed y range, the common date window, and figure size.
ax.set_ylim(0, 1)
ax.set_xlim(XLIMS)
ax.get_figure().set_size_inches(8, 3)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")

In [None]:
# Describe what the figure shows.  The text matches the drawing code below:
# intercepts are dashed black lines, centroids are dashed red lines, and the
# raw-data bars are hidden.  Parenthesized single-argument print gives the
# same output on Python 2 and Python 3.
print("""
Plot the following:
(1) raw data (blue bars) -- hidden in this rendering
(2) sma_short (thick line #1)
(3) sma_long (thick line #2)
(4) intercept regions (shaded grey boxes) bounded by intercepts (dashed black lines)
(5) centroids (dashed red lines) -- these should bisect grey boxes
""")
ax = abcutils.plot.sma_overlaps(dataframe=example_df,
                                plot_metric=plot_metric,
                                short_window=SHORT_WINDOW,
                                long_window=LONG_WINDOW,
                                sma_intercepts=sma_intercepts,
                                method='value')

# Re-apply the current x limits explicitly so they are fixed before the
# marker lines are added.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)

# Turn off raw data: hide every patch drawn with alpha 0.5.
for patch in ax.patches:
    if patch.get_alpha() == 0.5:
        patch.set_visible(False)

# Add boundaries: one thin dashed black line per SMA intercept.
ymin, ymax = ax.get_ylim()
for intercept_time in sma_intercepts['_datetime_start']:
    # Convert once and reuse for both endpoints of the vertical segment.
    intercept_epoch = abcutils.core.pd2epoch(intercept_time)
    ax.plot([intercept_epoch, intercept_epoch], [ymin, ymax], color='black', linestyle='--', linewidth=1)

# Add centroids: one dashed red line per centroid.  The y limits are read
# again because drawing the boundary lines may have changed them.
ymin, ymax = ax.get_ylim()
for centroid_time in sma_centroids['_datetime_start']:
    centroid_epoch = abcutils.core.pd2epoch(centroid_time)
    ax.plot([centroid_epoch, centroid_epoch], [ymin, ymax], color='red', linestyle='--')

# Final framing: fixed y range, the common date window, and figure size.
ax.set_ylim(0, 1)
ax.set_xlim(XLIMS)
ax.get_figure().set_size_inches(8, 3)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")

### Filter centroids

We only care about centroids that show a significant swing in performance.

In [None]:
# Same rendering as before, but the regions are delimited by the centroids
# instead of the SMA intercepts.
ax = abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_centroids,
    method='value')

# Fix the x limits at their current values, and the y limits at (0, 1).
# set_ylim() returns the new limits, which the marker lines span.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ymin, ymax = ax.set_ylim(0, 1)

# One dashed red vertical line per centroid.
for centroid_time in sma_centroids['_datetime_start']:
    centroid_epoch = abcutils.core.pd2epoch(centroid_time)
    ax.plot([centroid_epoch] * 2, [ymin, ymax], color='red', linestyle='--')

# Turn off raw data: hide every patch drawn with alpha 0.5.
raw_patches = [patch for patch in ax.patches if patch.get_alpha() == 0.5]
for patch in raw_patches:
    patch.set_visible(False)

# Final framing: fixed y range, the common date window, and figure size.
ax.set_ylim(0, 1)
ax.set_xlim(XLIMS)
ax.get_figure().set_size_inches(8, 3)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")

## Plot for paper

In [None]:
# Keyword arguments forwarded to every panel's ax.set_title() call in the
# paper figure.  NOTE(review): x/y look like axes-fraction coordinates that
# put the title in the lower-left corner of each panel -- confirm.
titleopts = dict(
    x=0.02,
    y=0.05,
    backgroundcolor="#FFFFFFAA",
    ha="left",
    fontsize=16,
)

In [None]:
# Three vertically stacked panels that share a single x axis.
fig, axes = matplotlib.pyplot.subplots(nrows=3, ncols=1, figsize=(8, 6), sharex=True)

################################################################################
# Plot #1 -- SMAs with regions delimited by the SMA intercepts
ax = axes[0]
abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_intercepts,
    regioncolors=["#FFFFFF00", "#0000001A"],
    ax=ax,
    method='value')

# Fix the x limits at their current values before adding marker lines.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)

# One dashed black vertical line per SMA intercept, spanning the y range the
# axes had before the markers were drawn.
ymin, ymax = ax.get_ylim()
for intercept_time in sma_intercepts['_datetime_start']:
    intercept_epoch = abcutils.core.pd2epoch(intercept_time)
    ax.plot([intercept_epoch] * 2, [ymin, ymax], color='black', linestyle='--')

# Turn off raw data: hide every patch drawn with alpha 0.5.
raw_patches = [patch for patch in ax.patches if patch.get_alpha() == 0.5]
for patch in raw_patches:
    patch.set_visible(False)

ax.grid(False)

ax.set_ylim(0, 1)
ax.set_xlim(XLIMS)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")

# Format the y tick values with one decimal place.  `yticklabels` is reused by
# the lower panels, so the name must stay as it is.
yticklabels = ["%.1f" % tick_value for tick_value in ax.get_yticks()]
ax.set_ylabel("")
ax.set_yticklabels(yticklabels)
ax.set_title("(a) Divergence Regions", **titleopts)



################################################################################
# Plot #2 -- no SMA windows are passed (short_window/long_window are None);
# intercepts and centroids are both marked
ax = axes[1]

abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=None,
    long_window=None,
    sma_intercepts=sma_intercepts,
    regioncolors=["#FFFFFF00", "#0000001A"],
    ax=ax,
    method='value')

# Fix the x limits at their current values before adding marker lines.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ax.grid(False)

# Boundaries first (black), then centroids (red).  The y limits are read
# again before each pass, since the first pass may have changed them.
for marker_frame, marker_color in ((sma_intercepts, 'black'), (sma_centroids, 'red')):
    ymin, ymax = ax.get_ylim()
    for marker_time in marker_frame['_datetime_start']:
        marker_epoch = abcutils.core.pd2epoch(marker_time)
        ax.plot([marker_epoch] * 2, [ymin, ymax], color=marker_color, linestyle='--')

ax.set_ylim(0, 1)
ax.set_xlim(XLIMS)
# Reuse the first panel's tick labels minus the last one -- presumably so the
# top label does not collide with the panel above (hspace is set to 0 later).
ax.set_yticklabels(yticklabels[0:-1])
ax.set_ylabel("")
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")
ax.set_title("(b)", **titleopts)




################################################################################
# plot #3 -- SMAs with regions delimited by the centroids
ax = axes[2]
abcutils.plot.sma_overlaps(
    dataframe=example_df,
    plot_metric=plot_metric,
    short_window=SHORT_WINDOW,
    long_window=LONG_WINDOW,
    sma_intercepts=sma_centroids,
    regioncolors=["#0000001A", "#FFFFFF00"],
    ax=ax,
    method='value')

# Fix the x limits at their current values, and the y limits at (0, 1).
# set_ylim() returns the new limits, which the marker lines span.
xmin, xmax = ax.get_xlim()
ax.set_xlim(xmin, xmax)
ymin, ymax = ax.set_ylim(0, 1)

# One dashed red vertical line per centroid.
for centroid_time in sma_centroids['_datetime_start']:
    centroid_epoch = abcutils.core.pd2epoch(centroid_time)
    ax.plot([centroid_epoch] * 2, [ymin, ymax], color='red', linestyle='--')

# Turn off raw data: hide every patch drawn with alpha 0.5.
raw_patches = [patch for patch in ax.patches if patch.get_alpha() == 0.5]
for patch in raw_patches:
    patch.set_visible(False)

ax.grid(False)
ax.set_ylim(0, 1)
# Reuse the first panel's tick labels minus the last one.
ax.set_yticklabels(yticklabels[0:-1])
ax.set_xlim(XLIMS)
ax.set_ylabel("")
ax.set_title("(c) Trend Regions", **titleopts)

# Label the first two lines on the axes for the legend.
# NOTE(review): this assumes sma_overlaps() draws the short SMA first and the
# long SMA second -- confirm against abcutils.plot.
panel_lines = ax.get_lines()
panel_lines[0].set_label("$SMA_{%d}$" % SHORT_WINDOW.days)
panel_lines[1].set_label("$SMA_{%d}$" % LONG_WINDOW.days)
ax.legend(
    loc='lower right',
    bbox_to_anchor=(1.01, -0.04),
    ncol=1)


# One y-axis label for the whole figure, centred vertically at the left edge
# (the per-panel y labels are blanked above).
fig.text(0.04, 0.5,
         "Fraction Peak Performance",
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=16)


# Remove the vertical gap between the stacked panels.
fig.subplots_adjust(hspace=0.0, wspace=0.05)
abcutils.plot.fix_xticks_timeseries(ax, format="%b %d\n%Y", rotation=0, ha="center")

# Drop the zero padding that %d puts on single-digit days ("Sep 01" ->
# "Sep 1").  Only a zero directly after the space following the month is
# removed.  A plain replace('0', '', 1) would strip the first zero anywhere in
# the label and corrupt e.g. "Oct 10\n2017" (-> "Oct 1\n2017") or
# "Sep 15\n2017" (-> "Sep 15\n217").
fixed_labels = [xticklabel.get_text().replace(' 0', ' ', 1)
                for xticklabel in ax.get_xticklabels()]
ax.set_xticklabels(fixed_labels)

# Make sure the output directory exists so the save does not fail on a fresh
# checkout.  The isdir() guard keeps this compatible with Python 2, whose
# os.makedirs() has no exist_ok argument.
if not os.path.isdir('figs'):
    os.makedirs('figs')
fig.savefig('figs/segment-explain.pdf', bbox_inches='tight')