In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import datetime
import time

import matplotlib
import matplotlib.pyplot
import matplotlib.ticker
import numpy
import pandas
import scipy.stats

import abcutils

matplotlib.rcParams.update({'font.size': 16})

In [None]:
# Identifiers that are matched against the `_test_platform` column when
# correlations are computed one platform at a time.
TEST_PLATFORMS = (
    ['scratch1@edison', 'scratch2@edison', 'scratch3@edison']  # Edison
    + ['cscratch@cori-knl', 'cscratch@cori-haswell']           # Cori
    + ['mira-fs1@mira']                                        # Mira
)

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# Load one synthesized dataframe per system, then stack them row-wise
per_system_frames = []
for csv_path, system_name in [
        ('summaries/edison-summaries_2017-02-14-2017-12-30.csv', 'edison'),
        ('summaries/cori-summaries_2017-02-14-2017-12-31.csv', 'cori'),
        ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira')]:
    per_system_frames.append(abcutils.load_and_synthesize_csv(csv_path, system=system_name))

df = pandas.concat(per_system_frames, axis='rows')

# Give every row a unique 0..N-1 label so that no index value is repeated
# across the frames that were just concatenated
df.index = pandas.Index(data=numpy.arange(len(df)), dtype='int64')

## Selecting important metrics

While we can go at this correlation analysis in an unsupervised way, in practice, there are just too many metrics to walk through manually, and the level of redundancy across many metrics makes them unhelpful for statistical analysis anyway.  Although we are definitely biasing our results by choosing only a few metrics of interest, we can justify this by bootstrapping off of the findings of the PDSW'17 paper and say that we previously showed which metrics are most interesting.

In [None]:
# for x in sorted(df.columns):
#     print x

In [None]:
# Columns kept for the correlation analysis, grouped by column-name prefix.
# The order of the entries is the order in which columns are selected.
INTERESTING_METRICS = (
    # '_'-prefixed columns
    ['_benchmark_id', '_datetime_end', '_datetime_start', '_file_system',
     '_subsystem', '_system', '_test_platform']
    # coverage_factor_*
    + ['coverage_factor_bw', 'coverage_factor_read_bw', 'coverage_factor_write_bw']
    # darshan_*
    + ['darshan_agg_perf_by_slowest_posix_gibs', 'darshan_fpp_job?', 'darshan_jobid',
       'darshan_normalized_perf_by_max', 'darshan_write_job?']
    # fs_*
    + ['fs_ave_mds_cpu', 'fs_ave_oss_cpu', 'fs_frac_missing',
       'fs_max_mds_cpu', 'fs_max_oss_cpu',
       'fs_max_gibs_read_per_sec', 'fs_max_gibs_written_per_sec',
       'fs_tot_openclose_ops', 'fs_tot_metadata_ops', 'fs_tot_readdir_ops',
       'fs_tot_unlink_ops', 'fs_tot_getattr_ops']
    # fshealth_*
    + ['fshealth_ost_avg_full_pct', 'fshealth_ost_most_full_pct', 'fshealth_ost_overloaded_pct']
    # jobsdb_*
    + ['jobsdb_concurrent_jobs']
    # topology_*
    + ['topology_job_avg_radius', 'topology_job_max_radius']
)

# Experimenting with Feature Detection

In [None]:
# Parameters for the single (test platform, benchmark) pair explored in this section
test_platform = 'mira-fs1@mira'
benchmark_id = 'ior_fpp_write'
plot_metric = 'darshan_normalized_perf_by_max'
date_start = datetime.datetime(2017, 2, 14)
date_end = date_start + datetime.timedelta(days=365)
min_streak = 3 # in days by default
group_by = ['_test_platform', '_benchmark_id']

# Keep only this pair's measurements that started within [date_start, date_end)
filtered_df = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
filtered_df = filtered_df[filtered_df['_datetime_start'] < date_end]
filtered_df = filtered_df[filtered_df['_datetime_start'] >= date_start]

# Single-argument print(...) produces the same output on Python 2 and Python 3
print("test_platform = %s" % test_platform)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id))
# Look the label up by plot_metric so the report follows the parameter above
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % date_start.isoformat())
print("date_end = %s" % date_end.isoformat())
print("Minimum days to count as a streak = %s" % min_streak)

First we plot the raw data for the (`test_platform`, `benchmark_id`) of interest:

In [None]:
fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, sharex=True)
fig.set_size_inches(20,8)

# One label shared by both panels; follows plot_metric rather than a hard-coded column
metric_label = abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric)

# x values are seconds since the epoch, as produced by time.mktime
x_raw = filtered_df['_datetime_start'].apply(lambda x: time.mktime(x.timetuple()))
y_raw = filtered_df[plot_metric]

# Top panel: the raw measurements
ax = axes[0]
ax.grid()
ax.set_ylim(0, 1)
ax.plot(x_raw,
        y_raw,
        linestyle='-',
        marker='.')
ax.set_ylabel(metric_label)
# Format tick values as dates at draw time.  set_xticklabels() would freeze the
# label text for the ticks that exist right now, and those labels go stale as
# soon as the shared x axis is rescaled by the plot drawn below.
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda x, pos: datetime.datetime.fromtimestamp(x).strftime("%Y-%m-%d")))
ax.set_title("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))

# Bottom panel: abcutils' time-series boxplot of the same metric
abcutils.plot.timeseries_boxplot(filtered_df, plot_metric, date_start, date_end, ax=axes[1])
axes[1].set_ylabel(metric_label)

# Trailing no-op keeps the cell from echoing the last expression's value
pass

## Identifying Streaks

Now we find cases where performance monotonically increased or decreased over the course of several consecutive days.  In the following graph, streaks of **decreasing** performance are red and streaks of **increasing** performance are green.  Gaps in these red/green lines are regions where data was not streaking up or down.

In [None]:
fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(20,4)

# Draw the boxplot first, then overlay the detected streaks on the same axes
ax = abcutils.plot.timeseries_boxplot(filtered_df, plot_metric, date_start, date_end, ax=ax)
streaks = abcutils.features.find_streaks_df(filtered_df, plot_metric, min_streak=min_streak)
abcutils.plot.timeseries_streaks(filtered_df, streaks, ax)

# Human-readable labels, falling back to the raw identifiers when unmapped
benchmark_label = abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id)
metric_label = abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric)

xlabel = "Week"
ylabel = "%s\n(%s)" % (benchmark_label, metric_label)
ax.set_title("%s on %s" % (benchmark_label, test_platform))
ax.set_ylim(0,1)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)

# Trailing no-op keeps the cell from echoing the last expression's value
pass

## Sliding window slopes

We can also calculate the slope of all data points within a fixed-width window that slides by some smaller delta.  Filtering data points that fall within windows that demonstrate a dramatic slope may be another way to identify periods of time that are interesting.

In [None]:
window_width = datetime.timedelta(days=7)
window_slide = datetime.timedelta(days=1)
# Single-argument print(...) produces the same output on Python 2 and Python 3
print("Window width is %s" % window_width)
print("Window slides by %s" % window_slide)

fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(20,4)
ax.set_ylim(0, 1)
ax.grid(True)

# Use the window parameters announced above instead of re-hardcoding them
streaks = abcutils.features.sliding_window_slopes(filtered_df,
                                                  plot_metric,
                                                  date_start,
                                                  date_end,
                                                  width=window_width,
                                                  delta=window_slide)
# Green when a line ends higher than it starts, red otherwise.
# NOTE(review): assumes each element is an (x values, y values) pair -- confirm in abcutils
abcutils.plot.timeseries_manylines(streaks, colorfunc=lambda x: 'green' if x[1][-1] > x[1][0] else 'red', ax=ax, linewidth=2)

# One tick per window width, labelled with the date at that tick
date = date_start
xticks = []
xticklabels = []
while date + window_width < date_end:
    xticks.append(time.mktime(date.timetuple()))
    # date.strftime, not time.strftime: the latter formats the current time,
    # which labelled every tick with today's date
    xticklabels.append(date.strftime("%Y-%m-%d"))
    date += window_width
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels, rotation=90)
ax.set_title("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))
# Trailing no-op keeps the cell from echoing the last expression's value
pass

## Summary of Feature Detection Methods

Now plot all of these feature detection approaches in a single pane to visually identify periods of time when data is consistently anomalous.

In [None]:
def plot_feature_detections(dataframe, test_platform, benchmark_id, plot_metric, min_streak=5):
    """Plot every feature-detection view of one (test platform, benchmark) pair.

    Draws four vertically stacked panels that share an x axis: the raw
    measurements, abcutils' time-series boxplot, the monotonic streaks, and
    the sliding-window slopes.

    Relies on the notebook-level variables `group_by`, `date_start`, and
    `date_end`, which must be defined before this function is called.

    Args:
        dataframe: measurements to plot; grouped by the `group_by` columns
            and read through its `_datetime_start` and `plot_metric` columns.
        test_platform: first element of the group key to select.
        benchmark_id: second element of the group key to select.
        plot_metric: name of the column plotted in every panel.
        min_streak: passed to abcutils.features.find_streaks_df as `min_streak`.

    Returns:
        The array of four axes, or None when `dataframe` has no group for
        (test_platform, benchmark_id).
    """
    try:
        filtered_dataframe = dataframe.groupby(by=group_by).get_group((test_platform, benchmark_id))
    except KeyError:
        return None

    NROWS = 4
    fig, axes = matplotlib.pyplot.subplots(nrows=NROWS, ncols=1, sharex=True)
    fig.set_size_inches(20,4 * NROWS)
    xlabel = "Week"
    ylabel = "%s\n(%s)" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))

    # Row #1: raw values of the requested metric (previously hard-coded to
    # 'darshan_normalized_perf_by_max' regardless of plot_metric)
    ax = axes[0]
    ax.plot(filtered_dataframe['_datetime_start'].apply(lambda x: time.mktime(x.timetuple())),
            filtered_dataframe[plot_metric],
            linestyle='-',
            marker='.')
    ax.set_title("Raw Data")

    # Row #2
    ax = axes[1]
    abcutils.plot.timeseries_boxplot(filtered_dataframe, plot_metric, date_start, date_end, ax=ax)
    ax.set_title("Distribution over time")

    # Row #3
    ax = axes[2]
    streaks = abcutils.features.find_streaks_df(filtered_dataframe, plot_metric, min_streak=min_streak)
    abcutils.plot.timeseries_streaks(filtered_dataframe, streaks, ax=ax)
    ax.set_title("Monotonic streaks (%d or more consecutive days)" % min_streak)

    # Row #4
    ax = axes[3]
    width = datetime.timedelta(days=7)
    delta = datetime.timedelta(days=1)
    slopes = abcutils.features.sliding_window_slopes(filtered_dataframe,
                                                     plot_metric,
                                                     date_start,
                                                     date_end,
                                                     width=width,
                                                     delta=delta)
    # Green when a line ends higher than it starts, red otherwise
    abcutils.plot.timeseries_manylines(slopes,
                                       colorfunc=lambda x: 'green' if x[1][-1] > x[1][0] else 'red',
                                       ax=ax,
                                       linewidth=2)
    ax.set_title("Sliding-window slope (%d-day windows sliding %d days at a time)" % (width.days, delta.days))

    # Set ticks and labels: one tick per window width.  Uses the local `width`
    # so the function no longer needs the `window_width` variable that only
    # exists if an earlier, unrelated cell has been run.
    date = date_start
    xticks = []
    xticklabels = []
    while date + width < date_end:
        xticks.append(time.mktime(date.timetuple()))
        xticklabels.append(date.strftime("%Y-%m-%d"))
        date += width
    ax.set_xlabel(xlabel)

    # Common settings on all axes
    for ax in axes:
        ax.set_ylim(0, 1.2)
        ax.xaxis.grid(True)
        ax.yaxis.grid(True)
        ax.set_ylabel(ylabel)
        ax.set_xticks(xticks)
        ax.set_xticklabels(xticklabels, rotation=90)
        # Re-set each panel's title with explicit placement at the lower left
        ax.set_title(ax.get_title(), **{'x': 0.01, 'y': 0.02, 'horizontalalignment': 'left'})

    fig.suptitle("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                               test_platform))
    # Leave the top 2% of the figure free for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.98])

    return axes

In [None]:
# Global plot parameters (plot_feature_detections reads group_by, date_start
# and date_end from the notebook namespace)
plot_metric = 'darshan_normalized_perf_by_max'
date_start = datetime.datetime(2017, 2, 14)
date_end = date_start + datetime.timedelta(days=365)
min_streak = 5 # in days by default
group_by = ['_test_platform', '_benchmark_id']

# Determine which plots to generate
test_platforms = sorted(df['_test_platform'].unique())
# benchmark_ids = sorted(df['_benchmark_id'].unique())
benchmark_ids = ['ior_shared_write']

# Generate plots
for test_platform in test_platforms:
    for benchmark_id in benchmark_ids:
        axes = plot_feature_detections(df, test_platform, benchmark_id, plot_metric, min_streak)
        if axes is None:
            # plot_feature_detections returns None when there is no data for
            # this pair; indexing that result used to raise a TypeError
            print("No %s measurements on %s; skipping" % (benchmark_id, test_platform))

## Correlating during streaks

Now that we have a few ways to classify benchmark data points, we can look for interesting correlations among the different classifications.  We first apply a binary classification where each benchmark measurement is a member of a streak or it is not.  Then we correlate with performance in the case where performance **was** on a streak and when it **was not** on a streak (it was bouncing up and down instead).

In the clustered bar graphs, black and yellow just visually separate different clusters.  Red, green, and blue outlines indicate the confidence of the correlation coefficient plotted; red is low confidence, green is medium, and blue is high.

In [None]:
# Boolean mask with one entry per row of df: True where the measurement is
# part of a streak.  Built as an array so `streak_indices == True` works below.
streak_indices = numpy.zeros(len(df.index), dtype=bool)

grouped_df = df.groupby(by=['_test_platform', '_benchmark_id'])
for (test_platform, benchmark_id), df_group in grouped_df:
    streaks = abcutils.features.find_streaks_df(df_group, 'darshan_normalized_perf_by_max', min_streak=5)
    # The first element of each streak holds the row labels belonging to it;
    # they are used as positions, so df's index must be 0..len(df)-1
    for streak in streaks:
        for index in streak[0]:
            streak_indices[index] = True

In [None]:
group_by = ['_test_platform', '_benchmark_id']

# Both dicts are keyed by "was the measurement part of a streak?"
correlation_dfs = dict.fromkeys([True, False])
correlation_dfs_labels = {
    False: "not during streaks",
    True: "during streaks",
}

for fs in TEST_PLATFORMS:
    on_this_platform = (df['_test_platform'] == fs)
    for streaking in sorted(correlation_dfs.keys()):
        # Rows on this platform whose streak membership matches `streaking`
        df_filter = on_this_platform & (streak_indices == streaking)
        df_to_correlate = df[df_filter][INTERESTING_METRICS]
        if not len(df_to_correlate):
            continue

        # Correlate every selected metric against normalized performance
        correlation = abcutils.correlation.calc_correlation_vector(
            df_to_correlate,
            correlate_with='darshan_normalized_perf_by_max')

        # Prefix each column with the file system name so that columns stay
        # distinguishable once all platforms sit side by side
        new_cols = {col_name: "%s %s" % (fs, col_name) for col_name in correlation.columns}
        correlation = correlation.rename(columns=new_cols)

        # Append this vector's columns to whatever has been collected so far
        collected = correlation_dfs[streaking]
        if collected is None:
            correlation_dfs[streaking] = correlation
        else:
            correlation_dfs[streaking] = pandas.concat([collected, correlation], axis='columns')

In [None]:
#for streaking in sorted(correlation_dfs.keys()):
#    correlations = correlation_dfs[streaking]
#    ax = abcutils.plot.correlation_vector_table(
#        correlations,
#        row_name_map=abcutils.CONFIG['metric_labels'])
#
#    # Set the table width larger if displaying lots of metrics
#    ax.get_figure().set_size_inches(20, 0.4 * len(correlations))
#    ax.set_title("Streaking" if streaking else "Not Streaking")

In [None]:
# Single-argument print(...) produces the same output on Python 2 and Python 3
print("""Each cluster of bars represents non-overlapping subsets of the benchmark data.
The bars are printed in the following order:
""")
print("\n".join([correlation_dfs_labels.get(x) for x in sorted(correlation_dfs_labels.keys())]))

# Take the list of metrics from any populated correlation table.  An entry is
# still None when no data matched its key, and dict.itervalues() does not
# exist on Python 3.
plot_metrics = next(table for table in correlation_dfs.values() if table is not None).index
for test_platform in sorted(df['_test_platform'].unique()):
    ax = abcutils.plot.clustered_correlation_bars(correlation_dfs, plot_metrics, test_platform)
    ax.set_title(test_platform)

## Correlating only the top and bottom quartiles

Instead of correlating against areas where performance increased or decreased over a long period of time, we divide the benchmark measurements into three groups:

1. the bottom 25% worst measurements
2. the middle 50% of measurements (all measurements between the first and third quartile)
3. the top 25% best measurements

Then we do correlation analysis within each group to see if really bad performance shows correlation with metrics in a way that is not captured for the majority of jobs that had so-so performance.

Unlike the streak-based correlation, we lose all temporal information by classifying data this way.  However, if the correlations are more compelling, it may suggest that long-term (temporally correlated) performance maladies are not the primary cause of extreme performance variation.

In [None]:
group_by = ['_test_platform', '_benchmark_id']
perf_metric = 'darshan_normalized_perf_by_max'
# Summary statistics (including the 25% and 75% quantiles) of every column for
# each (test platform, benchmark) pair
metric_distributions = df.groupby(by=group_by).describe()

correlation_dfs = {
    "25%": None,
    "50%": None,
    "75%": None
}
correlation_dfs_labels = {
    "25%": "bottom quartile of runs",
    "50%": "middle 50% of runs",
    "75%": "top quartile of runs"
}

for quartile in sorted(correlation_dfs.keys()):
    for fs in TEST_PLATFORMS:
        platform_df = df[df['_test_platform'] == fs]
        if len(platform_df) == 0:
            continue
        df_to_correlate = platform_df[INTERESTING_METRICS]

        # Compare every measurement against the quartile boundaries of its OWN
        # benchmark on this platform.  The previous code used whatever value
        # `benchmark_id` happened to hold from an earlier cell for all rows.
        platform_quartiles = metric_distributions.loc[fs][perf_metric]
        cutoff_low = df_to_correlate['_benchmark_id'].map(platform_quartiles['25%'])
        cutoff_high = df_to_correlate['_benchmark_id'].map(platform_quartiles['75%'])
        performance = df_to_correlate[perf_metric]

        if quartile == '25%':
            # bottom quartile: strictly below the first quartile
            cutoff_filter = (performance < cutoff_low)
        elif quartile == '75%':
            # top quartile: strictly above the third quartile (was `<`)
            cutoff_filter = (performance > cutoff_high)
        else:
            # '50%' is actually the IQR: between the two boundaries, inclusive,
            # so each measurement lands in exactly one of the three groups
            # (was `> low and > high`, which selected the top quartile again)
            cutoff_filter = (performance >= cutoff_low) & (performance <= cutoff_high)

        df_in_group = df_to_correlate[cutoff_filter]
        if len(df_in_group) == 0:
            continue

        # Calculate the correlation data for this vector
        correlation = abcutils.correlation.calc_correlation_vector(
            df_in_group,
            correlate_with=perf_metric)

        # rename the columns in this vector to include the file system name
        new_cols = {}
        for col_name in correlation.columns:
            new_cols[col_name] = "%s %s" % (fs, col_name)
        correlation.rename(columns=new_cols, inplace=True)

        # join the vector to the previous vectors' dataframe (this used to
        # test the undefined name `correlations`, raising a NameError)
        if correlation_dfs[quartile] is None:
            correlation_dfs[quartile] = correlation
        else:
            correlation_dfs[quartile] = pandas.concat([correlation_dfs[quartile], correlation], axis='columns')

In [None]:
#for quartile in sorted(correlation_dfs.keys()):
#    correlations = correlation_dfs[quartile]
#    ax = abcutils.plot.correlation_vector_table(
#        correlations,
#        row_name_map=abcutils.CONFIG['metric_labels'])
#
#    # Set the table width larger if displaying lots of metrics
#    ax.get_figure().set_size_inches(20, 0.4 * len(correlations))
#    ax.set_title(quartile)

In [None]:
# Single-argument print(...) produces the same output on Python 2 and Python 3
print("""Each cluster of bars represents non-overlapping subsets of the benchmark data.
The bars are printed in the following order:
""")
print("\n".join([correlation_dfs_labels.get(x) for x in sorted(correlation_dfs_labels.keys())]))

# Take the list of metrics from any populated correlation table.  An entry is
# still None when no data matched its key, and dict.itervalues() does not
# exist on Python 3.
plot_metrics = next(table for table in correlation_dfs.values() if table is not None).index
for test_platform in sorted(df['_test_platform'].unique()):
    ax = abcutils.plot.clustered_correlation_bars(correlation_dfs, plot_metrics, test_platform)
    ax.set_title(test_platform)