In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime

import matplotlib
import matplotlib.pyplot
import numpy
import pandas
import scipy.stats

import abcutils
# Set the default font size used for text in every matplotlib figure below.
matplotlib.rcParams['font.size'] = 16

## Global Analysis Constants

## Load and Synthesize Data from CSV

In [None]:
# Load the dataset that every later cell reads, via the abcutils.sc18paper
# helper.  NOTE(review): presumably a pandas DataFrame (later cells call
# .describe() and .groupby() on it) -- confirm against abcutils.
filtered_df = abcutils.sc18paper.load_dataset()

In [None]:
# Display summary statistics, transposed so that each described column of the
# dataset appears as one row of the rendered table.
filtered_df.describe().transpose()

## Basic Statistics

In [None]:
# Enumerate the distinct test platforms and benchmark ids present in the
# dataset.  Both lists are sorted so the printed order, and the order of any
# later iteration over them, is deterministic.
TEST_PLATFORMS = sorted(filtered_df['_test_platform'].unique())
# print(...) with a single parenthesized argument behaves identically on
# Python 2 and Python 3.
print("Test Platforms:\n  " + "\n  ".join(TEST_PLATFORMS))

BENCHMARK_IDS = sorted(filtered_df['_benchmark_id'].unique())
print("\nBenchmarks:\n  " + "\n  ".join(BENCHMARK_IDS))

In [None]:
# Length of the study window, in (fractional) days.
total_days = (abcutils.sc18paper.DATE_END - abcutils.sc18paper.DATE_START).total_seconds() / 86400.0
print("Total days: %.1f" % total_days)

# Every row with a non-null value in this column is counted as one log.
total_logs = filtered_df['darshan_agg_perf_by_slowest_posix'].count()
print("Total darshan logs: %d" % total_logs)

# The expected volume treats one run per day for every (platform, benchmark)
# pair as the target; TEST_PLATFORMS and BENCHMARK_IDS must therefore still be
# the lists derived from the dataset when this cell runs.
expected_logs = total_days * len(TEST_PLATFORMS) * len(BENCHMARK_IDS)
percent_found = 100.0 * total_logs / expected_logs
print("Percent benchmarks run: %.1f%%" % (percent_found))
print("Percent benchmarks missing: %.1f%%" % (100 - percent_found))

# Average number of logged runs per day for each (platform, benchmark) pair;
# left as the final expression so the notebook renders it.
grouped_df = filtered_df.groupby(['_test_platform', '_benchmark_id'])
grouped_df['darshan_agg_perf_by_slowest_posix'].describe()['count'] / total_days

In [None]:
# Summarize the '_datetime_start' column within each group.  This relies on
# grouped_df still holding the (_test_platform, _benchmark_id) grouping
# assigned in the previous cell; later cells rebind grouped_df to a
# per-platform grouping, so re-running this cell after them changes its output.
grouped_df['_datetime_start'].describe()

## Boxplots

In [None]:
# Platforms to include in the figures below, in plotting order.  This rebinds
# the TEST_PLATFORMS list that the "Basic Statistics" section derived from the
# dataset, so earlier cells that read TEST_PLATFORMS see this shorter list if
# they are re-run after this cell.
TEST_PLATFORMS = [
    'scratch1@edison',
#   'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    'mira-fs1@mira'
]

In [None]:
# Presentation settings for the summary boxplot figure.  The top level holds
# figure-wide options; the nested entry is keyed by the plotted metric and
# carries one title placement per subplot row.
boxplot_settings = dict(fontsize=16)
boxplot_settings['darshan_normalized_perf_by_max'] = dict(
    output_file="perf-boxplots.pdf",
    ylabel="Fraction Peak Performance",
    title_pos=[
        dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=16),
        dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=16),
    ],
)

In [None]:
# Draw one boxplot panel per test platform on a two-row grid and save the
# result as a PDF.
NUM_ROWS = 2
# Floor division keeps NUM_COLS an integer on both Python 2 and Python 3;
# subplots() and array indexing reject floats.
NUM_COLS = len(TEST_PLATFORMS) // NUM_ROWS

assert NUM_ROWS * NUM_COLS == len(TEST_PLATFORMS), \
    "the number of test platforms must be a multiple of NUM_ROWS"

metric = 'darshan_normalized_perf_by_max'
metric_settings = boxplot_settings[metric]
output_path = 'figs/summary_boxplots.pdf'

grouped_df = filtered_df.groupby('_test_platform')

# squeeze=False guarantees a 2-D axes array even when the grid collapses to a
# single column, so axes[irow, icol] below is always valid.
fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True,
                                       squeeze=False)
fig.set_size_inches(4*NUM_COLS, 3*NUM_ROWS)

### Draw subplots that contain data
for index, fs in enumerate(TEST_PLATFORMS):
    # Fill the grid row by row.
    irow, icol = divmod(index, NUM_COLS)
    ax = axes[irow, icol]
    abcutils.plot.grouped_boxplot(grouped_df.get_group(fs),
                                  metric,
                                  ax=ax,
                                  fontsize=boxplot_settings['fontsize'])
    # Title placement is configured per row; the translucent white box keeps
    # the in-axes title readable on top of the plotted data.
    title = ax.set_title(fs, **(metric_settings['title_pos'][irow]))
    title.set_bbox({'color': 'white', 'alpha': 0.5})
#   ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

### Set global figure labels
fig.suptitle("")
# A single y label shared by the whole figure, centered along the left edge.
fig.text(0.02, 0.5,
         metric_settings['ylabel'],
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=boxplot_settings['fontsize'])
fig.subplots_adjust(hspace=0.05, wspace=0.05)

# savefig() does not create missing directories, so make sure the target
# directory exists before writing.
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.isdir(output_dir):
    os.makedirs(output_dir)
fig.savefig(output_path, bbox_inches='tight')

## Histogram of Coverage Factor

In [None]:
# Presentation settings for the coverage-factor histograms.  The nested entry
# carries one title placement per subplot row; in the visible notebook only
# 'title_pos' is read by the histogram cell.
histogram_settings = dict(fontsize=20)
histogram_settings['darshan_normalized_perf_by_max'] = dict(
    output_file="perf-boxplots.pdf",
    ylabel="Fraction of\nPeak Performance",
    title_pos=[
        dict(x=0.04, y=0.90, horizontalalignment='left', fontsize=14),
        dict(x=0.04, y=0.90, horizontalalignment='left', fontsize=14),
    ],
)

In [None]:
# Draw one coverage-factor histogram per test platform, side by side.
NUM_ROWS = 1
NUM_COLS = len(TEST_PLATFORMS)

grouped_df = filtered_df.groupby('_test_platform')

fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True,
                                       squeeze=False)
# Flatten the (1, NUM_COLS) grid so axes[index] also works when there is only
# a single platform (subplots() would otherwise return a bare Axes).
axes = axes.ravel()
fig.set_size_inches(20, 4)

# Histogram styling shared by every panel.
# NOTE(review): linspace(0, 1, 15) yields 14 bins, each 1/14 wide, while the
# bars are drawn 1/15 wide -- confirm the slight gap between bars is intended.
common_opts = {
    "width": 1.0/15.0,
    "bins": numpy.linspace(0.0, 1.0, 15),
    "alpha": 0.75,
    "linewidth": 3.0,
#   "zorder": 9,
}

# (column, legend label) pairs drawn in every panel.
coverage_series = [
    ('coverage_factor_bw', 'Coverage Factor (BW)'),
#   ('coverage_factor_nodehrs', 'Coverage Factor (NodeHrs)'),
]

title_positions = histogram_settings['darshan_normalized_perf_by_max']['title_pos']

### Draw subplots that contain data
for index, fs in enumerate(TEST_PLATFORMS):
    # Floor division keeps the row index an integer on Python 2 and Python 3.
    irow = index // NUM_COLS
    ax = axes[index]

    platform_df = grouped_df.get_group(fs)
    for column, label in coverage_series:
        # Missing values are dropped before binning.
        ax.hist(platform_df[column].dropna(), label=label, **common_opts)

    ax.set_xlabel("Coverage Factor", fontsize=16)
    ax.set_ylabel("Frequency", fontsize=16)
#   ax.legend(fontsize=12)
    ax.yaxis.grid()
#   ax.set_yscale("log")
#   ax.set_ylim([1, 1e4])
    ax.xaxis.set_tick_params(labelsize=14)
    ax.yaxis.set_tick_params(labelsize=14)
    # Only keep axis labels on the outer edge of the shared-axis grid.
    ax.label_outer()

    # In-axes title on a translucent white box so it stays readable over bars.
    title = ax.set_title(fs, **(title_positions[irow]))
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Set global figure labels
fig.suptitle("")
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Performance Evolution Over Time

In [None]:
# List the benchmark ids that can be passed to draw_boxplot_timeseries().
# print(...) with a single parenthesized argument behaves identically on
# Python 2 and Python 3.
print("Valid benchmark_id values:\n")
print("\n".join(filtered_df['_benchmark_id'].unique()))

In [None]:
def draw_boxplot_timeseries(df, date_start, date_end, benchmark_id, plot_metric,
                            test_platforms=None, xlabel="Week in 2017"):
    """Draw a stack of time-series boxplots, one panel per test platform.

    Each panel shows ``plot_metric`` for the rows of ``df`` whose
    ``_test_platform`` and ``_benchmark_id`` columns match the panel's platform
    and ``benchmark_id``.  Drawing is delegated to
    ``abcutils.plot.timeseries_boxplot`` with a seven-day ``date_delta``
    (presumably the width of each box's time bin -- confirm in abcutils).

    Parameters
    ----------
    df : DataFrame with ``_test_platform`` and ``_benchmark_id`` columns.
    date_start, date_end : passed through to
        ``abcutils.plot.timeseries_boxplot``.
    benchmark_id : value of ``_benchmark_id`` to select; also used for the
        y label, via ``abcutils.CONFIG['benchmark_labels']`` when a label is
        registered there.
    plot_metric : name of the metric to plot; also used for the y label, via
        ``abcutils.CONFIG['metric_labels']`` when a label is registered there.
    test_platforms : optional sequence of platform names, one panel each, in
        order.  Defaults to the notebook-level ``TEST_PLATFORMS``.
    xlabel : label placed under the bottom panel.

    Returns
    -------
    None.  The figure is created as a side effect.
    """
    if test_platforms is None:
        test_platforms = TEST_PLATFORMS

    num_rows = len(test_platforms)
    date_delta = datetime.timedelta(days=7)

    # squeeze=False always yields a 2-D array, so a single platform works too;
    # the grid has one column, which is taken as a flat sequence of panels.
    fig, axes = matplotlib.pyplot.subplots(nrows=num_rows,
                                           ncols=1,
                                           sharex=True,
                                           squeeze=False)
    axes = axes[:, 0]
    fig.set_size_inches(16, 2.5*num_rows)

    # Fall back to the raw identifiers when no display label is configured.
    ylabel = "%s\n(%s)" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))

    for ax, test_platform in zip(axes, test_platforms):
        df_filter = ((df['_test_platform'] == test_platform) &
                     (df['_benchmark_id'] == benchmark_id))

        ax.set_ylabel(ylabel)

        abcutils.plot.timeseries_boxplot(df[df_filter], plot_metric, date_start, date_end, date_delta, ax=ax)

        # In-axes title on a translucent white box so it stays readable over
        # the plotted data.
        title = ax.set_title(test_platform, x=0.01, y=0.04, horizontalalignment='left', fontsize=14)
        title.set_bbox({'color': 'white', 'alpha': 0.5})

    ### Set global figure labels
    # The x axis is shared, so only the bottom panel is labelled.
    axes[-1].set_xlabel(xlabel)
    fig.suptitle("")
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
#   output_file = "%s.png" % benchmark_id
#   fig.savefig(output_file, bbox_inches="tight")
#   print("Saved to %s" % output_file)

In [None]:
# Time window covered by the time-series figures.
date_start = datetime.datetime(2017, 2, 1)
date_end = datetime.datetime(2018, 2, 1)

plot_metric = 'darshan_normalized_perf_by_max'
# plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'

# One figure per benchmark found in the dataset.  BENCHMARK_IDS comes from the
# "Basic Statistics" section.  To plot a single benchmark instead, call
# draw_boxplot_timeseries() directly with, e.g., 'ior_shared_read'.
for benchmark_id in BENCHMARK_IDS:
    draw_boxplot_timeseries(filtered_df, date_start, date_end, benchmark_id, plot_metric)