In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import matplotlib
import time
import datetime
import pandas
import numpy
import scipy.stats
import abcutils

## Global Analysis Constants

In [None]:
# Test platforms examined throughout this notebook.  Each entry is matched
# against the `_test_platform` column of the loaded data, and the order given
# here is the order used by the per-platform loops further down.
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    'cscratch@cori-haswell',
    'mira-fs1@mira',
]

## Load and Synthesize Data from CSV

In [None]:
# Load one summary CSV per system through abcutils and stack the resulting
# frames row-wise into the single DataFrame used by every later cell.
df = pandas.concat(
    [
        abcutils.load_and_synthesize_csv(csv_path, system=system_name)
        for csv_path, system_name in (
            ('summaries/edison-summaries_2017-02-14-2017-12-30.csv', 'edison'),
            ('summaries/cori-summaries_2017-02-14-2017-12-31.csv', 'cori'),
            ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira'),
        )
    ],
    axis='rows')

## Correlation Table

First show the most compelling correlations across all data.  This will be messy because it includes all file systems and test conditions, so there are many uncontrolled variables represented.

In [None]:
# Limit how many rows pandas renders for the table displayed by this cell.
pandas.options.display.max_rows = 40

# Correlation vector of the pooled data set (all platforms together) against
# the normalized-performance column.
correlation = abcutils.correlation.calc_correlation_vector(
    df,
    correlate_with='darshan_normalized_perf_by_max')

# Keep only the rows whose p-value is below 1e-5 ...
filtered_correlations = abcutils.apply_filters(
    correlation,
    [correlation['p-value'] < 1.0e-5],
    verbose=True)

# ... and display them ordered by correlation coefficient.
filtered_correlations.sort_values('coefficient')

In [None]:
# Draw the filtered correlations as a table; row names are mapped through the
# 'metric_labels' entry of the abcutils configuration.
ax = abcutils.plot.correlation_vector_table(
    filtered_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Fixed 4-inch width; the height grows by 0.4 inch per displayed row.
ax.get_figure().set_size_inches(4, len(filtered_correlations) * 0.4)

Now draw the entire correlation table split out by _test platform_--a combination of the file system being tested and the node configuration being used to test it.

In [None]:
# Build one correlation vector per test platform and join them side by side
# into a single table whose columns are prefixed with the platform name.
platform_vectors = []
for fs in TEST_PLATFORMS:
    # generate a single file system's correlation vector
    correlation = abcutils.correlation.calc_correlation_vector(
        df[df['_test_platform'] == fs],
        correlate_with='darshan_normalized_perf_by_max')

    # Prefix every column with the platform name.  rename() returns a new
    # frame, so the object handed back by calc_correlation_vector is never
    # mutated in place.
    correlation = correlation.rename(
        columns={col_name: "%s %s" % (fs, col_name)
                 for col_name in correlation.columns})
    platform_vectors.append(correlation)

# Join all vectors column-wise in one concat call instead of re-concatenating
# the growing table on every iteration.  As before, `correlations` stays None
# when TEST_PLATFORMS is empty.
if platform_vectors:
    correlations = pandas.concat(platform_vectors, axis='columns')
else:
    correlations = None

In [None]:
# Row mask: True for every metric whose p-value is below 1e-5 on at least one
# test platform (missing p-values compare as False).
filters = (
    correlations[['%s p-value' % fs for fs in TEST_PLATFORMS]] < 1.0e-5
).any(axis=1)

# Render only the selected metrics, with row names mapped through the
# 'metric_labels' entry of the abcutils configuration.
ax = abcutils.plot.correlation_vector_table(
    correlations[filters],
    row_name_map=abcutils.CONFIG['metric_labels'])

# Wide (20 inch) figure; the height grows by 0.4 inch per displayed metric.
ax.get_figure().set_size_inches(20, len(correlations[filters]) * 0.4)

## Boxplots

In [None]:
# Presentation settings for the boxplot figure.  Per-metric settings are keyed
# by the name of the plotted column.
boxplot_settings = {
    'fontsize': 20,
    'darshan_normalized_perf_by_max': {
        'output_file': "perf-boxplots.pdf",
        'ylabel': "Fraction of\nPeak Performance",
        # Title placement, indexed by subplot row in the plotting cell.
        'title_pos': [
            dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=14),
            dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=14),
        ],
    },
}

In [None]:
# Grid of grouped boxplots: one row, one panel per platform listed in
# SUBPLOT_ARRANGEMENT, all panels sharing the y axis.
NUM_ROWS = 1
NUM_COLS = 5
fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True)
fig.set_size_inches(20, 4)

# Explicit platform -> panel placement (left to right).
SUBPLOT_ARRANGEMENT = {
    'scratch1@edison': axes[0],
    'scratch2@edison': axes[1],
    'scratch3@edison': axes[2],
    'cscratch@cori-knl': axes[3],
    'cscratch@cori-haswell': axes[4]
}

### Draw subplots that contain data
for index, fs in enumerate(sorted(SUBPLOT_ARRANGEMENT.keys())):
    # Floor division keeps the row number an int under both Python 2 and
    # Python 3; true division would produce a float, which cannot be used to
    # index the title_pos list below.
    irow = index // NUM_COLS
    ax = SUBPLOT_ARRANGEMENT[fs]
    abcutils.plot.grouped_boxplot(df[df["_test_platform"] == fs],
                                  'darshan_normalized_perf_by_max',
                                  ax=ax,
                                  fontsize=16)
    # Place the platform name inside the panel on a translucent white box.
    title = ax.set_title(fs, **(boxplot_settings['darshan_normalized_perf_by_max']['title_pos'][irow]))
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Set global figure labels
fig.suptitle("")
# Single y label for the whole figure, rotated and centered on the left edge.
fig.text(0.0, 0.5,
         boxplot_settings['darshan_normalized_perf_by_max']['ylabel'],
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=boxplot_settings['fontsize'])
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Histogram of Coverage Factor

In [None]:
# Presentation settings for the coverage-factor histograms.
# NOTE(review): the histogram cell only reads 'title_pos'; 'output_file' and
# 'ylabel' carry the same values as the boxplot settings -- confirm whether
# they are meant to differ.
histogram_settings = {
    'fontsize': 20,
    'darshan_normalized_perf_by_max': {
        'output_file': "perf-boxplots.pdf",
        'ylabel': "Fraction of\nPeak Performance",
        # Title placement, indexed by subplot row in the plotting cell.
        'title_pos': [
            dict(x=0.04, y=0.90, horizontalalignment='left', fontsize=14),
            dict(x=0.04, y=0.90, horizontalalignment='left', fontsize=14),
        ],
    },
}

In [None]:
# Grid of coverage-factor histograms: one row, one panel per test platform,
# all panels sharing the y axis.
NUM_ROWS = 1
NUM_COLS = len(TEST_PLATFORMS)
# squeeze=False always returns a 2-D array of axes, so the first-row lookup
# below also works when TEST_PLATFORMS holds a single entry.
fig, axes_grid = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                            ncols=NUM_COLS,
                                            # sharex causes problems if not all axes contain data
                                            #sharex=True,
                                            sharey=True,
                                            squeeze=False)
axes = axes_grid[0]
fig.set_size_inches(20, 4)

# Panels are assigned left to right in TEST_PLATFORMS order, so this mapping
# stays consistent with NUM_COLS when the platform list changes.
SUBPLOT_ARRANGEMENT = dict(zip(TEST_PLATFORMS, axes))

# Histogram options shared by every panel (loop-invariant).
# NOTE(review): 15 bin edges give 14 bins of width 1/14 while bars are drawn
# 1/15 wide -- confirm the slight gap between bars is intended.
common_opts = {
    "width": 1.0/15.0,
    "bins": numpy.linspace(0.0, 1.0, 15),
    "alpha": 0.75,
    "linewidth": 3.0,
#   "zorder": 9,
}

### Draw subplots that contain data
for index, fs in enumerate(sorted(SUBPLOT_ARRANGEMENT.keys())):
    # Floor division keeps the row number an int under both Python 2 and
    # Python 3; true division would produce a float, which cannot be used to
    # index the title_pos list below.
    irow = index // NUM_COLS
    ax = SUBPLOT_ARRANGEMENT[fs]

    # Select this platform's rows once and reuse the selection.
    platform_df = df[df['_test_platform'] == fs]
    y1 = platform_df['coverage_factor_bw'].dropna()
    # y2 is only drawn if the second tuple in the loop below is re-enabled.
    y2 = platform_df['coverage_factor_nodehrs'].dropna()

    for y, label in [(y1, 'Coverage Factor (BW)')]: #, (y2, 'Coverage Factor (NodeHrs)')]:
        ax.hist(y, label=label, **common_opts)

    ax.set_xlabel("Coverage Factor", fontsize=16)
    ax.set_ylabel("Frequency", fontsize=16)
#   ax.legend(fontsize=12)
    ax.yaxis.grid()
#   ax.set_yscale("log")
#   ax.set_ylim([1, 1e4])
    ax.xaxis.set_tick_params(labelsize=14)
    ax.yaxis.set_tick_params(labelsize=14)
    ax.label_outer()

    # Single title call: position and font size come from histogram_settings,
    # and the title is drawn on a translucent white box.
    title = ax.set_title(fs, **(histogram_settings['darshan_normalized_perf_by_max']['title_pos'][irow]))
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Set global figure labels
fig.suptitle("")
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Performance Evolution Over Time

In [None]:
# Derived columns used by the time-series plots below.
# benchmark_id joins the application name, the fpp/ssf field and the
# read/write field with underscores (e.g. 'ior_shared_read').
df['benchmark_id'] = df['darshan_app'] + "_"  + df['darshan_fpp_or_ssf_job'] + "_" + df['darshan_read_or_write_job']
# NOTE(review): dividing by 1024 presumably converts MiB/s to GiB/s -- confirm
# the units of darshan_agg_perf_by_slowest_posix.
df['darshan_agg_perf_by_slowest_posix_gibs'] = df['darshan_agg_perf_by_slowest_posix'] / 1024.0

# Parenthesized single-argument print works as both the Python 2 statement and
# the Python 3 function.
print("Valid benchmark_id values:\n")
# dropna() keeps join() from raising on rows where one of the three source
# columns is missing (NaN propagates through the string concatenation above).
print("\n".join(df['benchmark_id'].dropna().unique()))

In [None]:
def generate_boxplot_data(df, date_start, date_end, date_delta, plot_metric=None):
    """Split one column of ``df`` into consecutive time windows for boxplots.

    Windows start at ``date_start`` and advance by ``date_delta`` for as long
    as the window start is earlier than ``date_end``.  A row belongs to a
    window when ``window_start <= _datetime_start < window_start + date_delta``.

    Args:
        df: DataFrame with a ``_datetime_start`` column that can be compared
            with ``datetime.datetime`` values, plus the metric column.
        date_start (datetime.datetime): start of the first window (inclusive).
        date_end (datetime.datetime): no window starts at or after this time.
        date_delta (datetime.timedelta or None): width of every window.
            ``None`` selects one week, the width that used to be hard-coded.
        plot_metric (str, optional): name of the column to collect.  When
            omitted, the notebook-level ``plot_metric`` variable is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        tuple: ``(x, y, x_labels)`` of equal-length lists with one entry per
        window -- ``x`` holds the window start as seconds since the epoch
        (``time.mktime``), ``y`` the Series of metric values that fall in the
        window, and ``x_labels`` the window start formatted as
        ``"%b %d, %Y"``.

    Raises:
        ValueError: if ``date_delta`` is not positive, since the window start
            would never advance.
    """
    if date_delta is None:
        date_delta = datetime.timedelta(days=7)
    if date_delta <= datetime.timedelta(0):
        raise ValueError("date_delta must be a positive timedelta")
    if plot_metric is None:
        # Fall back to the notebook global of the same name; the parameter
        # shadows it inside this function, hence the explicit lookup.
        plot_metric = globals()['plot_metric']

    x = []
    x_labels = []
    y = []
    date = date_start

    while date < date_end:
        next_date = date + date_delta
        in_window = (df['_datetime_start'] >= date) & (df['_datetime_start'] < next_date)
        y.append(df[in_window][plot_metric])
        x.append(time.mktime(date.timetuple()))
        x_labels.append(date.strftime("%b %d, %Y"))

        date = next_date
    return x, y, x_labels

In [None]:
def draw_boxplot_timeseries(df, date_start, date_end, benchmark_id, plot_metric):
    """Plot weekly boxplots of one benchmark, one panel per test platform.

    Creates a figure with one vertically stacked panel for each entry of the
    notebook-level ``TEST_PLATFORMS`` list.  Each panel shows the boxplot
    series that ``generate_boxplot_data`` returns for the rows of ``df``
    matching that platform and ``benchmark_id``.  The figure is saved to
    ``"<benchmark_id>.png"`` in the current directory.

    Args:
        df: DataFrame with ``_test_platform`` and ``benchmark_id`` columns,
            plus whatever columns ``generate_boxplot_data`` reads.
        date_start (datetime.datetime): start of the plotted range.
        date_end (datetime.datetime): end of the plotted range.
        benchmark_id (str): value of the ``benchmark_id`` column to plot; also
            used for the y label and the output file name.
        plot_metric (str): metric name used to look up the y-axis label.

    Returns:
        None
    """
    NUM_ROWS = len(TEST_PLATFORMS)
    NUM_COLS = 1
    # squeeze=False always yields a 2-D array of axes, so indexing by row also
    # works when TEST_PLATFORMS holds a single entry (subplots would otherwise
    # return a bare Axes object).
    fig, axes_grid = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                                ncols=NUM_COLS,
                                                sharex=True,
                                                squeeze=False)
    axes = axes_grid[:, 0]

    fig.set_size_inches(16,20)

    # Human-readable labels; unknown keys fall back to the raw identifier.
    plot_metric_labels = {
        'darshan_agg_perf_by_slowest_posix_gibs': 'Bandwidth (GiB/s)',
        'darshan_normalized_perf_by_max': 'fraction peak performance',
    }
    benchmark_id_labels = {
        'ior_shared_read': 'IOR shared-file read',
        'ior_shared_write': 'IOR shared-file write',
        'ior_fpp_read': 'IOR file-per-process read',
        'ior_fpp_write': 'IOR file-per-process write',
        'dbscan_read_shared_read': 'BD-CATS shared-file read',
        'vpicio_uni_shared_write': 'VPIC shared-file write',
        'hacc_io_read_fpp_read': 'HACC file-per-process read',
        'hacc_io_write_fpp_write': 'HACC file-per-process write',
    }
    date_delta = datetime.timedelta(days=7)
    xlabel = "Week in 2017"
    ylabel = "%s\n(%s)" % (benchmark_id_labels.get(benchmark_id, benchmark_id),
                           plot_metric_labels.get(plot_metric, plot_metric))

    # Named boxplot_style so it does not shadow the notebook-level
    # boxplot_settings dict.
    boxplot_style = {
        'boxprops': {'linewidth': 2},
        'medianprops': {'linewidth': 2},
        'whiskerprops': {'linewidth': 2},
        'capprops': {'linewidth': 2},
        # x positions are in seconds, so box widths are too: 5/7 of a window
        'widths': date_delta.total_seconds() * 5 / 7,
        'whis': [5, 95],
        'showfliers': False,
    }

    for index, test_platform in enumerate(TEST_PLATFORMS):
        df_filter = ((df['_test_platform'] == test_platform) &
                     (df['benchmark_id'] == benchmark_id))

        ax = axes[index]

        # NOTE(review): plot_metric is not forwarded to the helper, so the
        # helper decides which column is binned -- confirm it is the same
        # column that labels the y axis here.
        x, y, x_labels = generate_boxplot_data(df[df_filter],
                                               date_start,
                                               date_end,
                                               date_delta)

        ax.boxplot(y, positions=x, **boxplot_style)

        ax.set_xlabel("")
        ax.set_ylabel(ylabel)
        ax.set_ylim(0)
        ax.yaxis.grid(True)

        # Platform name inside the panel, on a translucent white box.
        title = ax.set_title(test_platform, **({'x': 0.01, 'y': 0.04, 'horizontalalignment': 'left', 'fontsize': 14}))
        title.set_bbox({'color': 'white', 'alpha': 0.5})

    ### Set global figure labels
    # The x axis is shared, so only the bottom panel gets a label and tick
    # labels; x_labels is identical for every platform.
    axes[-1].set_xlabel(xlabel)
    axes[-1].set_xticklabels(x_labels, rotation=90)
    fig.suptitle("")
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    output_file = "%s.png" % benchmark_id
    fig.savefig(output_file, bbox_inches="tight")
    # Parenthesized single-argument print works under Python 2 and Python 3.
    print("Saved to %s" % output_file)



In [None]:
# Date range covered by the time-series plots.
date_start = datetime.datetime(2017, 2, 1)
date_end = datetime.datetime(2018, 2, 1)

# Metric to plot; switch the commented line to plot normalized performance.
# plot_metric = 'darshan_normalized_perf_by_max'
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'

# Draw (and save) one figure per benchmark.
for benchmark_id in (
        'dbscan_read_shared_read',
        'vpicio_uni_shared_write',
        'hacc_io_read_fpp_read',
        'hacc_io_write_fpp_write',
        'ior_shared_read',
        'ior_shared_write',
        'ior_fpp_read',
        'ior_fpp_write'):
    draw_boxplot_timeseries(df, date_start, date_end, benchmark_id, plot_metric)

## Umami Diagrams

In [None]:
import time
import datetime
import tokio.tools.umami

In [None]:
# Each entry describes one UMAMI diagram: 'filters' are boolean row masks over
# df that select the jobs to show, and 'rows' lists the columns drawn as the
# diagram's rows.
umami_diagrams = [
    # The "I/O contention" case study figure
    dict(
        filters=[
            df['_file_system'] == 'scratch2',
            df['darshan_app'] == 'hacc_io_write',
            df['darshan_read_or_write_job'] == 'write',
            df['_datetime_start'] > datetime.datetime(2017, 2, 14),
            df['_datetime_start'] < datetime.datetime(2017, 3, 3),
        ],
        rows=[
            'darshan_agg_perf_by_slowest_posix',
            'coverage_factor_bw',
            'coverage_factor_nodehrs',
            'fs_ave_mds_cpu',
            'fs_tot_open_ops',
            'topology_job_max_radius',
        ],
    ),
    # The "storage capacity" case study figure
    dict(
        filters=[
            df['_file_system'] == 'scratch3',
            df['darshan_app'] == 'hacc_io_write',
            df['darshan_read_or_write_job'] == 'write',
            df['_datetime_start'] > datetime.datetime(2017, 2, 21),
            df['_datetime_start'] < datetime.datetime(2017, 3, 15),
        ],
        rows=[
            'darshan_agg_perf_by_slowest_posix',
            'coverage_factor_bw',
            'fs_max_oss_cpu',
            'fshealth_ost_most_full_pct',
        ],
    ),
]

# Preview the rows selected for the first diagram, transposed so that each
# column of the display is one job; cap the number of rendered rows at 11.
pandas.options.display.max_rows = 11
filtered_df = abcutils.apply_filters(df, umami_diagrams[0]['filters'], verbose=True)
filtered_df.head().T

In [None]:
# Draw one UMAMI diagram per entry of umami_diagrams.
for umami_diagram in umami_diagrams:
    # Select the rows of df that pass all of this diagram's filters ...
    filtered_df = abcutils.apply_filters(df, umami_diagram['filters'], verbose=True)
    # ... and plot the listed columns; `fig` keeps the most recent result.
    fig = abcutils.plot.generate_umami(filtered_df, umami_diagram['rows'])