In [None]:
%matplotlib inline

In [None]:
import matplotlib
import pandas
import numpy
import scipy.stats
import abcutils

## Global Analysis Constants

In [None]:
# File systems analyzed throughout this notebook; values are matched against
# the '_file_system' column of the loaded summary data.
FILE_SYSTEMS = [
    'scratch1',
    'scratch2',
    'scratch3',
    'cscratch',
]

## Load and Synthesize Data from CSV

In [None]:
# Summary CSVs to load, in the order they are stacked into the combined frame
summary_csv_paths = [
    'summaries/edison-summaries_2017-02-14-2017-12-30.csv',
    'summaries/cori-summaries_2017-02-14-2017-12-31.csv',
]

# Load each CSV through abcutils, then stack the resulting frames row-wise
df = pandas.concat(
    [abcutils.load_and_synthesize_csv(csv_path) for csv_path in summary_csv_paths],
    axis='rows')

## Correlation Table

In [None]:
# Limit how many rows pandas renders when a frame is displayed
pandas.options.display.max_rows = 20

# Correlation vector of the full dataset against normalized performance
correlation = abcutils.correlation.calc_correlation_vector(
    df,
    correlate_with='darshan_normalized_perf_by_max')

# Keep only the rows whose p-value falls below the 1.0e-5 cutoff
low_pvalue = correlation['p-value'] < 1.0e-5
filtered_correlations = abcutils.apply_filters(correlation, [low_pvalue], verbose=True)

# Display the surviving rows ordered by correlation coefficient
filtered_correlations.sort_values(by='coefficient')

In [None]:
# Render the filtered correlations as a table, using the configured metric
# labels as row names
ax = abcutils.plot.correlation_vector_table(
    filtered_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Scale the figure height with the number of rows being shown
table_height = 0.4 * len(filtered_correlations)
ax.get_figure().set_size_inches(4, table_height)

In [None]:
# Build one wide table holding every file system's correlation vector side by side
correlations = None
for fs in FILE_SYSTEMS:
    # generate a single file system's correlation vector
    fs_rows = df[df['_file_system'] == fs]
    correlation = abcutils.correlation.calc_correlation_vector(
        fs_rows,
        correlate_with='darshan_normalized_perf_by_max')

    # prefix every column with the file system name so the columns stay
    # distinguishable once the vectors are joined
    new_cols = {col_name: "%s %s" % (fs, col_name) for col_name in correlation.columns}
    correlation = correlation.rename(columns=new_cols)

    # join the vector to the previous vectors' dataframe
    correlations = (correlation
                    if correlations is None
                    else pandas.concat([correlations, correlation], axis='columns'))

correlations

In [None]:
# Construct filter to show any metric that registered a low p-value for _any_ file system:
# compare every per-file-system p-value column against the cutoff at once, then
# OR the results together row by row
pvalue_columns = ['%s p-value' % fs for fs in FILE_SYSTEMS]
filters = (correlations[pvalue_columns] < 1.0e-5).any(axis='columns')

# Rows that passed the filter; reused for both the table and its sizing
significant_correlations = correlations[filters]

ax = abcutils.plot.correlation_vector_table(
    significant_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Set the table width larger if displaying lots of metrics
ax.get_figure().set_size_inches(12, 0.4 * len(significant_correlations))

## Boxplots

In [None]:
# Title placement shared by both entries of 'title_pos'; each entry gets its
# own copy so the two can later be tuned independently.
_title_pos_template = {
    'x': 0.04,
    'y': 0.02,
    'horizontalalignment': 'left',
    'fontsize': 14,
}

# Plot settings: a global font size plus per-metric options keyed by metric name
boxplot_settings = {
    'fontsize': 20,
    'darshan_normalized_perf_by_max': {
        'output_file': "perf-boxplots.pdf",
        'ylabel': "Fraction of\nPeak Performance",
        'title_pos': [dict(_title_pos_template), dict(_title_pos_template)],
    },
}

In [None]:
# Draw a 2x2 grid of grouped boxplots of normalized performance, one panel per
# file system, sharing the y axis.
NUM_ROWS = 2
NUM_COLS = 2
fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True)
fig.set_size_inches(8,6)

# Which panel each file system is drawn on
SUBPLOT_ARRANGEMENT = {
    'scratch1': axes[0, 0],
    'scratch2': axes[1, 0],
    'scratch3': axes[0, 1],
    'cscratch': axes[1, 1]
}
# Panels listed here are hidden after drawing; currently none
NULL_SUBPLOTS = [
#    axes[0, 1],
]

perf_settings = boxplot_settings['darshan_normalized_perf_by_max']

### Draw subplots that contain data
for index, fs in enumerate(sorted(SUBPLOT_ARRANGEMENT.keys())):
    # Floor division keeps irow an int under Python 3, where `/` would yield a
    # float and fail as a list index below; under Python 2 the result is unchanged.
    # NOTE(review): irow follows the sorted order of the file system names, not
    # the panel's actual grid row -- harmless while both title_pos entries match.
    irow = index // NUM_COLS
    ax = SUBPLOT_ARRANGEMENT[fs]
    abcutils.plot.grouped_boxplot(df[df["_file_system"] == fs],
                                  'darshan_normalized_perf_by_max',
                                  ax=ax,
                                  fontsize=16)
    title = ax.set_title(fs, **(perf_settings['title_pos'][irow]))
    # Semi-transparent white box keeps the title legible over the plot
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Hide subplots that do not contain data
for ax in NULL_SUBPLOTS:
    ax.set_visible(False)

### Set global figure labels 
fig.suptitle("")
# Single shared y label, centered vertically on the left edge of the figure
fig.text(0.0, 0.5,
         perf_settings['ylabel'],
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=boxplot_settings['fontsize'])
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Scatter Plots

In [None]:
# Metrics plotted on the y axis of the scatter plots, in plotting order
plot_metrics = [
    'coverage_factor_bw', 'coverage_factor_nodehrs',
    'fshealth_ost_most_full_pct',
    'fs_tot_openclose_ops', 'fs_tot_metadata_ops',
]

In [None]:
y_key = 'coverage_factor_bw'

# Show the values of y_key that lie in the half-open interval [0.01, 0.99)
in_range = (df[y_key] >= 0.01) & (df[y_key] < 0.99)
df.loc[in_range, y_key]

In [None]:
# For every file system and every metric in plot_metrics, draw a hexbin/scatter
# plot of the metric against normalized performance, overlay a linear fit, and
# annotate it with the Pearson coefficient and p-value.

# The x axis is the same for every plot, so resolve it once
x_key = 'darshan_normalized_perf_by_max'
x_label = abcutils.CONFIG['metric_labels'].get(x_key, x_key)

for fs in FILE_SYSTEMS:
    # dropna() with no subset discards rows that have a NaN in any column
    df_plot = df[df['_file_system'] == fs].dropna()
    for metric in plot_metrics:
        y_key = metric

        # numpy.percentile cannot operate on an empty array
        if df_plot.empty:
            print("Skipping %s on %s: no complete rows" % (y_key, fs))
            continue

        ### throw out extremities in data
        y_min = numpy.percentile(df_plot[y_key].values, 0)
        y_max = numpy.percentile(df_plot[y_key].values, 95)
        filter_vector = (df_plot[y_key] < y_max) & (df_plot[y_key] >= y_min)
        x = df_plot.loc[filter_vector][x_key]
        y = df_plot.loc[filter_vector][y_key]

        ### or, to plot all of the data, use instead:
        ###   x = df[x_key].values
        ###   y = df[y_key].values

        # The strict `< y_max` test removes every point when all values are
        # equal; the linear fit and Pearson test need at least two points
        if len(x) < 2:
            print("Skipping %s on %s: fewer than two points after filtering" % (y_key, fs))
            continue

        y_label = abcutils.CONFIG['metric_labels'].get(y_key, y_key)

        fig, ax = matplotlib.pyplot.subplots(figsize=(6,4))
        ax.hexbin(x, y, gridsize=10, cmap='PuRd')
        ax.plot(x, y, 'o', alpha=0.05)

        ### attempt a linear fit to generate a visual aid
        m, b = numpy.polyfit(x, y, 1)
        ax.plot(x, m*x+b, "-")

        ax.set_ylim((y_min, y_max))

        ### add window dressing to plots
        # Labels are cut at the first '(' so a trailing parenthesized part is
        # left out of the figure title
        fig.suptitle('Correlation between %s and %s (%s)' 
                      % (x_label.split('(',1)[0].strip(),
                         y_label.split('(',1)[0].strip(),
                         fs))
        pearson_r, pval = scipy.stats.pearsonr(x, y)
        ax.set_title("Coefficient=%.4f, P-value=%.2g" 
                        % (pearson_r, pval), fontsize=14 )
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.grid(True)

        ### to save each figure, enable:
        ###   output_file = "scatter_%s_vs_%s.pdf" % (x_key, y_key)
        ###   fig.savefig(output_file, bbox_inches="tight")
        ###   print("Saved %s" % output_file)

## Histogram of Coverage Factor

In [None]:
# Draw a 2x2 grid of coverage-factor histograms (bandwidth and node-hours),
# one panel per file system, on a shared logarithmic y axis.
NUM_ROWS = 2
NUM_COLS = 2
fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True)
fig.set_size_inches(10,8)

# Which panel each file system is drawn on
SUBPLOT_ARRANGEMENT = {
    'scratch1': axes[0, 0],
    'scratch2': axes[1, 0],
    'scratch3': axes[0, 1],
    'cscratch': axes[1, 1]
}
# Panels listed here are hidden after drawing; currently none
NULL_SUBPLOTS = [
#    axes[0, 1],
]

# Histogram options shared by every series on every panel; they do not depend
# on the file system, so they are built once outside the loop
common_opts = {
    "width": 1.0/15.0,
    "bins": numpy.linspace(0.0, 1.0, 15),
    "alpha": 0.75,
    "linewidth": 3.0,
}

# Panel titles reuse the title placement defined for the performance boxplots
title_positions = boxplot_settings['darshan_normalized_perf_by_max']['title_pos']

### Draw subplots that contain data
for index, fs in enumerate(sorted(SUBPLOT_ARRANGEMENT.keys())):
    # Floor division keeps irow an int under Python 3, where `/` would yield a
    # float and fail as a list index below; under Python 2 the result is unchanged.
    irow = index // NUM_COLS
    ax = SUBPLOT_ARRANGEMENT[fs]

    # Select this file system's rows once and reuse them for both series
    df_fs = df[df['_file_system'] == fs]
    y1 = df_fs['coverage_factor_bw'].dropna()
    y2 = df_fs['coverage_factor_nodehrs'].dropna()

    for y, label in [(y1, 'Coverage Factor (BW)'), (y2, 'Coverage Factor (NodeHrs)')]:
        ax.hist(y, label=label, **common_opts)

    ax.set_xlabel("Coverage Factor", fontsize=16)
    ax.set_ylabel("Frequency", fontsize=16)
    ax.legend(fontsize=12)
    ax.yaxis.grid()
    ax.set_yscale("log")
    ax.set_ylim([1, 1e4])
    ax.xaxis.set_tick_params(labelsize=14)
    ax.yaxis.set_tick_params(labelsize=14)
    # Only keep axis labels and tick labels on the outer edge of the grid
    ax.label_outer()

    # The title is set only here: an earlier set_title(fs, fontsize=20) was
    # fully overridden by this call and has been dropped
    title = ax.set_title(fs, **(title_positions[irow]))
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Hide subplots that do not contain data
for ax in NULL_SUBPLOTS:
    ax.set_visible(False)

### Set global figure labels 
fig.suptitle("")
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Umami Diagrams

In [None]:
import time
import datetime
import tokio.tools.umami

In [None]:
# Each diagram is described by the row filters selecting its jobs ('filters')
# and the metrics to draw, one per diagram row ('rows').

# The "I/O contention" case study figure
io_contention_diagram = {
    'filters': [
        df['_file_system'] == 'scratch2',
        df['darshan_app'] == 'hacc_io_write',
        df['darshan_read_or_write_job'] == 'write',
        df['_datetime_start'] > datetime.datetime(2017, 2, 14),
        df['_datetime_start'] < datetime.datetime(2017, 3, 3, 0, 0, 0),
    ],
    'rows': [
        'darshan_agg_perf_by_slowest_posix',
        'coverage_factor_bw',
        'coverage_factor_nodehrs',
        'fs_ave_mds_cpu',
        'fs_tot_open_ops',
        'topology_job_max_radius',
    ],
}

# The "storage capacity" case study figure
storage_capacity_diagram = {
    'filters': [
        df['_file_system'] == 'scratch3',
        df['darshan_app'] == 'hacc_io_write',
        df['darshan_read_or_write_job'] == 'write',
        df['_datetime_start'] > datetime.datetime(2017, 2, 21, 0, 0, 0),
        df['_datetime_start'] < datetime.datetime(2017, 3, 15, 0, 0, 0),
    ],
    'rows': [
        'darshan_agg_perf_by_slowest_posix',
        'coverage_factor_bw',
        'fs_max_oss_cpu',
        'fshealth_ost_most_full_pct',
    ],
}

umami_diagrams = [
    io_contention_diagram,
    storage_capacity_diagram,
]

# Preview the rows selected for the first diagram, transposed so that each
# column of the frame is shown as a row
pandas.options.display.max_rows = 11
first_diagram_filters = umami_diagrams[0]['filters']
filtered_df = abcutils.apply_filters(df, first_diagram_filters, verbose=True)
filtered_df.head().T

In [None]:
# Render one diagram per entry in umami_diagrams
for umami_diagram in umami_diagrams:
    # Apply this diagram's filters to the full dataset
    filtered_df = abcutils.apply_filters(
        df,
        umami_diagram['filters'],
        verbose=True)
    # Draw the diagram with one row per requested metric
    fig = abcutils.plot.generate_umami(
        filtered_df,
        umami_diagram['rows'])