In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
import datetime
import matplotlib
import pandas
import abcutils

## Load and Synthesize Data from CSV

In [None]:
# Load the per-job summary data for each site (one compressed CSV apiece).
# NOTE(review): the two file names separate the date range differently
# ('-' vs '_'); confirm both match the files on disk.
df_nersc = abcutils.load_and_synthesize_csv(
    'summaries/edison-summaries_2017-02-14-2018-02-15.csv.gz')
df_alcf = abcutils.load_and_synthesize_csv(
    'summaries/mira-summaries_2017-02-14_2018-02-15.csv.gz')

In [None]:
# Stack the NERSC and ALCF records into one frame.
# NOTE(review): reindex() is called without arguments, so the row labels
# inherited from the two inputs are kept as-is (and may repeat); confirm
# whether reset_index(drop=True) was intended.
df = pandas.concat([df_nersc, df_alcf]).reindex()#.drop(columns=['Unnamed: 0'])

# PDSW paper applied a few filters
filter_criteria = df['coverage_factor_bw'] < 1.2
# Exclude one specific job (id 1039807 on mira-fs1)
filter_criteria &= (df['_file_system'] != 'mira-fs1') | (df['_jobid'] != 1039807)
# Keep only jobs that started and ended inside the date window.
# NOTE(review): the lower bound is in 2016 while the input file names suggest
# data starting 2017-02-14 -- confirm the intended year.
filter_criteria &= df['_datetime_start'] >= datetime.datetime(2016, 2, 24, 0, 0, 0)
filter_criteria &= df['_datetime_end'] <= datetime.datetime(2017, 3, 25, 0, 0, 0)
df = df[filter_criteria]

# Report how many jobs remain outside / inside mira-fs1.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print((df['_file_system'] != 'mira-fs1').sum())
print((df['_file_system'] == 'mira-fs1').sum())

Demonstrate how columns can be normalized.  This is the routine used to calculate the fraction-of-peak-performance metric for every job.

In [None]:
# Column to normalize, and the job attributes passed as the grouping columns
target_col = 'darshan_agg_perf_by_slowest_posix'
group_by_cols = [
    'darshan_app',
    '_file_system',
    'darshan_fpp_or_ssf_job',
    'darshan_read_or_write_job',
]
# Base name for the new column(s); presumably the source of the
# 'darshan_normalized_perf_by_max' column plotted later -- confirm
new_col_base = 'darshan_normalized_perf'

# normalize_column mutates df directly and has no return value
abcutils.normalize_column(dataframe=df,
                          target_col=target_col,
                          group_by_cols=group_by_cols,
                          new_col_base=new_col_base)

## Boxplots

In [None]:
# Presentation settings for the boxplot figure: a global font size plus one
# entry per plotted column (output file, y-axis label, and one set of
# set_title() keyword arguments per title_pos entry).
boxplot_settings = dict(fontsize=20)
boxplot_settings['darshan_normalized_perf_by_max'] = dict(
    output_file="perf-boxplots.pdf",
    ylabel="Fraction of\nPeak Performance",
    title_pos=[
        dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=14),
        dict(x=0.04, y=0.02, horizontalalignment='left', fontsize=14)
    ]
)

Boxplots can also be inserted into existing figures with a little more effort.  This can be handy for creating compact publication-ready diagrams.

In [None]:
# Build a NUM_ROWS x NUM_COLS grid of axes and draw one grouped boxplot per
# file system into it, then add a shared y-axis label for the whole figure.
NUM_ROWS = 2
NUM_COLS = 2
fig, axes = matplotlib.pyplot.subplots(nrows=NUM_ROWS,
                                       ncols=NUM_COLS,
                                       # sharex causes problems if not all axes contain data
                                       #sharex=True,
                                       sharey=True)
fig.set_size_inches(8,6)

# Which grid cell each file system is drawn into
SUBPLOT_ARRANGEMENT = {
    'scratch1': axes[0, 0],
    'scratch2': axes[1, 0],
    'scratch3': axes[0, 1],
    'mira-fs1': axes[1, 1],
}
# Axes that receive no data and should be hidden (none at present)
NULL_SUBPLOTS = [
]

### Draw subplots that contain data
for index, fs in enumerate(sorted(SUBPLOT_ARRANGEMENT.keys())):
    # Floor division keeps irow an integer on Python 3 as well as Python 2;
    # true division would produce a float, which cannot index a list.
    # Note irow follows the sorted-name order, not the grid row the axes
    # actually occupies; harmless while every title_pos entry is identical.
    irow = index // NUM_COLS
    ax = SUBPLOT_ARRANGEMENT[fs]
    abcutils.plot.grouped_boxplot(df[df["_file_system"] == fs],
                                  'darshan_normalized_perf_by_max',
                                  ax=ax,
                                  fontsize=16)
    title = ax.set_title(fs, **(boxplot_settings['darshan_normalized_perf_by_max']['title_pos'][irow]))
    # Semi-transparent white box behind the title text
    title.set_bbox({'color': 'white', 'alpha': 0.5})

### Hide subplots that do not contain data
for ax in NULL_SUBPLOTS:
    ax.set_visible(False)

### Set global figure labels 
fig.suptitle("")
# One vertical label at the left edge serves as the y-axis label for all rows
fig.text(0.0, 0.5,
         boxplot_settings['darshan_normalized_perf_by_max']['ylabel'],
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical',
         fontsize=boxplot_settings['fontsize'])
fig.subplots_adjust(hspace=0.05, wspace=0.05)

## Umami Diagrams

In [None]:
import time
import datetime
import tokio.analysis.umami

In [None]:
# Each entry describes one UMAMI figure:
#   filters -- boolean masks over df selecting the jobs to show
#   rows    -- the df columns to plot, one per row
#   options -- extra keyword arguments forwarded to generate_umami()
umami_diagrams = [
    # The "I/O contention" case study figure
    dict(
        filters=[
            df['_file_system'] == 'scratch2',
            df['darshan_app'] == 'hacc_io_write',
            df['darshan_read_or_write_job'] == 'write',
            df['_datetime_start'] > datetime.datetime(2017, 2, 14),
            df['_datetime_start'] < datetime.datetime(2017, 3, 3),
        ],
        rows=[
            'darshan_agg_perf_by_slowest_posix',
            'coverage_factor_bw',
            'coverage_factor_nodehrs',
            'fs_ave_mds_cpu',
            'fs_tot_open_ops',
            'topology_job_max_radius',
        ],
        options=dict()
    ),
    # The "metadata load" case study figure
    dict(
        filters=[
            df['_file_system'] == 'mira-fs1',
            df['darshan_app'] == 'vpicio_uni',
            df['_datetime_start'] > datetime.datetime(2017, 3, 1),
            df['_datetime_start'] < datetime.datetime(2017, 3, 12),
        ],
        rows=[
            'darshan_agg_perf_by_slowest_posix',
            'coverage_factor_bw',
            'coverage_factor_ops',
            'fs_tot_readdir_ops',
        ],
        options=dict()
    ),
    # The "storage capacity" case study figure
    dict(
        filters=[
            df['_file_system'] == 'scratch3',
            df['darshan_app'] == 'hacc_io_write',
            df['darshan_read_or_write_job'] == 'write',
            df['_datetime_start'] > datetime.datetime(2017, 2, 21),
            df['_datetime_start'] < datetime.datetime(2017, 3, 15),
        ],
        rows=[
            'darshan_agg_perf_by_slowest_posix',
            'coverage_factor_bw',
            'fs_max_oss_cpu',
            'fshealth_ost_most_full_pct',
        ],
        options=dict(highlight_index=-3)
    ),
]

# Cap how many rows pandas renders, then preview the first diagram's
# selection transposed so each column of df appears as a row.
pandas.set_option('display.max_rows', 11)
filtered_df = abcutils.apply_filters(df, umami_diagrams[0]['filters'], verbose=True)
filtered_df.head().transpose()

In [None]:
# Render one UMAMI figure per diagram definition: select the matching jobs,
# then plot the requested rows with that diagram's extra options.
for umami_diagram in umami_diagrams:
    filtered_df = abcutils.apply_filters(df,
                                         umami_diagram['filters'],
                                         verbose=True)
    fig = abcutils.plot.generate_umami(filtered_df,
                                       umami_diagram['rows'],
                                       **umami_diagram['options'])