In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import datetime
import os
import time

import matplotlib
import matplotlib.pyplot
import matplotlib.ticker
import numpy
import pandas
import scipy.stats

import abcutils
# Set the default font size used by every matplotlib figure in this notebook
matplotlib.rcParams.update({'font.size': 16})

## Global Analysis Constants

In [None]:
# Platform identifiers of interest; the strings look like
# "<file system>@<compute system>" (NOTE(review): confirm against abcutils).
TEST_PLATFORMS = [
    'scratch1@edison', 'scratch2@edison', 'scratch3@edison',
    'cscratch@cori-knl', 'cscratch@cori-haswell',
    'mira-fs1@mira',
]

# Multiplicative factor that converts a byte count into TiB (1 TiB = 2**40 bytes)
TO_TIB = 1.0 / 2.0**40

## Load and Synthesize Data from CSV

In [None]:
# Path of the HDF5 cache holding the synthesized dataframe; set to None (or '')
# to disable caching and always rebuild from the CSV summaries.
CACHE_FILE = 'cache.hdf5'
if CACHE_FILE and os.path.isfile(CACHE_FILE):
    # Reuse the previously synthesized dataframe instead of re-parsing the CSVs
    print("Loading from cache %s" % CACHE_FILE)
    df = pandas.read_hdf(CACHE_FILE, 'summary')
else:
    # Build one dataframe out of the per-system summary CSVs
    df = pandas.concat([abcutils.load_and_synthesize_csv('summaries/edison-summaries_2017-02-14-2018-02-15.csv', system='edison'),
                        abcutils.load_and_synthesize_csv('summaries/cori-summaries_2017-02-14-2018-02-15.csv', system='cori'),
                        abcutils.load_and_synthesize_csv('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', system='mira')],
                       axis='rows')
    if CACHE_FILE:
        df.to_hdf(CACHE_FILE, key='summary', mode='w', format='fixed', complevel=9, complib='zlib')
        print("Cached synthesized CSV to %s" % CACHE_FILE)

# Reset the index to ensure that there are no degenerate indices in the final dataframe
df.index = pandas.Index(data=numpy.arange(len(df)), dtype='int64')

# Express the total bytes moved on the file system in TiB
df['fs_tot_bytes_tibs'] = df['fs_tot_bytes'] * TO_TIB

# Blank out bandwidth coverage factors above 1.2 in one vectorized assignment
# (NaN comparisons are False, so existing NaNs are left untouched)
df.loc[df['coverage_factor_bw'] > 1.2, 'coverage_factor_bw'] = numpy.nan

In [None]:
# Display the combined dataframe as this cell's output for a quick visual check
df

In [None]:
# Selection criteria for the single platform/benchmark combination examined below
test_platform = 'mira-fs1@mira'
benchmark_id = 'ior_fpp_write'
perf_metric = 'darshan_normalized_perf_by_max'
vol_metric = 'fs_tot_bytes_tibs'
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 2, 15)
group_by = [ '_test_platform', '_benchmark_id' ]

# Pull out the rows of the chosen (platform, benchmark) pair, then keep only
# those whose start time falls inside the half-open window [date_start, date_end)
selected_group = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
started_in_window = ((selected_group['_datetime_start'] < date_end)
                     & (selected_group['_datetime_start'] >= date_start))
filtered_df = selected_group[started_in_window]

In [None]:
def timeseries_smaplot(dataframe, plot_metric, date_start, date_end, window=10, ax=None, **kwargs):
    """Draw the simple moving average (SMA) of one metric as a line on an axes.

    The SMA is computed by ``abcutils.features.calculate_sma`` with
    ``_datetime_start`` as the x column; the index of the returned series is
    converted with ``abcutils.core.pd2epoch`` and used as the x coordinates.

    Args:
        dataframe: Data handed unchanged to ``calculate_sma``.
        plot_metric: Name of the column whose SMA is plotted.
        date_start: Unused; accepted so existing callers keep working.
        date_end: Unused; accepted so existing callers keep working.
        window: Window passed to ``calculate_sma``; also shown in the legend
            label as ``"<window>-day sma"``.
        ax: Axes to draw on.  When None, the current matplotlib axes is used.
        **kwargs: Extra keyword arguments forwarded to ``ax.plot``.
    """
    if ax is None:
        # Previously a missing axes crashed with AttributeError on None.plot;
        # fall back to the current axes instead.  Imported locally so the
        # function does not rely on pyplot having been loaded elsewhere.
        import matplotlib.pyplot
        ax = matplotlib.pyplot.gca()

    sma_series = abcutils.features.calculate_sma(dataframe=dataframe,
                                                 x_column='_datetime_start',
                                                 y_column=plot_metric,
                                                 window=window)
    # Distinct loop variable so the result list is not shadowed while being built
    x_values = [abcutils.core.pd2epoch(timestamp) for timestamp in sma_series.index]
    y_values = sma_series.values

    ax.plot(x_values, y_values, linestyle='-', label=str(window) + '-day sma', **kwargs)

In [None]:
# Four stacked panels sharing one time axis:
#   0: raw performance (line) with raw volume (bars, twin y axis)
#   1: moving averages of the performance metric
#   2: moving averages of the volume metric
#   3: moving averages and raw values of the bandwidth coverage factor
fig, axes = matplotlib.pyplot.subplots(nrows=4, ncols=1, sharex=True)
fig.set_size_inches(20, 12)

# Keep jobs that started on/after date_start and ended before date_end.  Both
# conditions are evaluated on the same frame so the boolean mask is aligned
# with the rows it selects (no implicit reindexing of the indexer).
in_window = ((filtered_df['_datetime_start'] >= date_start)
             & (filtered_df['_datetime_end'] < date_end))
sma_df = filtered_df[in_window]

# x coordinates are seconds since the epoch, derived from each job's start time
x_raw = sma_df['_datetime_start'].apply(lambda started: time.mktime(started.timetuple()))
y_perf_raw = sma_df[perf_metric]
y_vol_raw = sma_df[vol_metric]

perf_label = abcutils.CONFIG['metric_labels'].get(perf_metric, perf_metric)
sma_windows = ((10, 'C1'), (50, 'C2'), (100, 'C3'))  # (window, line color)
vol_ticks = numpy.arange(0, 6, 1)


def _epoch_to_date_label(epoch, _pos=None):
    """Format an epoch-seconds tick position as YYYY-MM-DD."""
    return datetime.datetime.fromtimestamp(epoch).strftime("%Y-%m-%d")


# Panel 0: raw performance with raw volume on a twin axis
ax = axes[0]
ax.plot(x_raw, y_perf_raw, linestyle='-', marker='.')
ax.grid()
ax.set_title("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform))
# A formatter computes labels from the tick positions at draw time, so the
# dates stay correct even if later plotting on the shared x axis moves the
# ticks (fixed tick-label strings would silently go stale).
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_epoch_to_date_label))
ax.set_ylim(0, 1)
ax.set_ylabel(perf_label)

ax2 = ax.twinx()
# Bar width is in x units (seconds): 10 hours
ax2.bar(x_raw.values, y_vol_raw, width=(60*60*10), color='C1')
ax2.set_ylabel('Total Volume (TiB)')
# The y range extends to 15 while only 0-5 is ticked
# (NOTE(review): presumably to keep the bars in the lower third -- confirm)
ax2.set_ylim(0, 15)
ax2.set_yticks(vol_ticks)
ax2.set_yticklabels(vol_ticks)

# Panel 1: moving averages of the performance metric
ax = axes[1]
for window, color in sma_windows:
    timeseries_smaplot(sma_df, perf_metric, date_start, date_end, window, ax=ax, color=color)
ax.grid()
ax.set_ylabel(perf_label)
ax.legend()

# Panel 2: moving averages of the volume metric
ax = axes[2]
for window, color in sma_windows:
    timeseries_smaplot(sma_df, vol_metric, date_start, date_end, window, ax=ax, color=color)
ax.grid()
ax.set_ylabel('Total Volume (TiB)')
ax.set_ylim(0, 5)
# Ticks belong on this panel's axes; the original set them on the twin axes of panel 0
ax.set_yticks(vol_ticks)
ax.legend()

# Panel 3: moving averages of the coverage factor plus the raw values as faint dots
ax = axes[3]
for window, color in sma_windows:
    timeseries_smaplot(sma_df, 'coverage_factor_bw', date_start, date_end, window, ax=ax, color=color)
ax.scatter(x_raw, sma_df['coverage_factor_bw'], marker='.', alpha=0.25)
ax.grid()
ax.set_ylabel('Bandwidth\nCoverage\nFactor')
ax.set_ylim(0, 1.2)
ax.legend()

fig.subplots_adjust(hspace=0.1, wspace=0.1)

pass

## Quantify Correlation

Quantify the correlation between the total volume of bytes moved and the fraction of peak performance achieved, comparing the simple moving averages of the two metrics rather than the raw per-job values.

In [None]:
# Only rows where the start time and both metrics are present take part
correlate_df = sma_df[['_datetime_start', perf_metric, vol_metric]].dropna()

# Moving averages (window of 10) of each metric, with undefined entries dropped
sma_perf = abcutils.features.calculate_sma(correlate_df, '_datetime_start', perf_metric, 10).dropna()
sma_vol = abcutils.features.calculate_sma(correlate_df, '_datetime_start', vol_metric, 10).dropna()

# Unpack explicitly so the formatting does not depend on the exact result type
coefficient, pvalue = scipy.stats.pearsonr(x=sma_perf, y=sma_vol)
print("Correlation coefficient (out of 1.0): %.3f\np-value (< 1.0e-5 is good): %.3e" % (coefficient, pvalue))

fig, ax = matplotlib.pyplot.subplots()
ax.scatter(sma_perf, sma_vol, marker='.')
ax.set_xlabel("Fraction Peak Performance")
ax.set_ylabel("Total Data Moved (TiBs)")
ax.grid()