In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import matplotlib
import matplotlib.pyplot
import matplotlib.ticker
matplotlib.rcParams.update({'font.size': 16})
import time
import datetime
import warnings
import pandas
import numpy
import scipy.stats
import abcutils

In [None]:
# Platform identifiers known to this notebook.
# NOTE(review): entries look like '<file system>@<compute system>' -- confirm.
# The 'cscratch@cori-haswell' entry is deliberately disabled.
TEST_PLATFORMS = [
    'scratch1@edison',
    'scratch2@edison',
    'scratch3@edison',
    'cscratch@cori-knl',
    # 'cscratch@cori-haswell',
    'mira-fs1@mira',
]

## Load and Synthesize Data from CSV

This process loads each summary CSV file, creates a few derived metrics, and then merges each system's CSV into a single global dataset that can be sliced and diced by system, benchmark, or any other way.

In [None]:
# (CSV path, system label) for every per-system summary file that gets merged
summary_sources = [
    ('summaries/edison-summaries_2017-02-14-2018-02-28.csv', 'edison'),
    ('summaries/cori-summaries_2017-02-14-2018-02-28.csv', 'cori'),
    ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira'),
]

# Load each file through abcutils (which also synthesizes the derived metrics)
# and stack the resulting frames row-wise into one global dataset.
per_system_frames = [
    abcutils.load_and_synthesize_csv(csv_path, system=system_name)
    for csv_path, system_name in summary_sources
]
df = pandas.concat(per_system_frames, axis='rows')

# concat keeps each input frame's own index labels, so labels can repeat across
# systems; replace them with a fresh 0..N-1 integer index so every row has a
# unique label.
df.index = pandas.Index(data=numpy.arange(len(df)), dtype='int64')

In [None]:
def pd2epoch(x):
    """Convert a pandas Timestamp into seconds since the epoch.

    The timestamp is reduced to a ``time.struct_time`` and passed to
    ``time.mktime``, so it is interpreted in the local time zone of the
    machine running the notebook and any sub-second component is dropped.

    Args:
        x (pandas.Timestamp): timestamp to convert; any object exposing
            ``to_pydatetime()`` is accepted.

    Returns:
        float: seconds since the epoch.
    """
    return time.mktime(x.to_pydatetime().timetuple())

In [None]:
# Columns whose value pairs select a single (test platform, benchmark) series
# when the global dataframe is grouped.
group_by = ['_test_platform', '_benchmark_id']

## Set Input Parameters

In [None]:
# Platform to analyze -- uncomment exactly one
#test_platform = 'scratch2@edison'
test_platform = 'cscratch@cori-knl'
#test_platform = 'mira-fs1@mira'

# Benchmark to analyze -- uncomment exactly one
benchmark_id = 'ior_fpp_write'
# benchmark_id = 'dbscan_read_shared_read'
# benchmark_id = 'ior_shared_write'

# Column that is plotted and fed to the SMA feature detection
plot_metric = 'darshan_agg_perf_by_slowest_posix_gibs'

# Measurements are kept when date_start <= _datetime_start < date_end
date_start = datetime.datetime(2017, 2, 14)
date_end = datetime.datetime(2018, 3, 1)

group_by = ['_test_platform', '_benchmark_id']

# One day expressed in seconds; used as the bar width on the epoch-seconds axis
delta = datetime.timedelta(days=1).total_seconds()

# Narrow the global dataset to the chosen platform/benchmark pair, then keep
# only rows that moved more than 1.0 (presumably GiB -- confirm) of POSIX data
# and that started inside the date range.
platform_benchmark_df = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
keep_rows = (
    (platform_benchmark_df['darshan_total_gibs_posix'] > 1.0)
    & (platform_benchmark_df['_datetime_start'] < date_end)
    & (platform_benchmark_df['_datetime_start'] >= date_start)
)
filtered_df = platform_benchmark_df[keep_rows]

# Echo the active parameters. Each line is formatted into a single string
# before printing so the output is identical whether `print` is the Python 2
# statement or the Python 3 function. Human-readable labels are looked up in
# abcutils.CONFIG, falling back to the raw identifier when no label exists.
print("test_platform = %s" % test_platform)
print("benchmark_id = %s" % abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id))
print("plot_metric = %s" % abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric))
print("date_start = %s" % date_start.isoformat())
print("date_end = %s" % date_end.isoformat())

In [None]:
# Width of simple moving average (SMA) short/long windows
# Both values are passed to abcutils.features.sma_intercepts and
# abcutils.features.sma_local_minmax.
# NOTE(review): the unit (number of measurements vs. days) is not visible
# here -- confirm against abcutils.features.
short_window = 7
long_window = 28

In [None]:
# Metrics to include in UMAMI renderings; handed to
# abcutils.plot.generate_umami as its second argument.
# NOTE(review): the first entry has no `_gibs` suffix, unlike the plot_metric
# used for the SMA analysis -- confirm that the un-suffixed column is intended.
umami_rows = [
    'darshan_agg_perf_by_slowest_posix',
    'coverage_factor_bw',
    'coverage_factor_nodehrs',
    'fs_ave_mds_cpu',
    'fs_tot_open_ops',
    'topology_job_max_radius',
]

## Calculate Simple Moving Averages (SMAs)

Compare a short-window SMA and a long-window SMA and use that to identify points of interest.

In [None]:
# Both helpers receive the filtered benchmark data, the name of the metric
# column, and the short/long SMA window widths.
# NOTE(review): presumably returns the points where the short and long SMAs
# cross -- confirm in abcutils.features.
sma_intercepts = abcutils.features.sma_intercepts(filtered_df, plot_metric, short_window, long_window)
# NOTE(review): presumably returns local extrema between those crossings; the
# meaning of min_domain=7 is not visible here -- confirm in abcutils.features.
sma_minmax = abcutils.features.sma_local_minmax(filtered_df, plot_metric, short_window, long_window, min_domain=7)

Identify the loci (the entries of `sma_minmax` whose `positive` flag is false) and plot them in red on top of all of the benchmark data as a visual sanity check.

In [None]:
# Loci are the benchmark rows that share an index label with an sma_minmax
# entry whose `positive` flag is False.
loci_df = filtered_df.loc[sma_minmax[~sma_minmax['positive']].index]

# Start time of each locus as seconds since the epoch
loci = [pd2epoch(x) for x in loci_df['_datetime_start']]

# Formatted into one string first so the output is the same under the
# Python 2 print statement and the Python 3 print function.
print("Found %d loci" % (len(loci)))

In [None]:
# Bars are positioned on a numeric axis in epoch seconds (so that `delta`, one
# day in seconds, can serve as the bar width); reuse the notebook's pd2epoch
# helper rather than re-spelling the conversion.
x_raw = filtered_df['_datetime_start'].apply(pd2epoch)
y_raw = filtered_df[plot_metric]

x_low = loci_df['_datetime_start'].apply(pd2epoch)
y_low = loci_df[plot_metric]


def _format_epoch_tick(value, _pos):
    """Render an epoch-seconds tick position as an abbreviated month and day."""
    return datetime.datetime.fromtimestamp(value).strftime("%b %d")


### plot the raw data
fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(16, 4)
ax.grid()
ax.bar(x_raw, y_raw, width=delta, alpha=0.5)
# Overdraw the loci in opaque red so they stand out from the raw measurements
ax.bar(x_low, y_low, width=delta, color='red')
ax.set_ylabel(abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric).replace(" ", "\n"))
# Format ticks through a formatter instead of freezing label strings with
# set_xticklabels(): fixed strings are matched to ticks by position only, so
# they become wrong as soon as the locator picks different tick positions
# (resize, zoom, or a draw-time re-layout).
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_format_epoch_tick))
# Trailing semicolon keeps the Text repr out of the cell output
fig.suptitle("%s on %s" % (abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id),
                           test_platform));

## Generate UMAMI Diagrams Around Loci

Combine the SMA intercepts (the start/stop points) with the loci so that we can generate UMAMI diagrams that range from each locus back to the boundary that precedes it.

In [None]:
# Tag the extrema rows so they can be told apart from intercept rows after the
# two frames are merged. This adds the column to sma_minmax itself.
sma_minmax['minmax'] = True
# Stack both kinds of boundary and order them by index label.
# NOTE(review): if sma_intercepts has no 'minmax' column, its rows receive NaN
# in that column after the concat -- confirm, since downstream checks depend on it.
boundary_df = pandas.concat((sma_minmax, sma_intercepts)).sort_index()

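Generate a list of dataframe views that contain only the data that we want to populate into an UMAMI diagram. Each view ends at a locus and reaches back to the preceding boundary or to seven days before the locus, whichever is earlier.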

In [None]:
# One dataframe view per qualifying locus; each becomes one UMAMI diagram
umamis = []

# Every window reaches back at least this far from its locus
min_span = datetime.timedelta(days=7)

# itertuples() yields (index, first column, second column, ...), so position 1
# is the first column of boundary_df, which is compared against
# '_datetime_start' below.
# NOTE(review): this relies on the timestamp being boundary_df's first
# column -- confirm against the column order produced by the concat.
prev_row = None
for row in boundary_df.itertuples():
    # Compare by value rather than with `is`: depending on the pandas version
    # the tuple fields may be numpy.bool_ objects, for which `is True` /
    # `is False` never holds. Plain truthiness is not used either, because a
    # missing (NaN) flag is truthy; NaN == True and NaN == False are both False.
    is_negative_extremum = (row.minmax == True) and (row.positive == False)  # noqa: E712
    if prev_row is not None and is_negative_extremum:
        starts = filtered_df['_datetime_start']
        # Begin at the previous boundary or min_span before the locus,
        # whichever is earlier ...
        in_window = (starts >= prev_row[1]) | (starts >= row[1] - min_span)
        # ... and end at the locus itself.
        in_window &= starts <= row[1]
        umamis.append(filtered_df[in_window])
    # Every boundary row, qualifying or not, becomes the next window's start
    prev_row = row

In [None]:
# Render one UMAMI diagram per window collected above.
for index, umami_df in enumerate(umamis):
    # The count reported is the number of rows in the window.
    # NOTE(review): the message calls these "days" -- presumably one
    # measurement per day; confirm.
    # Formatted into one string first so the output is the same under the
    # Python 2 print statement and the Python 3 print function.
    print("Umami %2d will show %d days" % (index, len(umami_df)))
    abcutils.plot.generate_umami(umami_df, umami_rows)