In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import matplotlib
import time
import datetime
import pandas
import numpy
import scipy.stats
import abcutils

## Global Analysis Constants

In [None]:
# Platforms iterated over by the correlation cells; each entry is compared
# against the '_test_platform' column of the loaded dataframe.
TEST_PLATFORMS = [
    'scratch1@edison', 'scratch2@edison', 'scratch3@edison',
    'cscratch@cori-knl', 'cscratch@cori-haswell',
    'mira-fs1@mira',
]

## Load and Synthesize Data from CSV

In [None]:
# (CSV path, system name) for every summary file that feeds the analysis
summary_sources = [
    ('summaries/edison-summaries_2017-02-14-2017-12-30.csv', 'edison'),
    ('summaries/cori-summaries_2017-02-14-2017-12-31.csv', 'cori'),
    ('summaries/alcf-tokio-results-2_14_17-2_15_18.csv', 'mira'),
]

# Load each file through abcutils and stack the results row-wise into one dataframe
df = pandas.concat(
    [abcutils.load_and_synthesize_csv(csv_path, system=system_name)
     for csv_path, system_name in summary_sources],
    axis='rows')

# Experimenting with Feature Detection

## Identifying Streaks

Find cases where performance monotonically increased or decreased over the course of several consecutive days.

In [None]:
# List the values that may be assigned to benchmark_id and test_platform.
# The parenthesized, single-argument form of print behaves identically as a
# Python 2 print statement and as a Python 3 function call, so this cell no
# longer raises SyntaxError on a Python 3 kernel.
print("""
Valid benchmark_id values:
=========================""")
print("\n".join(df['_benchmark_id'].unique()))

print("""
Valid test_platform values:
==========================""")
print("\n".join(df['_test_platform'].unique()))

In [None]:
# Parameters selecting the slice of data examined by the cells that follow
test_platform = 'cscratch@cori-knl'
benchmark_id = 'ior_fpp_read'
plot_metric = 'darshan_normalized_perf_by_max'
date_start = datetime.datetime(2017, 2, 14)
date_end = date_start + datetime.timedelta(days=365)
min_streak = 3 # in days by default

# Columns used to group the dataframe.  This must be defined here: it was
# previously only assigned in a much later cell, so running the notebook top
# to bottom on a fresh kernel raised NameError on the groupby below.
group_by = ['_test_platform', '_benchmark_id']

# Select the rows for the chosen platform/benchmark pair, then keep only the
# ones whose start time falls in the half-open window [date_start, date_end)
filtered_df = df.groupby(by=group_by).get_group((test_platform, benchmark_id))
in_date_window = ((filtered_df['_datetime_start'] >= date_start)
                  & (filtered_df['_datetime_start'] < date_end))
filtered_df = filtered_df[in_date_window]

In [None]:
fig, ax = matplotlib.pyplot.subplots()
fig.set_size_inches(20,4)

# Draw the boxplot time series for the selected metric and date window
ax = abcutils.plot.timeseries_boxplot(filtered_df, plot_metric, date_start, date_end, ax=ax)

# Overlay every detected streak as a thick line on top of the boxplots.
# Each streak is indexed as streak[0] (row positions into filtered_df) and
# streak[1] (the metric values at those positions).
streaks = abcutils.features.find_streaks(filtered_df[plot_metric], min_streak=min_streak)
for streak in streaks:
    positions = streak[0]
    values = streak[1]

    # Convert each row's start time to seconds since the epoch for the x axis
    # NOTE(review): presumably timeseries_boxplot uses the same x units -- confirm
    timestamps = [time.mktime(filtered_df.iloc[position]['_datetime_start'].timetuple())
                  for position in positions]

    # Green when the streak ends higher than it started, red otherwise
    color = 'green' if values[-1] > values[0] else 'red'

    ax.plot(timestamps,
            values,
            marker='',
            linestyle='-',
            linewidth=4,
            color=color,
            markersize=5,
            markerfacecolor=color)

# Fall back to the raw identifiers when no human-readable label is configured
benchmark_label = abcutils.CONFIG['benchmark_labels'].get(benchmark_id, benchmark_id)
metric_label = abcutils.CONFIG['metric_labels'].get(plot_metric, plot_metric)
xlabel = "Week"
ylabel = "%s\n(%s)" % (benchmark_label, metric_label)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_ylim(0,1)

# End on a statement so the cell does not echo the return value of set_ylim
pass

## Correlating during streaks

Look for interesting correlations among the filtered data.  First calculate the correlation vectors for each test platform and assemble a dataframe from those correlation vectors.

In [None]:
# Build one correlation vector per test platform, restricted to the rows that
# belong to streaks, and join the vectors column-wise into `correlations`.
correlation_vectors = []
for fs in TEST_PLATFORMS:
    # Start with a single file system worth of data
    df_to_correlate = df[(df['_test_platform'] == fs) & (df['_benchmark_id'] == benchmark_id)]

    # Find streaks from the dataframe
    streaks = abcutils.features.find_streaks(df_to_correlate[plot_metric], min_streak=min_streak)

    # Build filter vectors that flag the rows belonging to upward and to
    # downward streaks.  streak[0] holds row positions, streak[1] the values.
    up_filter = [False] * len(df_to_correlate)
    down_filter = [False] * len(df_to_correlate)
    for streak in streaks:
        is_upward = streak[1][-1] > streak[1][0]
        target_filter = up_filter if is_upward else down_filter
        for index in streak[0]:
            target_filter[index] = True

    # Only downward streaks are correlated here.  To include both directions use:
    #   streak_filter = numpy.array(up_filter) | numpy.array(down_filter)
    streak_filter = numpy.array(down_filter)

    # Calculate the correlation data for this vector
    correlation = abcutils.correlation.calc_correlation_vector(
        df_to_correlate[streak_filter],
        correlate_with='darshan_normalized_perf_by_max')

    # Prefix every column with the file system name so that the columns stay
    # distinguishable once all platforms are joined together
    correlation = correlation.rename(
        columns={col_name: "%s %s" % (fs, col_name) for col_name in correlation.columns})
    correlation_vectors.append(correlation)

# Join all vectors in a single concat instead of growing a dataframe in the loop
correlations = pandas.concat(correlation_vectors, axis='columns')

# Only draw metrics that show something interesting: a row is kept when, for
# at least one platform, the correlation is both significant (small p-value)
# and reasonably strong (|coefficient| > 0.30).
# The p-value test used to be overwritten by the coefficient test on the very
# next line and therefore never took effect; the two are now combined.
confidence_filter = None
for fs in TEST_PLATFORMS:
    p_value = correlations['%s p-value' % fs]
    coefficient = correlations['%s coefficient' % fs]
    subfilter = (p_value < 1.0e-5) & ((coefficient > 0.30) | (coefficient < -0.30))
    if confidence_filter is None:
        confidence_filter = subfilter
    else:
        confidence_filter = confidence_filter | subfilter

In [None]:
# Keep only the rows selected by the confidence filter; evaluate the
# selection once and reuse it for both the table and its sizing
interesting_correlations = correlations[confidence_filter]

# Draw the actual table
ax = abcutils.plot.correlation_vector_table(
    interesting_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Grow the figure height with the number of metrics being displayed
table_height = 0.4 * len(interesting_correlations)
ax.get_figure().set_size_inches(24, table_height)

## Correlating only the top and bottom quartiles

Instead of correlating against areas where performance increased or decreased over a long period of time, we just take the top and/or bottom quartiles and correlate across the worst and/or best observed performance.

In [None]:
# Columns that identify one (platform, benchmark) group of measurements
group_by = ['_test_platform', '_benchmark_id']

In [None]:
# Descriptive statistics of every metric, computed separately for each group
summarized_metrics_grouped = df.groupby(group_by).describe()

In [None]:
# Spot-check the summary statistics of one metric for a single
# platform/benchmark group
summarized_metrics_grouped.loc['scratch1@edison', 'ior_fpp_read']['coverage_factor_bw']

In [None]:
# Per-(platform, benchmark) descriptive statistics, used to derive the cutoffs
group_by = ['_test_platform', '_benchmark_id']
summarized_metrics_grouped = df.groupby(by=group_by).describe()

# Build one correlation vector per test platform, restricted to the rows whose
# performance falls below a cutoff, and join the vectors column-wise.
correlation_vectors = []
for fs in TEST_PLATFORMS:
    # Start with a single file system worth of data
    # NOTE(review): every benchmark on this file system is correlated, while
    # the cutoffs below come from benchmark_id alone -- confirm this is
    # intended.  To restrict the rows to benchmark_id as well, use:
    #   df_to_correlate = df[(df['_test_platform'] == fs) & (df['_benchmark_id'] == benchmark_id)]
    df_to_correlate = df[df['_test_platform'] == fs]

    # Quartile boundaries of the performance metric for this platform/benchmark.
    # The median is the '50%' column of describe(); it was previously read
    # from '25%', which made it identical to bottom_cutoff.
    perf_summary = summarized_metrics_grouped.loc[fs, benchmark_id]['darshan_normalized_perf_by_max']
    bottom_cutoff = perf_summary['25%']
    median_cutoff = perf_summary['50%']
    top_cutoff = perf_summary['75%']

    # Correlate across everything below the median.  Alternatives:
    #   cutoff_filter = (df_to_correlate['darshan_normalized_perf_by_max'] < bottom_cutoff)
    #   cutoff_filter = (df_to_correlate['darshan_normalized_perf_by_max'] > top_cutoff)
    cutoff_filter = (df_to_correlate['darshan_normalized_perf_by_max'] < median_cutoff)

    # Calculate the correlation data for this vector
    correlation = abcutils.correlation.calc_correlation_vector(
        df_to_correlate[cutoff_filter],
        correlate_with='darshan_normalized_perf_by_max')

    # Prefix every column with the file system name so that the columns stay
    # distinguishable once all platforms are joined together
    correlation = correlation.rename(
        columns={col_name: "%s %s" % (fs, col_name) for col_name in correlation.columns})
    correlation_vectors.append(correlation)

# Join all vectors in a single concat instead of growing a dataframe in the loop
correlations = pandas.concat(correlation_vectors, axis='columns')

# Only draw metrics that show something interesting: a row is kept when, for
# at least one platform, the correlation is both significant (small p-value)
# and reasonably strong (|coefficient| > 0.30).
# The p-value test used to be overwritten by the coefficient test on the very
# next line and therefore never took effect; the two are now combined.
confidence_filter = None
for fs in TEST_PLATFORMS:
    p_value = correlations['%s p-value' % fs]
    coefficient = correlations['%s coefficient' % fs]
    subfilter = (p_value < 1.0e-5) & ((coefficient > 0.30) | (coefficient < -0.30))
    if confidence_filter is None:
        confidence_filter = subfilter
    else:
        confidence_filter = confidence_filter | subfilter

In [None]:
# Keep only the rows selected by the confidence filter; evaluate the
# selection once and reuse it for both the table and its sizing
interesting_correlations = correlations[confidence_filter]

# Draw the actual table
ax = abcutils.plot.correlation_vector_table(
    interesting_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Grow the figure height with the number of metrics being displayed
table_height = 0.4 * len(interesting_correlations)
ax.get_figure().set_size_inches(24, table_height)

In [None]:
# Display the rows selected by the confidence filter as a plain dataframe
correlations[confidence_filter]