In [None]:
%matplotlib inline
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import os
import time
import datetime
import pandas
import numpy
import scipy.stats
import abcutils
import matplotlib
# Raise matplotlib's default font size to 16 for every figure drawn in this notebook
matplotlib.rcParams.update({'font.size': 16})

## Global Analysis Constants

In [None]:
# Test platforms to analyze, each written as "<file system>@<compute system>".
# The per-platform cells below iterate over this list in order.
TEST_PLATFORMS = [
    'scratch1@edison',
    # 'scratch2@edison',  # disabled -- NOTE(review): reason for exclusion is not recorded here
    'scratch3@edison',
    'cscratch@cori-knl',
    'mira-fs1@mira',
]

## Load and Synthesize Data from CSV

In [None]:
# Load the dataset through the abcutils sc18paper helper; the correlation cells
# below read from filtered_df.
# NOTE(review): the name suggests the helper already applies filtering -- confirm
# against abcutils.sc18paper.load_dataset.
filtered_df = abcutils.sc18paper.load_dataset()

## Correlation Table

First show the most compelling correlations (those with a p-value below 1.0e-5) across all data. This will be messy because it includes all file systems and test conditions, so there are many uncontrolled variables represented.

In [None]:
# Let pandas display up to 40 rows before truncating a rendered table
pandas.set_option('display.max_rows', 40)

# Compute the correlation vector against darshan_normalized_perf_by_max
# using the whole dataset (all platforms mixed together)
correlation = abcutils.correlation.calc_correlation_vector(
    filtered_df,
    correlate_with='darshan_normalized_perf_by_max')

# Keep only the entries whose p-value is below 1.0e-5
filtered_correlations = abcutils.apply_filters(
    correlation,
    [correlation['p-value'] < 1.0e-5],
    verbose=True)

# Show the surviving entries ordered by their coefficient
filtered_correlations.sort_values(by='coefficient')

In [None]:
# Draw the filtered correlations as a table, passing the configured metric
# labels as the row-name map
ax = abcutils.plot.correlation_vector_table(
    filtered_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Fixed 4-inch width; height grows by 0.4 inch per row being shown
figure = ax.get_figure()
figure.set_size_inches(4, 0.4 * len(filtered_correlations))

Now draw the entire correlation table split out by _test platform_, i.e., the combination of the file system being tested and the node configuration being used to test it.

In [None]:
# Build one wide table holding every test platform's correlation vector side by
# side, with each platform's columns prefixed by the platform name
# (e.g. "<platform> p-value").
grouped_df = filtered_df.groupby('_test_platform')
platform_correlations = []
for fs in TEST_PLATFORMS:
    # generate a single file system's correlation vector
    # (get_group raises KeyError if this platform has no rows in filtered_df)
    correlation = abcutils.correlation.calc_correlation_vector(
        grouped_df.get_group(fs),
        correlate_with='darshan_normalized_perf_by_max')

    # prefix every column with the file system name; add_prefix returns a new
    # frame, so the vector returned by calc_correlation_vector is not mutated
    platform_correlations.append(correlation.add_prefix("%s " % fs))

# join all vectors in a single concat instead of re-copying the growing frame
# on every loop iteration; raises ValueError if TEST_PLATFORMS is empty
correlations = pandas.concat(platform_correlations, axis='columns')

In [None]:
# Flag every metric that registered a low p-value for _any_ file system:
# compare all per-platform p-value columns at once, then OR across them
pvalue_columns = ['%s p-value' % fs for fs in TEST_PLATFORMS]
filters = (correlations[pvalue_columns] < 1.0e-5).any(axis='columns')

# Select the flagged rows once; both the table and its sizing use them
significant_correlations = correlations[filters]

ax = abcutils.plot.correlation_vector_table(
    significant_correlations,
    row_name_map=abcutils.CONFIG['metric_labels'])

# Wide (20 inch) figure to fit every platform's columns; the height grows
# by 0.4 inch per metric displayed
ax.get_figure().set_size_inches(20, 0.4 * len(significant_correlations))