# Correlating performance with the calendar

This notebook attempts to find correlations between performance and different elements of the Gregorian calendar.  Specifically, it attempts to find days on which scheduled maintenance may be causing recurring performance problems for users.

In [None]:
# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline

In [None]:
import os
import time
import datetime
import warnings
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import pandas
import numpy
import scipy.stats
import abcutils

In [None]:
# Significance threshold: a correlation is considered significant only when
# its p-value falls below this value.
SIGNIFICANT_P = 0.05

In [None]:
# Summary CSVs to load, keyed by dataset label.  Only the 'cori' summaries are
# analyzed here; the 'edison' entry is kept for reference but left disabled.
summary_csvs = {
#   'edison': 'summaries/edison-summaries_2017-02-14-2018-08-09.csv',
    'cori': 'summaries/cori-summaries_2017-02-14-2019-01-31.csv',
}

# The loader is also handed a cache file name alongside the CSV inputs.
df = abcutils.sc18paper.load_raw_datasets(
    input_datasets=summary_csvs,
    cache_file='cori-summaries_2017-02-14-2019-01-31.hdf5')

In [None]:
# Work on an independent copy holding only the start timestamp and the
# normalized performance metric, so adding columns below never modifies `df`.
analyze_df = df.loc[:, ['_datetime_start', 'darshan_normalized_perf_by_max']].copy()

In [None]:
# Tag every row with the calendar position of its start timestamp.
# isoweekday() numbers the days Monday=1 through Sunday=7.
start_times = df['_datetime_start']
analyze_df['day_of_month'] = [stamp.day for stamp in start_times]
analyze_df['day_of_week'] = [stamp.isoweekday() for stamp in start_times]

## Average as a function of day of month

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

# The result lists are indexed directly by day number; slot 0 is an unused
# placeholder.  They stay in the notebook namespace so the error-bar cell
# below can reuse them.
avg_perf = [None] * 32
errorbars = [None] * 32
for dom in range(1, 32):
    # Select this day's measurements once and derive both statistics from it.
    perf_on_day = analyze_df.loc[analyze_df['day_of_month'] == dom,
                                 'darshan_normalized_perf_by_max']
    avg_perf[dom] = perf_on_day.mean()
    errorbars[dom] = perf_on_day.std()

ax.plot(range(1, 32), avg_perf[1:], marker='.')
ax.set(xlabel="Day of month",
       ylabel="Fraction of peak I/O performance")
ax.grid()
ax.set_title("Average daily I/O benchmark performance")

Adding error bars based on the standard deviation of performance on each day of the month:

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

# Reuses avg_perf / errorbars from the preceding cell.  Slot 0 of each list
# is a placeholder, hence the [1:] slices; the standard deviations become the
# vertical error bars.
ax.errorbar(range(1, 32), avg_perf[1:], yerr=errorbars[1:], marker='.')
ax.set(xlabel="Day of month",
       ylabel="Fraction of peak I/O performance",
       title="Average daily I/O benchmark performance")
ax.grid()
# Show the full 0-1 range so the size of the error bars is seen in context.
ax.set_ylim(0, 1)

## Average as a function of day of week

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

# day_of_week comes from isoweekday(), so it only takes the values
# 1 (Monday) through 7 (Sunday).  Iterate and plot over exactly those days
# instead of the 1..31 range used for days of the month.
weekdays = range(1, 8)

# The lists are indexed by day number and keep the same 32-slot length used
# by the day-of-month cells, so any cell that slices them with [1:] still
# sees 31 entries.  Slots without a weekday hold NaN, which matplotlib skips.
avg_perf = [numpy.nan] * 32
errorbars = [numpy.nan] * 32
for dow in weekdays:
    # Select this weekday's measurements once and derive both statistics.
    perf_on_day = analyze_df.loc[analyze_df['day_of_week'] == dow,
                                 'darshan_normalized_perf_by_max']
    avg_perf[dow] = perf_on_day.mean()
    errorbars[dow] = perf_on_day.std()

ax.plot(weekdays, avg_perf[1:8], marker='.')
ax.set_xlabel("Day of week")
ax.set_xticks(weekdays)
ax.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
ax.set_ylabel("Fraction of peak I/O performance")
ax.grid()
ax.set_title("Average daily I/O benchmark performance")

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

# avg_perf / errorbars come from the preceding cell and are indexed by
# weekday number (1 = Monday ... 7 = Sunday).  Only slots 1..7 can hold
# weekday data, so plot exactly those seven points rather than 31 positions.
weekdays = range(1, 8)
ax.errorbar(weekdays,
            avg_perf[1:8],
            marker='.',
            yerr=errorbars[1:8])
ax.set_xlabel("Day of week")
ax.set_xticks(weekdays)
ax.set_xticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
ax.set_ylabel("Fraction of peak I/O performance")
ax.grid()
ax.set_title("Average daily I/O benchmark performance")
# Show the full 0-1 range so the size of the error bars is seen in context.
ax.set_ylim(0, 1)

## Correlation over week-long windows

In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

window_days = 7
month_days_max = 31

# Ceiling division: the number of consecutive windows needed to cover the
# month.  Unlike `month_days_max // window_days + 1`, this does not create an
# empty trailing window when the month length is a multiple of window_days.
num_windows = -(-month_days_max // window_days)

cvals = []
weeks = []

for week in range(num_windows):
    domstart = week * window_days + 1
    # Both bounds are inclusive; clamp the last window to the end of the month
    # so it is reported as e.g. "29 to 31" rather than "29 to 35".
    domend = min((week + 1) * window_days, month_days_max)

    filt = (analyze_df['day_of_month'] >= domstart) & (analyze_df['day_of_month'] <= domend)
    window = analyze_df[filt]
    # Pearson correlation between day of month and performance in this window.
    cval, pval = scipy.stats.pearsonr(window['day_of_month'],
                                      window['darshan_normalized_perf_by_max'])
    print("Day %2d to %2d: correlation = %10.2e, p-value = %10.2e" % (domstart, domend, cval, pval))
    cvals.append(cval)
    weeks.append(week)

ax.plot(weeks, cvals, '-', marker='o')
ax.set_xlabel("Time in month")
# The x positions are window indices, not days, so tick labels are hidden.
ax.set_xticklabels([])
ax.set_ylabel("Correlation coefficient")
ax.grid()
ax.set_title("Correlation between good performance and day in month")


In [None]:
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))

window_days = 7
month_days_max = 31

# First day of every full-length window that fits in the month.  The final
# window ends exactly on month_days_max, so the last day is covered too.
domstarts = range(1, month_days_max - window_days + 2)

# One (correlation coefficient, p-value) pair per window start day.
cvals = []

for domstart in domstarts:
    # Both bounds are inclusive, so the last day is window_days - 1 after the
    # first; this makes each window span exactly window_days days.
    domend = domstart + window_days - 1

    filt = (analyze_df['day_of_month'] >= domstart) & (analyze_df['day_of_month'] <= domend)
    window = analyze_df[filt]
    # Pearson correlation between day of month and performance in this window.
    cval, pval = scipy.stats.pearsonr(window['day_of_month'],
                                      window['darshan_normalized_perf_by_max'])
    print("Day %2d to %2d: correlation = %10.2e, p-value = %10.2e" % (domstart, domend, cval, pval))
    cvals.append((cval, pval))

coefficients = [cval for cval, _ in cvals]
# Marker area scales with -log10(p), so smaller p-values draw larger markers.
# Floor the p-values at the smallest positive normal float so that a p-value
# that underflows to 0 cannot produce an infinite marker size.
pvalues = numpy.clip([pval for _, pval in cvals], numpy.finfo(float).tiny, None)

ax.scatter(list(domstarts),
           coefficients,
           s=-50.0 * numpy.log10(pvalues),
           marker='o')
ax.set_xlabel("Day of month")
ax.set_ylabel("Correlation coefficient")
ax.grid()
# Draw the grid underneath the markers.
ax.set_axisbelow(True)
ax.set_title("Correlation between good performance and day in month")