# Daily I/O Rates

Calculate the distribution of daily I/O rates hitting a file system.  This is used to determine the minimum required endurance for an all-flash file system.  Note that this is __not__ the same as the daily growth rate, as users tend to write far more data than they retain.

In [None]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import datetime

import numpy
import scipy.stats
import matplotlib.pyplot
import pandas

In [None]:
# Use a 16-point default font for every figure drawn in this notebook.
matplotlib.rcParams.update({'font.size': 16})

In [None]:
# Date window for the analysis (midnight of each day).
START_TIME = datetime.datetime(year=2017, month=4, day=1)
END_TIME = datetime.datetime(year=2019, month=3, day=31)

# Cori scratch size, given in KiB and re-expressed in GiB and PiB.
# Each conversion divides by 2**20, i.e. skips two binary prefixes.
CSCRATCH_KIBS = 29763608416864
CSCRATCH_GIBS = CSCRATCH_KIBS / float(2**20)
CSCRATCH_PIBS = CSCRATCH_GIBS / float(2**20)
def pibs_to_pbs(pibs):
    """Convert pebibytes (2**50 bytes) to petabytes (10**15 bytes)."""
    num_bytes = pibs * 2**50
    return num_bytes / 10**15

In [None]:
# Load the per-day I/O totals.  The CSV must provide a 'date' column written
# as YYYY-MM-DD.
cscratch_df = pandas.read_csv('datasets/cscratch_daily_iorates.csv')

# Parse the whole column in one vectorized call instead of running strptime
# on every row in Python.
cscratch_df['date'] = pandas.to_datetime(cscratch_df['date'], format="%Y-%m-%d")

# Keep only the days inside [START_TIME, END_TIME] (both ends inclusive), then
# index by date.  set_index() moves the column into the index, so no separate
# drop() is needed.
in_window = (cscratch_df['date'] >= START_TIME) & (cscratch_df['date'] <= END_TIME)
cscratch_df = cscratch_df[in_window].set_index('date')

In [None]:
# Express each day's traffic as a multiple of the file system size:
# bytes -> GiB, then divide by the size in GiB.
#   fsrpd = file system reads per day, fswpd = file system writes per day
for bytes_col, ratio_col in (('read_bytes', 'fsrpd'), ('write_bytes', 'fswpd')):
    cscratch_df[ratio_col] = cscratch_df[bytes_col] / 2**30 / CSCRATCH_GIBS

In [None]:
# Summary statistics of the daily write ratio.
daily_fswpd = cscratch_df['fswpd']
daily_fswpd.describe()

In [None]:
BINWIDTH = 0.0125
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 3.5))

# Count days per bin, with bin edges starting at 0 in steps of BINWIDTH and
# stopping before 0.3.  numpy.histogram ignores values outside the outermost
# edges, so the heights plotted below are fractions of the *binned* days.
hist, bins = numpy.histogram(cscratch_df['fswpd'],
                             bins=numpy.arange(0, 0.3, BINWIDTH))

# bins[:-1] holds the left edge of every bin.  align='edge' makes each bar
# start at that edge so it sits inside the interval it represents; the
# default (align='center') would centre the bar on the left edge instead.
ax.bar(bins[:-1],
       hist.astype(numpy.float64) / hist.sum(),
       width=BINWIDTH * 0.8,
       align='edge',
       color='C0',
       edgecolor='black')

# X axis: a labelled major tick every four bins, a minor tick on every bin.
majtick = matplotlib.ticker.MultipleLocator(4*BINWIDTH)
mintick = matplotlib.ticker.MultipleLocator(BINWIDTH)
majtickfmt = matplotlib.ticker.FormatStrFormatter("%.2f")
ax.xaxis.set_major_locator(majtick)
ax.xaxis.set_minor_locator(mintick)
ax.xaxis.set_major_formatter(majtickfmt)
ax.tick_params(which='major', length=7)

ax.set_xlabel("File System Writes per Day")
ax.set_ylabel("Fraction of days")


def _format_day(when):
    """Return a date such as 'Apr 1, 2017' with an unpadded day of month.

    The day and year are formatted from the datetime attributes because the
    "%-d" strftime flag is a platform extension and is rejected on Windows.
    """
    return "%s %d, %d" % (when.strftime("%b"), when.day, when.year)


# Caption in the upper-right corner: file system size in PB and date range.
caption = "Cori scratch (%.1f PB)\n%s - %s" % (
    pibs_to_pbs(CSCRATCH_PIBS),
    _format_day(START_TIME),
    _format_day(END_TIME))
ax.text(0.98, 0.78, caption, fontsize='medium',
        ha='right', transform=ax.transAxes, backgroundcolor='#FFFFFFFF')

# Draw grid lines behind the bars and disable the x-axis grid.
ax.set_axisbelow(True)
ax.xaxis.grid(False)


# Call out sparsely populated bins (fewer than 10 days) with an arrow and the
# number of days they contain, since their bars are too short to read.
binned_days = hist.sum()
for bar in ax.patches:
    fraction = bar.get_height()
    # Bar heights are fractions of binned_days; multiplying back gives a float
    # that may be marginally off an integer (e.g. 0.9999999), so round before
    # comparing and formatting it as a day count.
    ndays = int(round(fraction * binned_days))
    if 0 < ndays < 10:
        ax.annotate("%d day%s" % (ndays, "" if ndays == 1 else "s"),
                    # Point at the horizontal centre of the bar, just above it.
                    xy=(bar.get_x() + bar.get_width() / 2, fraction + 0.01),
                    xycoords='data',
                    xytext=(0, 45),
                    textcoords='offset points',
                    arrowprops={'facecolor': 'black', 'width': 1, "headwidth": 7, "shrink": 0.05},
                    ha='center',
                    va='bottom',
                    rotation=90
                   )
# Y axis: major ticks every 0.1, minor ticks every 0.05, y-axis grid lines,
# and a lower limit slightly below zero (upper limit left automatic).
ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(0.1))
ax.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(0.05))
ax.yaxis.grid()
ax.set_ylim(bottom=-0.05)

In [None]:
# Summary statistics of the daily write ratio, shown again next to the plot.
fswpd_series = cscratch_df['fswpd']
fswpd_series.describe()

In [None]:
# Convert to the display unit *before* describe() so that the 'count' row
# stays a number of days instead of being divided by the unit as well.

# Daily read + write volume in MiB.  Printed explicitly: a notebook cell only
# displays its final expression, so this summary would otherwise be lost.
print(((cscratch_df['read_bytes'] + cscratch_df['write_bytes']) / 2**20).describe())

# Daily write volume in TiB (displayed as the cell's result).
(cscratch_df['write_bytes'] / 2**40).describe()

In [None]:
# Report a high percentile of the daily write volume in TiB.  The same
# `percentile` value feeds both the label and the computation so they cannot
# disagree.  numpy.percentile is called directly because the scipy.percentile
# alias is no longer provided by current SciPy releases.
percentile = 99
print("%dth percentile is %.2f TiB" % (
    percentile,
    numpy.percentile(cscratch_df['write_bytes'] / 2**40, percentile)))

In [None]:
# Report a high percentile of the daily write ratio.  The same `percentile`
# value feeds both the label and the computation so they cannot disagree.
# numpy.percentile is called directly because the scipy.percentile alias is
# no longer provided by current SciPy releases.
percentile = 99
print("%dth percentile is %.6f FSWPD" % (
    percentile,
    numpy.percentile(cscratch_df['fswpd'], percentile)))

In [None]:
# Save the figure as a PDF named after the analysis window (YYYYMMDD-YYYYMMDD).
date_stamps = tuple(when.strftime("%Y%m%d") for when in (START_TIME, END_TIME))
output_file = 'cscratch_daily_iorates_%s-%s.pdf' % date_stamps
fig.savefig(output_file, bbox_inches='tight', transparent=True, dpi=200)
print("Wrote output to", output_file)