# Write Amplification Factors

This notebook calculates the write amplification factors of all SSDs in Cori's burst buffer to estimate the effect that misaligned writes have on SSDs under Cori's production workload.

This is a very imperfect analysis because

1. Cori's burst buffer (DataWarp) performs server-side write-back caching, which can even out misaligned but sequential I/Os
2. Cori's burst buffer has no parity, so there is no read-modify-write penalty
3. Cori's burst buffer uses multilevel striping, and the four-SSD RAID0 configuration uses 512 KiB stripes
4. Cori's burst buffer workload is not representative of the full Cori workload since only a small subset of NERSC users opts in to using the burst buffer

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy
import pandas
import matplotlib
import matplotlib.pyplot
# Use a larger default font for every figure produced by this notebook.
matplotlib.rcParams.update({'font.size': 16})

In [None]:
# Histogram bin width; plot_scatter also derives its x tick spacing from it.
BINWIDTH = 0.5

# Per-drive summary table that every later cell reads from.
INPUT_CSV = 'datasets/isdct_summary_20190401.csv'

# Date tag for output file names: the text after the last underscore in the
# path, cut at the first dot that follows it.
SSD_DATA_DATE = INPUT_CSV.rpartition('_')[2].partition('.')[0]

ssd_data = pandas.read_csv(INPUT_CSV)

In [None]:
BYTES_PER_LBA = 512  # 512 bytes per LBA

host_bytes_written = ssd_data['smart_host_bytes_written_bytes']
nand_bytes_written = ssd_data['smart_nand_bytes_written_bytes']
# maximum_lba * 512 is used as the size of the drive in bytes.
drive_size_bytes = ssd_data['maximum_lba'] * BYTES_PER_LBA

# Write amplification factor: NAND bytes written per host byte written.
ssd_data['waf'] = nand_bytes_written / host_bytes_written
# Host bytes written expressed as a multiple of the drive size.
ssd_data['lifetime_drive_writes'] = host_bytes_written / drive_size_bytes

In [None]:
def plot_hist(ssd_data, ax=None, annotate_below=20):
    """Draw a normalized histogram of write amplification factors (WAFs).

    ``ssd_data['waf']`` is binned into ``BINWIDTH``-wide bins whose edges run
    from 0 through 10 + ``BINWIDTH``; each bar shows the fraction of the
    binned drives that landed in that bin.  Values outside those edges fall
    in no bin and therefore do not contribute to the fractions.

    Args:
        ssd_data: table with a 'waf' column (e.g., a pandas.DataFrame).
        ax: matplotlib Axes to draw into.  If None, a new 8 x 3.5 inch
            figure is created.
        annotate_below: non-empty bins holding fewer than this many drives
            are labeled with their absolute drive count.

    Returns:
        The Axes that was drawn into.
    """
    if ax is None:
        _, ax = matplotlib.pyplot.subplots(figsize=(8, 3.5))

    counts, bin_edges = numpy.histogram(
        ssd_data['waf'],
        bins=numpy.arange(0, 10 + 2 * BINWIDTH, BINWIDTH))

    total = counts.sum()
    # Avoid 0/0 -> NaN bar heights when no value lands in any bin.
    fractions = counts.astype(numpy.float64) / max(total, 1)

    ax.bar(bin_edges[:-1] + BINWIDTH / 2,
           fractions,
           width=BINWIDTH,
           color='C1',
           edgecolor='black')

    ax.set_xlabel("Write Amplification Factor")
    ax.set_ylabel("Fraction of SSDs")

    # Draw grid lines behind the bars rather than on top of them.
    ax.set_axisbelow(True)
    ax.xaxis.grid(True)
    ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(0.1))
    # Pass True explicitly: a bare grid() call toggles visibility, which would
    # switch the grid off again if this function were re-run on the same Axes.
    ax.yaxis.grid(True)
    ax.set_ylim(-0.05, None)

    # Label sparsely populated bins using the integer histogram counts.
    # Recovering the counts from the bar heights (fraction * total) suffers
    # floating-point round-off -- e.g. (1 / 49) * 49 < 1 -- which "%d" would
    # truncate to the wrong number of drives.
    for left_edge, count, fraction in zip(bin_edges[:-1], counts, fractions):
        if 0 < count < annotate_below:
            ax.annotate("%d drive%s" % (count, "" if count == 1 else "s"),
                        xy=(left_edge + BINWIDTH / 2, fraction + 0.01),
                        xycoords='data',
                        xytext=(0, 45),
                        textcoords='offset points',
                        arrowprops={'facecolor': 'black', 'width': 1, "headwidth": 7, "shrink": 0.05},
                        ha='center',
                        va='bottom',
                        rotation=90
                       )

    return ax

In [None]:
def plot_scatter(ssd_data, ax=None):
    """Scatter each drive's lifetime drive writes against its WAF.

    Args:
        ssd_data: table with 'waf' and 'lifetime_drive_writes' columns
            (e.g., a pandas.DataFrame).
        ax: matplotlib Axes to draw into.  If None, a new 8 x 3.5 inch
            figure is created.

    Returns:
        The Axes that was drawn into.
    """
    if ax is None:
        _, ax = matplotlib.pyplot.subplots(figsize=(8, 3.5))

    ax.scatter(ssd_data['waf'],
               ssd_data['lifetime_drive_writes'],
               marker='x',
               color='black',
               alpha=0.25)

    ax.set_xlabel("WAF")
    ax.set_ylabel("Lifetime\nDrive Writes")
    # Pass True explicitly: a bare grid() call toggles visibility, which would
    # switch the grid off again if this function were re-run on the same Axes.
    ax.grid(True)
    ax.set_axisbelow(True)

    # One major x tick every two histogram bins.
    ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(2 * BINWIDTH))
    # "%g" prints whole-number ticks exactly as "%d" would, but does not
    # truncate fractional tick positions if BINWIDTH is changed from 0.5.
    ax.xaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%g"))
    ax.tick_params(which='major', length=7)

    return ax

In [None]:
# Stack the histogram above the scatter plot on one shared WAF axis, with no
# gap between the two panels.
fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, sharex=True, figsize=(8, 6))
fig.subplots_adjust(wspace=0.0, hspace=0.0)

hist_ax, scatter_ax = axes
plot_hist(ssd_data, ax=hist_ax)
plot_scatter(ssd_data, ax=scatter_ax)

In [None]:
# Save the combined two-panel figure, tagged with the dataset date.
output_file = 'coribb_ssd_wafs_%s.pdf' % SSD_DATA_DATE
savefig_options = dict(dpi=200, bbox_inches='tight', transparent=True)
fig.savefig(output_file, **savefig_options)
print("Wrote output to", output_file)

In [None]:
# Draw the histogram alone on a fresh figure and save it, tagged with the
# dataset date.
ax = plot_hist(ssd_data)
output_file = 'coribb_ssd_wafs_hist_%s.pdf' % SSD_DATA_DATE
hist_fig = ax.get_figure()
hist_fig.savefig(output_file, transparent=True, bbox_inches='tight', dpi=200)
print("Wrote output to", output_file)

In [None]:
# Summarize drive age: convert power-on hours to days, then to years.
power_on_days = ssd_data['power_on_hours'] / 24
power_on_years = power_on_days / 365.25
power_on_years.describe()

In [None]:
# Summary statistics of the per-drive write amplification factor.
ssd_data['waf'].describe()

In [None]:
# Report the 5th and 95th percentiles of the WAF distribution.
waf_p5, waf_p95 = numpy.percentile(ssd_data['waf'], [5, 95])
print("%.2f  5th percentile\n%.2f 95th percentile" % (waf_p5, waf_p95))