# Project Data-on-MDT Requirements

Uses an empirically derived file size distribution to project how much Data-on-MDT capacity would be required for a future Lustre file system of arbitrary size.

This analysis assumes all non-file inodes have a minimum size given by `LUSTRE_BYTES_PER_INODE`.  This is a reasonable but conservative estimate.  Our choice of `LUSTRE_BYTES_PER_INODE` here is 4 KiB, which is

1. probably an over-estimate for ldiskfs; modern Lustre versions default to something closer to 2.5 KiB
2. sufficient for zfs, as ZFS triplicates metadata

Note that at the block level, inodes are rounded up to a full block allocation (4 KiB), and the efficiency with which Lustre utilizes that allocated block depends on what features are in use to pack blocks.  Capturing these nuances in the following model would be over-precise given the inaccuracy intrinsic to the simplicity of the model, so we just wrap everything into the above assumption about `LUSTRE_BYTES_PER_INODE`.

In [None]:
%matplotlib inline

In [None]:
import os
import sqlite3

import matplotlib
matplotlib.rcParams['font.size'] = 16
import matplotlib.pyplot
import pandas
import numpy
import fsanalysis.histogram as histogram

# Multiplicative factors that convert a byte count into the named base-2 unit
TO_BYTE = 1
TO_KIB = 1.0 / 2**10
TO_MIB = 1.0 / 2**20
TO_GIB = 1.0 / 2**30
TO_TIB = 1.0 / 2**40
TO_PIB = 1.0 / 2**50

# SQLite database holding the measured file size distribution.
# An alternate snapshot is 'datasets/cscratch_20181109_sizebytype.sqlite'.
INPUT_DB_FILE = 'datasets/cscratch_20190115_sizebytype.sqlite'
# The snapshot date is the second underscore-delimited field of the path
INPUT_DB_DATE = INPUT_DB_FILE.split('_', 2)[1]

# Total OST capacity (in bytes) of the file system being projected: 30 PiB
TARGET_TOTAL_OST_CAPACITY = 30.0 * 2**50
# MDT bytes charged per inode: 4 KiB.  For reference, cscratch is formatted
# for 1.5 KiB/inode on master MDT; 3.5 KiB/inode for secondaries
LUSTRE_BYTES_PER_INODE = 4.0 * 2**10

# Inode types present in our dataframe
INODE_TYPES = [
    'files',
    'dirs',
    'symlinks',
    'blks',
    'chrs',
    'fifos',
    'socks',
]

# File inodes are treated separately from all other inodes: a file inode's
# "size" is the size of the file, whereas every other inode's "size" is the
# space it occupies on the MDT.
NON_FILE_INODE_TYPES = [itype for itype in INODE_TYPES if itype != 'files']

In [None]:
def humanize_units(bytect):
    """Scale a byte count to the largest base-2 unit it reaches.

    Returns a ``(value, unit)`` tuple, where ``unit`` is one of "PiB", "TiB",
    "GiB", "MiB" or "KiB".  Counts below 1 KiB are returned unscaled with
    the unit "byte" (for exactly 1) or "bytes".
    """
    thresholds = ((50, "PiB"), (40, "TiB"), (30, "GiB"), (20, "MiB"), (10, "KiB"))
    for power, suffix in thresholds:
        divisor = 2**power
        if bytect >= divisor:
            return bytect / divisor, suffix

    if bytect == 1:
        return bytect, "byte"
    return bytect, "bytes"

def humanize_units_generic_base10(count):
    """Scale a plain count to the largest base-10 suffix it reaches.

    Returns a ``(value, suffix)`` tuple, where ``suffix`` is "T", "B", "M"
    or "K".  Counts below one thousand are returned unscaled with an empty
    suffix.
    """
    scales = ((12, "T"), (9, "B"), (6, "M"), (3, "K"))
    for exponent, suffix in scales:
        threshold = 10.0**exponent
        if count >= threshold:
            return count / threshold, suffix

    return count, ""

In [None]:
def invalid_minmax_df(df, key):
    """Count rows whose ``<key>_min`` value is not <= its ``<key>_max`` value.

    A debugging aid for distribution dataframes: a well-formed dataframe
    yields 0.  Rows where the comparison cannot succeed (e.g. a NaN on
    either side) are counted as invalid too.
    """
    lower = df[key + '_min']
    upper = df[key + '_max']
    in_order = lower <= upper
    return (~in_order).sum()

def invert_redux(redux):
    """Swap "min" and "max"; return anything else unchanged.

    Useful when one min/max quantity is inversely related to another
    (for example, when it appears in a denominator).
    """
    for this, opposite in (("min", "max"), ("max", "min")):
        if redux == this:
            return opposite
    return redux

In [None]:
# Load the file size distribution: reuse the CSV cache sitting next to the
# SQLite database if it exists, otherwise build the histogram from the
# database and write the cache for next time.
cached_histogram = INPUT_DB_FILE.replace('.sqlite', '_hist.csv')
if os.path.isfile(cached_histogram):
    print("Reading cached histogram from %s" % cached_histogram)
    reference_df = pandas.read_csv(cached_histogram, index_col='bin_size')
else:
    conn = sqlite3.connect(INPUT_DB_FILE)
    try:
        print("Generating histogram from %s" % INPUT_DB_FILE)
        reference_df = histogram.histogram_dataframe(conn, INODE_TYPES)
    finally:
        # release the database handle even if histogram generation fails
        conn.close()
    print("Writing cached histogram to %s" % cached_histogram)
    reference_df.to_csv(cached_histogram)

In [None]:
# The three estimates carried through the rest of the analysis
REDUCTIONS = ('min', 'max', 'ave')

# Smallest size attributed to each bin: the first two bins are pinned to 0
# and 2, and every later bin starts one past the previous bin's index label.
# NOTE(review): presumably the index labels are each bin's largest size in
# bytes -- confirm against histogram.histogram_dataframe.
reference_df['bin_extent_min'] = numpy.concatenate(
    ([0, 2], reference_df.index.values[1:-1] + 1))
# Largest size attributed to each bin: the bin's own index label
reference_df['bin_extent_max'] = reference_df.index.values
# Midpoint between the two extents
reference_df['bin_extent_ave'] = 0.5 * (reference_df['bin_extent_min'] + reference_df['bin_extent_max'])

In [None]:
# For every inode-count column, report the total number of inodes and the
# mass estimated by weighting each bin's count by its average extent
for itype in reference_df:
    if not itype.startswith('num_'):
        continue
    inode_count = reference_df[itype].sum()
    estimated_mass = (reference_df[itype] * reference_df['bin_extent_ave']).sum()
    mass_value, mass_unit = humanize_units(estimated_mass)
    print("%20s: %12d inodes, %8.2f %s" % (itype, inode_count, mass_value, mass_unit))

## Calculate the mass distributions of data and metadata

In [None]:
def calculate_inode_mass(input_df, redux, minmax=None):
    """Return the total MDT bytes consumed by all inodes in ``input_df``.

    Every file inode is charged ``LUSTRE_BYTES_PER_INODE``.  Every non-file
    inode is charged the larger of ``LUSTRE_BYTES_PER_INODE`` and the extent
    of the size bin it falls in.

    Reads the module-level ``reference_df``, ``NON_FILE_INODE_TYPES`` and
    ``LUSTRE_BYTES_PER_INODE``.

    Args:
        input_df: dataframe with one ``num_<itype>`` column per inode type
            (``num_<itype>_<minmax>`` when ``minmax`` is given), one row
            per size bin.
        redux: which ``reference_df['bin_extent_<redux>']`` column supplies
            the per-bin size of non-file inodes.
        minmax: optional suffix selecting which set of count columns of
            ``input_df`` to use.

    Returns:
        The sum over all bins and inode types, with missing per-bin values
        counted as zero.
    """
    suffix = "_" + minmax if minmax else ""

    # All file inodes have the same mass
    total = input_df['num_files' + suffix] * LUSTRE_BYTES_PER_INODE

    # Non-file inodes occupy their recorded size, but never less than
    # LUSTRE_BYTES_PER_INODE each
    for itype in NON_FILE_INODE_TYPES:
        counts = input_df['num_%s%s' % (itype, suffix)]
        candidates = (counts * reference_df['bin_extent_%s' % redux]).to_frame()
        candidates['_'] = counts * LUSTRE_BYTES_PER_INODE
        # row-wise maximum of the two candidate masses
        total += candidates.max(axis=1)

    return total.fillna(0.0).sum()

In [None]:
# Data mass of each bin: number of files times the bin's min/max/ave extent
mass_dist = pandas.DataFrame(index=reference_df.index)

for redux in REDUCTIONS:
    bin_mass = reference_df['num_files'] * reference_df['bin_extent_%s' % redux]
    mass_dist['dmass_%s' % redux] = bin_mass.fillna(0.0)

# preview the first rows, expressed in GiB
mass_dist.head() * TO_GIB

## Calculate mass probability distributions

* $P_{dmass}$ is the mass distribution of file data
* $P_{imass}$ is the mass distribution of metadata

In [None]:
# Calculate absolute mass distribution
prob_dist = pandas.DataFrame(index=reference_df.index)

# estimate the total file system mass from the estimated mass-per-bin
mass_dist_sums = {'dmass_%s' % redux: mass_dist['dmass_%s' % redux].sum() for redux in REDUCTIONS}

# Calculate the min/max/average probability distributions
#   Note: the min probability dist is proportional to the max mass distribution sum
#   because the mass distribution sum is in the denominator
for redux in REDUCTIONS:
    prob_dist["dmass_" + redux] = mass_dist["dmass_" + redux] / mass_dist_sums["dmass_" + invert_redux(redux)]

prob_dist.sum()

In [None]:
# Visually check the probability distribution
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 4))
prob_dist[['dmass_ave']].plot.bar(width=1.0, edgecolor='black', ax=ax)
new_xticks = []
for idx, xtick in enumerate(zip(ax.get_xticks(), ax.get_xticklabels())):
    if idx % 6 == 0:
        new_xticks.append((xtick[0], "%d %s" % humanize_units(int(xtick[1].get_text()))))
_a, _b = zip(*new_xticks)
ax.set_xticks(_a)
ax.set_xticklabels(_b, ha='right', rotation=30)

## Calculate $M^{data}$

Calculate the mass distribution of the new file system

In [None]:
# Scale each bin's mass fraction by the target OST capacity to obtain the
# data mass per bin of the new file system
new_mass_dist = pandas.DataFrame(index=reference_df.index)

for mass_type in ('dmass',):
    for redux in REDUCTIONS:
        colname = mass_type + "_" + redux
        new_mass_dist[colname] = prob_dist[colname] * TARGET_TOTAL_OST_CAPACITY

# preview the first rows, expressed in GiB
new_mass_dist.head() * TO_GIB

## Calculate $N^{file}$

Calculate the distribution of files for the new file system

In [None]:
# Convert probability distribution into a new data mass distribution
newfs_df = pandas.DataFrame(index=reference_df.index)

# From the new mass distribution, calculate the number of files
newfs_df['num_files_min'] = (new_mass_dist['dmass_min'] / reference_df['bin_extent_max']).fillna(0.0)
newfs_df['num_files_max'] = (new_mass_dist['dmass_max'] / reference_df['bin_extent_min'].apply(lambda x: max(1, x))).fillna(0.0)
newfs_df['num_files_ave'] = (new_mass_dist['dmass_ave'] / reference_df['bin_extent_ave']).fillna(0.0)

newfs_df.head()

## Calculate $N$ for all inode types

In [None]:
# From the new file distribution, calculate the number of other inode types
for itype in NON_FILE_INODE_TYPES:
    for redux in REDUCTIONS:
        newfs_df['num_%s_%s' % (itype, redux)] = (newfs_df['num_files_' + redux] * reference_df['num_' + itype] / reference_df['num_files']).fillna(0.0)

newfs_df.head()

In [None]:
# From our new inode size distribution, calculate their mass
inode_mass = {}
for redux in REDUCTIONS:
    # calculate inode mass of each bin
    inode_mass[redux] = calculate_inode_mass(newfs_df, redux, minmax=redux)

for key, val in inode_mass.items():
    print("%5s %.2f GiB" % (key, val * TO_GIB))

In [None]:
# Per-estimate breakdown of inode counts and mass into file vs. non-file
# inodes.  Disabled by iterating over an empty list; iterate over REDUCTIONS
# instead to enable it.
for redux in []:# REDUCTIONS:
    sum_file_count = newfs_df['num_files_' + redux].sum()
    # file inodes are charged the same per-inode mass as in calculate_inode_mass
    sum_file_mass = sum_file_count * LUSTRE_BYTES_PER_INODE

    sum_all_count = 0.0
    for itype in INODE_TYPES:
        colname = "num_%s_%s" % (itype, redux)
        sum_all_count += newfs_df[colname].sum()
    sum_all_mass = inode_mass[redux]

    sum_nonfile_count = 0.0
    for itype in NON_FILE_INODE_TYPES:
        colname = "num_%s_%s" % (itype, redux)
        sum_nonfile_count += newfs_df[colname].sum()
    # whatever is not file inode mass is non-file inode mass
    sum_nonfile_mass = sum_all_mass - sum_file_mass

    print("  %3s mass required for %10d file inodes:     %5.1f %s" % (
        redux.title(),
        sum_file_count,
        *humanize_units(sum_file_mass)))
    print("                        %10d non-file inodes: %5.1f %s" % (
        sum_nonfile_count,
        *humanize_units(sum_nonfile_mass)))
    print("                        %10d total inodes:    %5.1f %s\n" % (
        sum_all_count,
        *humanize_units(sum_all_mass)))

## Calculate $C^{DOM}$

Calculate the two contributions to DOM capacity usage on MDT:

* whole small files
* the first stripe of large files

In [None]:
# For every candidate first-stripe size (one per bin), work out how much
# file data would live on the MDT
new_mdt = pandas.DataFrame(index=newfs_df.index)

num_bins = len(new_mass_dist)
size_on_mdt = {}

# Files larger than the bin only keep their first stripe on the MDT: the
# number of files in all later bins times this bin's largest size
for redux in REDUCTIONS:
    file_counts = newfs_df['num_files_' + redux]
    size_on_mdt['first_stripe_mass_' + redux] = [
        file_counts.iloc[pos+1:].sum() * reference_df['bin_extent_max'].iloc[pos]
        for pos in range(num_bins)
    ]

# Files in this bin or any earlier bin live wholly on the MDT
for redux in REDUCTIONS:
    bin_masses = new_mass_dist['dmass_%s' % redux]
    size_on_mdt['whole_file_mass_' + redux] = [
        bin_masses.iloc[0:pos+1].sum()
        for pos in range(num_bins)
    ]

for colname in size_on_mdt:
    new_mdt[colname] = size_on_mdt[colname]

# Total Data-on-MDT mass is the sum of both contributions
for redux in REDUCTIONS:
    whole_files = new_mdt['whole_file_mass_' + redux]
    first_stripes = new_mdt['first_stripe_mass_' + redux]
    new_mdt['total_dom_mass_' + redux] = whole_files + first_stripes

new_mdt.applymap(lambda x: "%.1f %s" % humanize_units(x)).head()

## Plot the DoM capacity required for different layouts

Note that the below plots start at 4 KiB which is below the minimum Lustre stripe size of 64 KiB at the time of writing.

In [None]:
# Plot the amount of MDT capacity needed for different choices of PFL first-stripe size
# with Data-on-MDT enabled
def plot_dom_capacity(x, ymin, yave, ymax, ax=None, **kwargs):
    """Plot MDT capacity against first-stripe size on log2-log2 axes.

    Args:
        x: x values (first-stripe sizes) shared by all curves.
        ymin, yave, ymax: y values for the minimum, average and maximum
            curves; pass None to skip a curve.  The average is drawn solid,
            the other two dashed.
        ax: axes to draw on; a new 8x6 figure is created when None.
        **kwargs: forwarded to every ``ax.plot`` call.  A ``label`` entry is
            consumed by the first curve drawn (average, then minimum, then
            maximum); later curves fall back to their default labels.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = matplotlib.pyplot.subplots(figsize=(8,6))

    if yave is not None:
        ax.plot(x, yave, ls='-', label="Average" if 'label' not in kwargs else kwargs.pop('label'), **kwargs)
    if ymin is not None:
        ax.plot(x, ymin, ls='--', label="Minimum" if 'label' not in kwargs else kwargs.pop('label'), **kwargs)
    if ymax is not None:
        ax.plot(x, ymax, ls='--', label="Maximum" if 'label' not in kwargs else kwargs.pop('label'), **kwargs)

    # `base` replaces the per-axis `basex`/`basey` keywords, which newer
    # matplotlib releases no longer accept
    ax.set_xscale("log", base=2)
    ax.set_yscale("log", base=2)

    ax.set_xlabel("Size of first stripe on MDT ($S_0$)")
    ax.set_ylabel("MDT capacity ($C^{MDT}$)")

    # x ticks at every other power of two from 4 KiB to 16 MiB
    xticks = [2**exp for exp in range(12, 26, 2)]
    ax.set_xlim(xticks[0] * 0.75, xticks[-1]*1.5)
    ax.set_xticks(xticks)
    ax.set_xticklabels(["%d %s" % humanize_units(tick) for tick in xticks], rotation=30, ha='right')

    # y ticks at every power of two from 4 TiB to 4 PiB
    yticks = [2**exp for exp in range(42, 53, 1)]
    ax.set_ylim(yticks[0] * 0.75, yticks[-1]*1.5)
    ax.set_yticks(yticks)
    ax.set_yticklabels(["%d %s" % humanize_units(tick) for tick in yticks])

    ax.grid()
    ax.set_axisbelow(True)

    ax.set_title("Scaled to a %.1f %s file system" % humanize_units(TARGET_TOTAL_OST_CAPACITY))

    return ax

In [None]:
# Total MDT capacity (DoM data plus inodes) versus first-stripe size: the
# average is drawn as a line and the min-max range as a shaded band
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 5))

plot_dom_capacity(x=new_mdt.index.values,
                  ymin=new_mdt['total_dom_mass_min'].values + inode_mass['min'],
                  yave=new_mdt['total_dom_mass_ave'].values + inode_mass['ave'],
                  ymax=new_mdt['total_dom_mass_max'].values + inode_mass['max'],
                  ax=ax,
                  color='C2')

ax.fill_between(
    x=new_mdt.index.values,
    y1=new_mdt['total_dom_mass_min'].values + inode_mass['min'],
    y2=new_mdt['total_dom_mass_max'].values + inode_mass['max'],
    color='C2',
    alpha=0.25)

# Alternative legend, currently disabled in favor of hiding it entirely:
# handles, labels = ax.get_legend_handles_labels()
# labels[0] = "Estimated"
# labels[1] = "Min/max"
# ax.legend(handles=handles[0:2] + handles[3:5], labels=labels[0:2] + labels[3:5])
ax.legend().set_visible(False)

ax.set_title("")
# pass the flag positionally: its keyword name differs between matplotlib releases
ax.grid(True)
ax.set_axisbelow(True)

# the shaded band already conveys min/max, so hide the dashed curves
for line in ax.lines:
    if line.get_linestyle() == "--":
        line.set_visible(False)

In [None]:
# Average total MDT capacity with shaded bands that split the min-max range
# into a "DOM uncertainty" part and an "inode uncertainty" part
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 5))

# inode mass does not vary with stripe size; broadcast it to a Series on the
# same index as the DoM mass
ymin_dom = new_mdt['total_dom_mass_min']
ymin_imass = pandas.Series(ymin_dom * 0 + inode_mass['min'], index=new_mdt.index)
ymax_dom = new_mdt['total_dom_mass_max']
ymax_imass = pandas.Series(ymax_dom * 0 + inode_mass['max'], index=new_mdt.index)

ymin = ymin_dom + ymin_imass
ymax = ymax_dom + ymax_imass
yave = new_mdt['total_dom_mass_ave'] + inode_mass['ave']

plot_dom_capacity(x=new_mdt.index.values,
                  ymin=None,# ymin,
                  yave=yave,
                  ymax=None,# ymax,
                  ax=ax,
                  color='black',
                  linewidth=1.5,
                  label="")

# Plot components of uncertainty
# NOTE(review): _delta is the mean of the min and max inode mass, not half
# their difference -- confirm this is the intended half-width of the
# "inode uncertainty" band.
_delta = (inode_mass['max'] + inode_mass['min']) / 2

# lower edge of the inode band, never allowed to drop below ymin
_inode_band_floor = pandas.concat([yave - _delta, ymin], axis=1).max(axis=1)

ax.fill_between(
    x=new_mdt.index.values,
    y1=ymin,
    y2=_inode_band_floor,
    color='C3',
    alpha=0.25,
    label="DOM uncertainty")

ax.fill_between(
    x=new_mdt.index.values,
    y1=_inode_band_floor,
    y2=yave,
    color='C0',
    alpha=0.25,
    label="inode uncertainty")

# the upper halves mirror the lower ones and are left unlabeled so each
# color appears in the legend only once
ax.fill_between(
    x=new_mdt.index.values,
    y1=yave,
    y2=yave + _delta,
    color='C0',
    alpha=0.25)
ax.fill_between(
    x=new_mdt.index.values,
    y1=yave + _delta,
    y2=ymax,
    color='C3',
    alpha=0.25)


ax.legend()
ax.set_title("")
# pass the flag positionally: its keyword name differs between matplotlib releases
ax.grid(True)
ax.set_axisbelow(True)

# keep a handle on these axes so a later cell can widen their limits
ax_plot = ax

In [None]:
# Write the current figure to a PDF named after the dataset's date
output_file = 'dom_capacity_required_%s.pdf' % (INPUT_DB_DATE,)
fig.savefig(
    output_file,
    dpi=200,
    bbox_inches='tight',
    transparent=True)
print("Wrote output to", output_file)

In [None]:
# Widen the previous plot's axes: x ticks at every other power of two from
# 1 KiB to 64 GiB, y ticks at every power of two from 4 TiB to 64 PiB
xticks = [2**exp for exp in range(10, 38, 2)]
xtick_labels = ["%d %s" % humanize_units(tick) for tick in xticks]
ax_plot.set_xlim(xticks[0] * 0.75, xticks[-1]*1.5)
ax_plot.set_xticks(xticks)
ax_plot.set_xticklabels(xtick_labels, rotation=30, ha='right')

yticks = [2**exp for exp in range(42, 57)]
ytick_labels = ["%d %s" % humanize_units(tick) for tick in yticks]
ax_plot.set_ylim(yticks[0] * 0.75, yticks[-1]*1.5)
ax_plot.set_yticks(yticks)
ax_plot.set_yticklabels(ytick_labels)

# evaluate to the figure so the notebook re-renders it
ax_plot.get_figure()

In [None]:
# Write the figure with the widened axes to its own PDF
output_file = 'dom_capacity_required_extended_%s.pdf' % (INPUT_DB_DATE,)
fig.savefig(
    output_file,
    dpi=200,
    bbox_inches='tight',
    transparent=True)
print("Wrote output to", output_file)

The following plot breaks out the component contributions on the log scale to help visually show how each component of uncertainty contributes to the net uncertainty shown in the above figure.

In [None]:
# Plot the average total capacity together with its individual min/max
# components (DoM mass and inode mass) on the same log axes
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 5))

# inode mass does not vary with stripe size; broadcast it to a Series on the
# same index as the DoM mass
ymin_dom = new_mdt['total_dom_mass_min']
ymin_imass = pandas.Series(ymin_dom * 0 + inode_mass['min'], index=new_mdt.index)
ymax_dom = new_mdt['total_dom_mass_max']
ymax_imass = pandas.Series(ymax_dom * 0 + inode_mass['max'], index=new_mdt.index)

ymin = ymin_dom + ymin_imass
ymax = ymax_dom + ymax_imass
yave = new_mdt['total_dom_mass_ave'] + inode_mass['ave']

plot_dom_capacity(x=new_mdt.index.values,
                  ymin=None,# ymin,
                  yave=yave,
                  ymax=None,# ymax,
                  ax=ax,
                  color='C4')

# DoM mass bounds in one color, inode mass bounds in another
ax.plot(ymin_dom, color='C3')
ax.plot(ymax_dom, color='C3')
ax.plot(ymin_imass, color='C0')
ax.plot(ymax_imass, color='C0')

ax.legend()

ax.set_title("")
# pass the flag positionally: its keyword name differs between matplotlib releases
ax.grid(True)
ax.set_axisbelow(True)

# x ticks at every other power of two from 1 KiB to 64 GiB
xticks = [2**exp for exp in range(10, 38, 2)]
ax.set_xlim(xticks[0] * 0.75, xticks[-1]*1.5)
ax.set_xticks(xticks)
ax.set_xticklabels(["%d %s" % humanize_units(tick) for tick in xticks], rotation=30, ha='right')

# y ticks at every power of two from 512 GiB to 64 PiB
yticks = [2**exp for exp in range(39, 57)]
ax.set_ylim(yticks[0] * 0.75, yticks[-1]*1.5)
ax.set_yticks(yticks)
ax.set_yticklabels(["%d %s" % humanize_units(tick) for tick in yticks])

# end the cell on a statement so the notebook does not echo the last return value
pass