# inode Size Histograms

Plot the size and mass distribution of inodes of different types.

In [None]:
%matplotlib inline

In [None]:
import os
import sqlite3

import matplotlib
matplotlib.rcParams['font.size'] = 16
import matplotlib.pyplot
import pandas
import numpy
import fsanalysis.histogram as histogram

# Multiplicative factors that convert a byte count into the named base-2 unit
TO_BYTE = 1
TO_KIB = 1.0 / 2**10
TO_MIB = 1.0 / 2**20
TO_GIB = 1.0 / 2**30
TO_TIB = 1.0 / 2**40
TO_PIB = 1.0 / 2**50

Note that the `used_capacity` value of each entry in `INPUT_DB_FILES` below is defined as the result of, e.g.,

    $ sqlite3 cscratch_20181109_sizebytype.sqlite "SELECT SUM(size) FROM files"
  
These values are hard-coded here because the actual `*_sizebytype.sqlite` files are large and may not be available on the system on which Jupyter is running.

In [None]:
# Per-snapshot inputs.  Each used_capacity was precomputed by executing
# `SELECT sum(size) FROM entries` against the corresponding database.
INPUT_DB_FILES = {
    'cscratch_20181109': dict(
        filename='datasets/cscratch_20181109_sizebytype.sqlite',
        used_capacity=22880138323554001,
    ),
    'cscratch_20190115': dict(
        filename='datasets/cscratch_20190115_sizebytype.sqlite',
        used_capacity=24794479198206006,
    ),
}

# Types of inodes in our dataframe; everything except 'files' is a non-file inode
INODE_TYPES = ['files', 'dirs', 'symlinks', 'blks', 'chrs', 'fifos', 'socks']
NON_FILE_INODE_TYPES = [inode_type for inode_type in INODE_TYPES if inode_type != 'files']

# Maps snapshot keys to the human-readable labels used in plots
DECODER_RING = dict(
    cscratch="cscratch/Lustre",
    cscratch_20181109="cscratch/Lustre (Nov 2018)",
    cscratch_20190115="cscratch/Lustre (Jan 2019)",
)

# how transparent to make each file system's color in the plots
ALPHA = 1.0

In [None]:
def humanize_units(bytect):
    """Scale a byte count to the largest base-2 unit it reaches.

    Returns a ``(value, unit)`` tuple.  ``value`` is ``bytect`` divided by the
    chosen unit's size (KiB through PiB); counts whose magnitude is below
    1 KiB are returned unchanged with the unit ``"byte"`` (exactly 1) or
    ``"bytes"`` (anything else).
    """
    magnitude = abs(bytect)
    for exponent, suffix in ((50, "PiB"), (40, "TiB"), (30, "GiB"), (20, "MiB"), (10, "KiB")):
        scale = 2**exponent
        if magnitude >= scale:
            return bytect / scale, suffix

    suffix = "byte" if bytect == 1 else "bytes"
    return bytect, suffix

def humanize_units_generic_base10(count, long=False):
    """Scale a count to the largest base-10 unit it reaches.

    Returns a ``(value, unit)`` tuple where ``unit`` is a one-letter
    abbreviation ("K", "M", "B", "T") or, when ``long`` is true, the spelled-out
    word ("thousand" ... "trillion").  Counts whose magnitude is below one
    thousand are returned unchanged with an empty unit string.
    """
    scales = (
        (10.0**12, "T", "trillion"),
        (10.0**9, "B", "billion"),
        (10.0**6, "M", "million"),
        (10.0**3, "K", "thousand"),
    )
    # position of the requested unit name within each scale entry
    name_position = 2 if long else 1
    for entry in scales:
        threshold = entry[0]
        if abs(count) >= threshold:
            return count / threshold, entry[name_position]

    return count, ""

In [None]:
# Load one histogram dataframe per snapshot listed in INPUT_DB_FILES, keyed by
# snapshot name.
dataframes = {}
for fsname, config in INPUT_DB_FILES.items():
    # Either read a cached version of the file size distribution, or recalculate and cache it
    cached_histogram = config['filename'].replace('.sqlite', '_hist.csv')
    if os.path.isfile(cached_histogram):
        print("Reading cached histogram from %s" % cached_histogram)
        dataframes[fsname] = pandas.read_csv(cached_histogram, index_col='bin_size')
    else:
        conn = sqlite3.connect(config['filename'])
        try:
            print("Generating histogram from %s" % config['filename'])
            dataframes[fsname] = histogram.histogram_dataframe(conn, INODE_TYPES)
        finally:
            # release the database handle even if histogram generation fails
            conn.close()
        print("Writing cached histogram to %s" % cached_histogram)
        # Cache the frame that was just generated.  (This previously referenced
        # an undefined name, so the cache was never written and the cell raised
        # NameError after the expensive computation had finished.)
        dataframes[fsname].to_csv(cached_histogram)

In [None]:
# Names of the snapshots that were loaded (the keys of `dataframes`)
fsnames = dataframes.keys()

In [None]:
# Mass of each histogram bin: the bin's file count multiplied by the bin's
# index value, collected into one column per snapshot
dict_to_df = {
    fsname: (frame['num_files'] * (frame.index.values)).copy()
    for fsname, frame in dataframes.items()
}
inode_mass_df = pandas.DataFrame(dict_to_df)

In [None]:
# Number of file inodes in each bin, collected into one column per snapshot
dict_to_df = {fsname: frame['num_files'] for fsname, frame in dataframes.items()}
inode_ct_df = pandas.DataFrame(dict_to_df)

## Plot file system mass distribution

This histogram includes only _file_ inodes.

In [None]:
# Snapshot to plot; used below as a column name and as a DECODER_RING key
COL = 'cscratch_20190115'
# Second underscore-separated field of the snapshot name; used in output file names
COL_DATE = COL.split('_')[1]

In [None]:
# Two stacked panels sharing one x axis for the snapshot selected by COL:
# (a) fraction of file inodes in each size bin and (b) fraction of total file
# mass in each size bin.
BAR_PARAMS = dict(width=1.0, edgecolor='black', color='C0', alpha=ALPHA, label=DECODER_RING[COL])

fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 6), sharex=True)
fig.subplots_adjust(hspace=0.0, wspace=0.0)

# draw plot - inode distribution, normalized to each file system's inode count
ax = axes[0]

count_plot_df = (inode_ct_df / inode_ct_df.sum())
count_plot_df.index = ["%d %s" % humanize_units(x) for x in count_plot_df.index.values]
count_plot_df[COL].plot.bar(ax=ax, **BAR_PARAMS)

ax.set_ylabel("Fraction\ntotal inodes")
ax.set_title("(a) File size distribution", x=0.02, y=0.85,
    ha='left', transform=ax.transAxes, backgroundcolor='#FFFFFFFF')

# draw plot - mass distribution
ax = axes[1]

# normalize to mass of each storage system
plot_df = (inode_mass_df / inode_mass_df.sum())
plot_df.index = ["%d %s" % humanize_units(x) for x in plot_df.index.values]
plot_df[COL].plot.bar(ax=ax, **BAR_PARAMS)

# Relabel x axis: label only the first bin and every fourth bin
bin_labels = list(plot_df.index)
new_xticks = [index for index in range(len(bin_labels)) if ((index + 1) % 4) == 0 or index == 0]
new_labels = [bin_labels[index] for index in new_xticks]

# Visible x range: first through last bin that holds any inodes.  The earlier
# scan seeded min_x with index 0 unconditionally, so leading empty bins were
# never trimmed.  The range comes from the inode-count panel because the
# panels share an x axis and a bin can hold inodes while contributing zero
# mass (mass is count multiplied by the bin's index value).
occupied_bins = numpy.flatnonzero(count_plot_df.sum(axis=1).values > 0)
min_x = int(occupied_bins[0]) if occupied_bins.size else 0
max_x = int(occupied_bins[-1]) if occupied_bins.size else 0

ax.set_ylabel("Fraction\ntotal capacity")

ax.set_xticks(new_xticks)
ax.set_xticklabels(new_labels, rotation=30, ha='right')
ax.set_xlabel("File size")
ax.set_xlim(min_x - 1, max_x + 2)

# set minor ticks for every bin
axes[0].xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))
axes[1].xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))

ax.set_title("(b) File mass distribution", x=0.02, y=0.85,
    ha='left', transform=ax.transAxes, backgroundcolor='#FFFFFFFF')

for ax in axes:
    ax.grid()
    ax.set_axisbelow(True)

    # Both panels use the same fixed y range; the upper limit sits just below
    # 0.13 (NOTE(review): presumably tuned to this dataset -- bars taller than
    # that would be clipped).
    ax.set_ylim(-0.005, 0.1299)

    majtick = matplotlib.ticker.MultipleLocator(0.04)
    mintick = matplotlib.ticker.MultipleLocator(0.01)
    majtickfmt = matplotlib.ticker.FormatStrFormatter("%.2f")
    ax.yaxis.set_major_locator(majtick)
    ax.yaxis.set_minor_locator(mintick)
    ax.yaxis.set_major_formatter(majtickfmt)

In [None]:
# Write the current figure to a PDF whose name carries the snapshot date
output_file = 'cscratch_file_size_and_mass_hist_%s.pdf' % (COL_DATE,)
fig.savefig(
    output_file,
    dpi=200,
    bbox_inches='tight',
    transparent=True,
)
print(" ".join(["Wrote output to", output_file]))

The HPC-IODC version of the paper does not include the file size distribution, so re-plot it here.  Should probably refactor the above code a little better so there's not so much copypaste duplication, but deadlines necessitated sloppy coding.  Sorry!

In [None]:
# Single-panel version of the file size distribution (fraction of file inodes
# in each size bin) for the snapshot selected by COL.
fig, ax = matplotlib.pyplot.subplots(figsize=(8, 3))

plot_df = (inode_ct_df / inode_ct_df.sum())
plot_df.index = ["%d %s" % humanize_units(x) for x in plot_df.index.values]

# Build the bar style here instead of copying the BAR_PARAMS global: another
# cell reassigns that name without a color, so re-running this cell afterwards
# would silently change the figure.
plot_params = dict(width=1.0, edgecolor='black', color='C0', alpha=ALPHA, label=DECODER_RING[COL])
plot_df[COL].plot.bar(ax=ax, **plot_params)

# This panel shows inode counts.  (A copy-pasted second set_ylabel call used to
# overwrite this label with "Fraction\ntotal capacity".)
ax.set_ylabel("Fraction\ntotal inodes")

# Relabel x axis: label only the first bin and every fourth bin
bin_labels = list(plot_df.index)
new_xticks = [index for index in range(len(bin_labels)) if ((index + 1) % 4) == 0 or index == 0]
new_labels = [bin_labels[index] for index in new_xticks]

# Visible x range: first through last bin that holds any inodes.  The earlier
# scan seeded min_x with index 0 unconditionally, so leading empty bins were
# never trimmed.
occupied_bins = numpy.flatnonzero(plot_df.sum(axis=1).values > 0)
min_x = int(occupied_bins[0]) if occupied_bins.size else 0
max_x = int(occupied_bins[-1]) if occupied_bins.size else 0

ax.set_xticks(new_xticks)
ax.set_xticklabels(new_labels, rotation=30, ha='right')
ax.set_xlabel("File size")
ax.set_xlim(min_x - 1, max_x + 2)

# set minor ticks for every bin
ax.xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))

ax.grid()
ax.set_axisbelow(True)

# Fixed y range with the upper limit just below 0.13 (NOTE(review): presumably
# tuned to this dataset -- bars taller than that would be clipped).
ax.set_ylim(-0.005, 0.1299)

majtick = matplotlib.ticker.MultipleLocator(0.04)
mintick = matplotlib.ticker.MultipleLocator(0.01)
majtickfmt = matplotlib.ticker.FormatStrFormatter("%.2f")
ax.yaxis.set_major_locator(majtick)
ax.yaxis.set_minor_locator(mintick)
ax.yaxis.set_major_formatter(majtickfmt)

In [None]:
# Write the current figure to a PDF whose name carries the snapshot date
output_file = 'cscratch_file_size_hist_%s.pdf' % (COL_DATE,)
fig.savefig(
    output_file,
    dpi=200,
    bbox_inches='tight',
    transparent=True,
)
print(" ".join(["Wrote output to", output_file]))

## Distribution of MDT mass from non-file inodes

The following distribution shows the MDT mass required by non-file inodes.  Directories can be very large if they contain many child inodes.  The other non-file inode types are relatively uninteresting.

In [None]:
# Stacked histogram of non-file inode counts by size bin for the snapshot
# selected by COL.  The y axis is broken in two: the upper panel is linear
# (0.1 to 1) and the lower panel is logarithmic (1e-9 to 0.1).
BAR_PARAMS = dict(width=1.0, edgecolor='black', alpha=ALPHA, label=DECODER_RING[COL])

label_map = {
    'num_dirs': "Directories",
    'num_symlinks': 'Symlinks',
    'num_blks': 'Block devices',
    'num_chrs': 'Character devices',
    'num_fifos': 'FIFOs',
    'num_socks': 'Sockets'
}

# Use the snapshot selected by COL (previously hard-coded, so the data could
# disagree with the COL-derived bar label and output file name).
source_df = dataframes[COL]
plot_df = source_df[[x for x in source_df if x != "num_files"]]
# Normalize to the total number of non-file inodes.  A new frame is created
# rather than dividing in place, so the selection taken from the cached
# dataframe is never modified.
plot_df = plot_df / plot_df.sum().sum()
plot_df.index = ["%d %s" % humanize_units(x) for x in plot_df.index.values]
plot_df = plot_df.loc[:, (plot_df != 0).any(axis=0)] # drop all zero columns
plot_df.columns = [label_map.get(x, x) for x in plot_df.columns]

fig, axes = matplotlib.pyplot.subplots(nrows=2, ncols=1, figsize=(8, 4), sharex=True)
fig.subplots_adjust(
    hspace=0.05,
    wspace=0.0
)

# The log scale part
ax = axes[1]
plot_df.plot.bar(stacked=True, ax=ax, **BAR_PARAMS)
ax.set_yscale('log')
ax.legend().set_visible(False)

# The linear scale part
ax = axes[0]
plot_df.plot.bar(stacked=True, ax=ax, **BAR_PARAMS)
ax.set_ylim(0.1, 1)
ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(0.2))
ax.yaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(0.1))
ax.yaxis.set_major_formatter(matplotlib.ticker.FormatStrFormatter("%.2f"))

# The bottom frame (regardless of what it is)
ax = axes[1]

# Relabel x axis: label only the first bin and every fourth bin
bin_labels = list(plot_df.index)
new_xticks = [index for index in range(len(bin_labels)) if ((index + 1) % 4) == 0 or index == 0]
new_labels = [bin_labels[index] for index in new_xticks]

# Visible x range: first through last bin that holds any non-file inodes.  The
# earlier scan seeded min_x with index 0 unconditionally, so leading empty bins
# were never trimmed.
occupied_bins = numpy.flatnonzero(plot_df.sum(axis=1).values > 0)
min_x = int(occupied_bins[0]) if occupied_bins.size else 0
max_x = int(occupied_bins[-1]) if occupied_bins.size else 0

ax.set_xticks(new_xticks)
ax.set_xticklabels(new_labels, rotation=30, ha='right')
ax.set_xlabel("Size")

ax.set_xlim(min_x - 1, max_x + 2)

ax.xaxis.set_minor_locator(matplotlib.ticker.MultipleLocator(1))
ax.tick_params(axis='x', which='major', length=8)

ax.set_ylim(1e-9, 0.1)

for ax in axes:
    ax.grid()
    ax.set_axisbelow(True)

# One y label shared by both panels, centered vertically on the figure
fig.text(0.02, 0.5,
         "Fraction of non-file inodes",
         verticalalignment='center',
         horizontalalignment='center',
         rotation='vertical')

# draw break between axes: hide the facing spines and draw short diagonal
# marks at the corners where the two panels meet
axes[0].spines['bottom'].set_visible(False)
axes[1].spines['top'].set_visible(False)
axes[0].xaxis.tick_top()
axes[0].tick_params(labeltop=False)
axes[1].xaxis.tick_bottom()

chop=0.015
axes[0].plot((-chop, chop), (-chop, chop), transform=axes[0].transAxes, color='k', linewidth=1.0, clip_on=False)
axes[0].plot((1-chop, 1+chop), (-chop, chop), transform=axes[0].transAxes, color='k', linewidth=1.0, clip_on=False)
axes[1].plot((-chop, chop), (1-chop, 1+chop), transform=axes[1].transAxes, color='k', linewidth=1.0, clip_on=False)
axes[1].plot((1-chop, 1+chop), (1-chop, 1+chop), transform=axes[1].transAxes, color='k', linewidth=1.0, clip_on=False)

In [None]:
# save output
# Write the current figure to a PDF whose name carries the snapshot date
output_file = 'cscratch_all_inode_hist_%s.pdf' % (COL_DATE,)
fig.savefig(
    output_file,
    dpi=200,
    bbox_inches='tight',
    transparent=True,
)
print(" ".join(["Wrote output to", output_file]))