# Directory Size Distribution

Plot the distribution of directory sizes--that is, the number of child inodes within each directory.

In [None]:
%matplotlib inline

In [None]:
import os

import matplotlib
# Raise matplotlib's global default font size to 16.
matplotlib.rcParams['font.size'] = 16
import matplotlib.pyplot
import pandas

## Plot directory child distribution

This plot includes _all_ types of inodes, so a directory containing 100 files is treated no differently from a directory containing 100 directories.  However, only directory inodes are valid as parent inodes; thus, anything counted as empty is truly an empty directory and not a file (which simply cannot have child inodes).

The input CSV can be generated by running

    ./histogram.py -t child_counts -c count cpurge01_20181109_dirdist.db

In [None]:
# Load the pre-binned histogram CSV, index it by the bin-size column, and
# keep only the column of per-bin counts.
series = pandas.read_csv(
    'datasets/cscratch_20181109_dirdist_hist.csv',
    index_col='bin_size',
)['num_child_counts']

In [None]:
def draw_histo(plot_df, *args, **kwargs):
    """Draw a bar-chart histogram of pre-binned counts and return its axes.

    The x-axis is cropped to the span between the first and last bins that
    hold a positive value in ``plot_df`` itself (for a DataFrame, a bin
    counts as populated when any column is positive).  If no bin is
    positive, matplotlib's default x-limits are left in place.

    Args:
        plot_df: pandas Series or DataFrame with one row per histogram bin;
            the bars are drawn with ``plot_df.plot.bar``.
        *args: Accepted for backward compatibility; ignored.
        **kwargs: Extra keyword arguments forwarded to ``plot_df.plot.bar``.
            They override the defaults ``width=1.0`` and
            ``edgecolor="black"``.

    Returns:
        The matplotlib Axes the bars were drawn on.
    """
    bar_args = {
        "width": 1.0,
        "edgecolor": "black",
    }
    bar_args.update(kwargs)
    _fig, ax = matplotlib.pyplot.subplots(figsize=(8, 6))
    plot_df.plot.bar(ax=ax, **bar_args)

    # Keep only every fourth tick label so the labels do not overlap.
    labels = [label.get_text() for label in ax.get_xticklabels()]
    new_xticks = list(range(0, len(labels), 4))
    ax.set_xticks(new_xticks)
    ax.set_xticklabels([labels[pos] for pos in new_xticks], rotation=30, ha='right')

    ax.grid()
    ax.set_axisbelow(True)

    # Find populated bins from the data actually being plotted rather than
    # from a notebook-level global, so the cropping always matches the bars.
    populated = plot_df > 0
    if getattr(populated, "ndim", 1) > 1:
        populated = populated.any(axis=1)
    nonzero_positions = [pos for pos, flag in enumerate(populated) if flag]

    if nonzero_positions:
        # Leave one empty slot on the left and a little extra on the right.
        ax.set_xlim(nonzero_positions[0] - 1, nonzero_positions[-1] + 2)
    return ax

In [None]:
# Normalize each bin by the grand total so the bars show fractions.
ax = draw_histo(series.div(series.sum()), color="C0")
ax.set_ylabel("Fraction of total parent inodes")
ax.set_xlabel("Number of child inodes")

In [None]:
# NOTE(review): TOTAL_DIRS is a hard-coded count; its origin is not shown
# in this notebook -- confirm it matches the dataset loaded into `series`.
TOTAL_DIRS = 99443114
NONEMPTY_DIRS = series.sum()

# Report each count in millions, one line per category.
for template, dir_count in (
    ("Total directories:     %.1f M", TOTAL_DIRS),
    ("Non-empty directories: %.1f M", NONEMPTY_DIRS),
    ("Empty directories:     %.1f M", TOTAL_DIRS - NONEMPTY_DIRS),
):
    print(template % (dir_count / 1.0e6))

## Analyze aggregate directory size distributions

In [None]:
# Map each file system name to the histogram CSV holding its data.
inputs = dict(
    cscratch='datasets/cscratch_20190115_dirdist_hist.csv',
)

In [None]:
# Load one count series per file system, indexed by the bin-size column.
input_data = {
    fsname: pandas.read_csv(inputfile, index_col='bin_size')['num_child_counts']
    for fsname, inputfile in inputs.items()
}

In [None]:
# One column per file system, one row per bin.
plot_df = pandas.DataFrame(input_data)

# Scale to millions so the y-axis matches its label.
ax = draw_histo(plot_df.div(1e6), stacked=True)
ax.set_ylabel("Millions of directories")
ax.set_xlabel("Number of child inodes")

In [None]:
# Divide by the total over every bin and every column so the stacked bars
# sum to one across the whole plot.
ax = draw_histo(plot_df.div(plot_df.sum().sum()), stacked=True)
ax.set_ylabel("Fraction of total directories")
ax.set_xlabel("Number of child inodes")