In [None]:
# IPython line magic: select matplotlib's inline backend so figures render in the notebook output.
%matplotlib inline

In [None]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
import tokio
import tokio.tools
import tokio.config
# Default font size applied to every figure drawn below.
matplotlib.rcParams['font.size'] = 14

## Define input parameters

To generate a Lustre activity heat map, you must define the start time, end time, and file system of interest.  Valid file systems to use in the `file_system` variable are printed by the following cell.

In [None]:
# The keys of the 'hdf5_files' entry in the tokio configuration are the accepted file system names.
known_file_systems = tokio.config.CONFIG['hdf5_files'].keys()
print("Valid file systems are: " + ", ".join(known_file_systems))

Define the start time, end time, and file system name below.

In [None]:
# Inputs for the rest of the notebook: the file system to inspect and the time window to plot.
file_system = 'cscratch'
start_time = datetime.datetime(year=2017, month=10, day=17, hour=20, minute=26, second=6)
end_time = datetime.datetime(year=2017, month=10, day=17, hour=21, minute=56, second=14)

# Echo the window so it is visible next to the generated figures.
print("Start time: %s" % (start_time,))
print("End time:   %s" % (end_time,))

## Generate heat maps

You shouldn't have to modify any code below here.

In [None]:
def _fetch_dataset(dataset_name):
    """Load ``dataset_name`` for ``file_system`` between ``start_time`` and ``end_time``."""
    return tokio.tools.hdf5.get_dataframe_from_time_range(
        fsname=file_system,
        dataset_name=dataset_name,
        datetime_start=start_time,
        datetime_end=end_time)

# Each entry is a (dataframe, figure title) pair.  The byte datasets are
# divided by 2**30 so that their values are expressed in GiB.
plot_datasets = [
    (_fetch_dataset('/datatargets/readbytes') / 2.0**30, "Read Rate"),
    (_fetch_dataset('/datatargets/writebytes') / 2.0**30, "Write Rate"),
]

# CPU load datasets are plotted as loaded, without rescaling.
plot_datasets_cpu = [
    (_fetch_dataset('/dataservers/cpuload'), "OSS CPU Load"),
    (_fetch_dataset('/mdservers/cpuload'), "MDS CPU Load"),
]

In [None]:
def plot_dataframe(plot_df, xlabel="x", ylabel="y", zlabel="z", title=None):
    """Draw a heat map of ``plot_df`` with a colorbar and a summed line plot.

    The figure uses a 2x2 grid: the heat map goes in the top-left cell, its
    colorbar in the top-right cell, and a line plot of the sum across all
    columns at each index entry in the bottom-left cell.  The index of
    ``plot_df`` supplies the x axis; each column becomes one row of the
    heat map.

    Args:
        plot_df: DataFrame to plot.  Every index entry must provide a
            ``to_pydatetime()`` method (e.g. a pandas ``DatetimeIndex``).
        xlabel (str): Label for the x axis of the line plot.
        ylabel (str): Label for the y axis of the line plot.
        zlabel (str): Label for the colorbar.
        title (str or None): Figure title; no title is drawn when falsy.

    Returns:
        The matplotlib axes that contain the heat map.
    """
    fig = plt.figure()
    fig.set_size_inches(10, 8)
    fig.subplots_adjust(hspace=0.0, wspace=0.0)
    if title:
        fig.suptitle(title)

    gridspec = matplotlib.gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[3, 1])

    ### generate the heatmap: index along x, one row per column of plot_df
    ax_hm = fig.add_subplot(gridspec[0])
    num_rows = len(plot_df.columns)
    x = [timestamp.to_pydatetime() for timestamp in plot_df.index]
    y = range(num_rows)
    z = plot_df.T
    cs = ax_hm.pcolormesh(
            x,
            y,
            z,
            cmap="hot")

    ### force plot to only show the range of data we're plotting and nothing more
    ax_hm.set_ylim(0, num_rows - 1)

    ### build tick labels for y axis
    # Keep only ticks that sit exactly on a row inside the visible range.
    # This prevents negative ticks from indexing columns from the end, and
    # fractional ticks from repeating a neighbouring row's label.
    tick_positions = [
        tick for tick in ax_hm.get_yticks()
        if 0 <= tick <= num_rows - 1 and float(tick).is_integer()
    ]
    # Pin the tick positions before labelling them so that every label is
    # guaranteed to be drawn at the row it names.
    ax_hm.set_yticks(tick_positions)
    ax_hm.set_yticklabels([plot_df.columns[int(tick)] for tick in tick_positions])

    ### add the colorbar
    ax_cbar = fig.add_subplot(gridspec[1])
    fig.colorbar(cs, cax=ax_cbar)
    ax_cbar.set_ylabel(zlabel)

    ### plot the sum over all columns for each index entry
    ax_line = fig.add_subplot(gridspec[2])
    ax_line.plot(plot_df.sum(axis=1))
    ax_line.grid()

    ### lock the xticks of both plots together
    x_limits = [x[0], x[-1]]
    ax_hm.set_xlim(x_limits)
    ax_line.set_xticks(ax_hm.get_xticks())
    # Apply the limits after copying the ticks: setting explicit ticks can
    # widen the view limits, which would misalign the two panels.
    ax_line.set_xlim(x_limits)

    ### set time format
    ax_line.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%H:%M'))
    ax_line.set_ylabel(ylabel,
                       rotation='horizontal',
                       fontsize=14,
                       horizontalalignment='right',
                       verticalalignment='top')
    ax_line.set_xlabel(xlabel)
    fig.autofmt_xdate()
    return ax_hm

## Read and write rates on Lustre

The following heat maps show time on the _x_ axis, each OST in the file system on the _y_ axis, and the per-OST performance delivered as the _z_ color.  Long horizontal stripes in x indicate that a single OST was busy (usually when a file with a stripe width of 1 is being read or written), and vertical bands indicate coherent parallel I/O is happening.

In [None]:
# Draw one heat map figure per rate dataset and print the total for each.
for rate_df, rate_title in plot_datasets:
    ax_hm = plot_dataframe(
        rate_df,
        title=rate_title,
        ylabel="GiB/sec",
        zlabel="GiB/sec")
    # NOTE(review): the 5.0 factor presumably is the sample interval in seconds,
    # turning summed per-second rates into a total volume - confirm against the data source.
    total_gib = rate_df.sum().sum() * 5.0
    print("Total %.f GiB %s" % (total_gib, rate_title.split()[0]))

## OSS and MDS CPU loads

The following heat maps show how busy the CPUs on each OSS and the primary MDS were.  Each OSS's load will range from 0 to 100.  The higher the CPU load, the more work the OSS is doing to service I/O requests.  This is not necessarily bad, but a high CPU load along with low I/O rate indicates something bad is happening on Lustre.

Note that the _y_ axis of the line graph is the sum of CPU loads which, on an absolute scale, does not have any intrinsic meaning.  Also note that the heat map in the MDS CPU load figure is absent because LMT currently only monitors a single MDS, so there is only one _y_ data point.

In [None]:
# Draw one heat map figure per CPU load dataset.
for load_df, load_title in plot_datasets_cpu:
    plot_dataframe(
        load_df,
        title=load_title,
        ylabel="% Load",
        zlabel="% Load")