# Investigate Missing Data

This notebook shows how to measure how much LMT data is missing on a given day and how to track the daily loss over a period of time.  This is useful when H5LMT files begin reporting that a significant fraction of their data is missing.

In [None]:
%matplotlib inline

In [None]:
import os
import datetime
import matplotlib
matplotlib.rcParams.update({'font.size': 16})
import matplotlib.pyplot as plt
import tokio
import tokio.tools

In [None]:
# Root of the per-day directory tree that holds the H5LMT files.
H5LMT_BASE = "/project/projectdirs/pma/www/daily"
# Day whose missing data is examined in detail.
DATE_OF_INTEREST = datetime.datetime(year=2017, month=8, day=21)
# Name of the H5LMT file to read from a day's directory.
FILE_SYSTEM_H5LMT = "cori_snx11168.h5lmt"

## Show missing data for one day

Determine whether there is any structure to the times and OSSes that contribute to the high rate of data loss by plotting missing data as a function of time and OSS ID.

In [None]:
# Locate the H5LMT file for the date of interest: <base>/<YYYY-MM-DD>/<file>.
day_dirname = DATE_OF_INTEREST.strftime("%Y-%m-%d")
h5lmt_path = os.path.join(H5LMT_BASE, day_dirname, FILE_SYSTEM_H5LMT)

# The handle stays open: the plotting cell reads f.filename for its title.
f = tokio.connectors.Hdf5(h5lmt_path)

# Slice out the full 2-D missing-data dataset.
plot_data = f['/FSMissingGroup/FSMissingDataSet'][:, :]

# Aspect ratio handed to matshow: half the matrix's column-to-row ratio.
row_count, col_count = plot_data.shape
plot_aspect = 0.5 * (float(col_count) / row_count)

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Title pieces come from the file path: the file name up to its first '.'
# and the name of the directory that contains the file.
file_label = os.path.basename(f.filename).split('.')[0]
day_label = os.path.dirname(f.filename).split(os.sep)[-1]
fig.suptitle('Missing Data - %s on %s' % (file_label, day_label))

# Draw the matrix as a heat map; matrix rows run along the y axis and
# matrix columns along the x axis.
heatmap = ax.matshow(plot_data, cmap='hot', aspect=plot_aspect)
ax.set_ylabel("OSS ID")
ax.set_xlabel("Timestep (5 seconds between timesteps)")

## Show how missing data has changed over time

Plot the daily fraction of missing data over the 28 days leading up to the date of interest.

In [None]:
# The trend window ends on the date of interest and reaches back 28 days.
lookback = datetime.timedelta(days=28)
date_end = DATE_OF_INTEREST
date_start = date_end - lookback

In [None]:
# Build the time series that the trend plot consumes:
#   x - calendar date of each H5LMT file found in the date window
#   y - sum of that file's missing-data matrix divided by its element count
#       (presumably entries are 0/1 flags, making this a fraction -- confirm)
x = []
y = []
for h5lmt_file in tokio.tools.hdf5.enumerate_h5lmts(FILE_SYSTEM_H5LMT,
                                                    date_start,
                                                    date_end):
    # Open each file in a context manager so its handle is released as soon
    # as the matrix has been read, instead of leaking one open file per day.
    # A loop-local name is used so the notebook-level handle `f` is not
    # overwritten by this loop.
    with tokio.connectors.Hdf5(h5lmt_file) as h5lmt:
        # The date is parsed from the name of the directory holding the file.
        date = datetime.datetime.strptime(
            os.path.dirname(h5lmt.filename).split(os.sep)[-1],
            "%Y-%m-%d").date()
        # Slice the dataset out while the file is still open.
        missing_data_matrix = h5lmt['FSMissingGroup/FSMissingDataSet'][:, :]
    missing = missing_data_matrix.sum()
    total = missing_data_matrix.shape[0] * missing_data_matrix.shape[1]
    x.append(date)
    y.append(float(missing) / total)

In [None]:
# Plot the per-day values in y against the dates in x.
fig, ax = plt.subplots(figsize=(15, 10))
# Title the figure with the H5LMT file name up to its first '.', so the plot
# identifies its data source when viewed on its own.
fig.suptitle('Fraction of Data Missing - %s' % FILE_SYSTEM_H5LMT.split('.')[0])
ax.plot(x, y, linewidth=4.0, marker='o', markersize=8.0)
ax.grid()
ax.set_xlabel("Date")
ax.set_ylabel("Fraction of data missing")
# Render tick labels as YYYY-MM-DD and let the figure lay them out for dates.
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y-%m-%d'))
fig.autofmt_xdate()