# snippets for using PDS Validate Tool from Jupyter

This is a goodie, not a core part of the conversion process.
Building Validate Tool directly into the Clementine conversion runs
is impractical because it can take up to a minute to spin up the
Java VM and load dictionaries; with the high file volume of Clementine,
this could add weeks or months to conversion. These are some little techniques
for running Validate Tool and viewing its output from Jupyter. These have
not been rigorously tested or verified, but you might like them.

Note that this all relies on having a link to the validate execution script,
or the script itself, in your path. Validate Tool is not included in this
bundle; get it from https://nasa-pds.github.io/validate/.

Note that Validate Tool is pretty resource-intensive; you can easily
overwhelm your system by getting too enthusiastic about running
a lot of instances of it at once. You might want to consider cranking
the starting memory of its VM down by changing the `-Xms2048m`
parameter in its execution script, because it definitely doesn't need
that much memory for the Clementine files, especially the EDR.

In [None]:
import fs
from fs.osfs import OSFS
from more_itertools import split_at
import sh
from toolz import valfilter

from clem_bulk import BgViewer

In [None]:
# Directory on the local disk that the validation cells below scan, plus a
# PyFilesystem OSFS handle on it (used for scandir / getsyspath).
output_root = '/home/ubuntu/data_temp/data'
output_fs = OSFS(output_root)

In [None]:
# several 'flavors'

def impure_background_validator(target, value, threshold, structure):
    """Conditionally launch a background validation of ``target``.

    Nothing happens unless ``value`` is above ``threshold``. When it is,
    ``validate -t <target>`` is started in the background, wrapped in a
    BgViewer, and appended to ``structure`` (which is mutated in place --
    hence "impure"). Always returns None.
    """
    if not (value <= threshold):
        structure.append(
            BgViewer(sh.validate("-t", target, _bg=True))
        )

def background_validator(target):
    """Start ``validate -t <target>`` in the background.

    Returns a BgViewer wrapping the running command.
    """
    running = sh.validate("-t", target, _bg=True)
    return BgViewer(running)

def background_no_data_validator(target):
    """Start ``validate -t <target> -D`` in the background.

    Returns a BgViewer wrapping the running command. The extra ``-D`` flag
    is what distinguishes this flavor; judging by the function name it is
    meant to skip validation of the data itself -- confirm against the
    Validate Tool documentation.
    """
    running = sh.validate("-t", target, '-D', _bg=True)
    return BgViewer(running)

def background_spot_validator(target, max_e=20, spot_freq=10):
    """Start a spot-checking validation of ``target`` in the background.

    ``max_e`` is forwarded as the ``max_errors`` option and ``spot_freq``
    as the ``spot_check_data`` option of the validate command. This flavor
    is also highly sensitive to failures.

    Returns a BgViewer wrapping the running command.
    """
    options = {"max_errors": max_e, "spot_check_data": spot_freq}
    running = sh.validate("-t", target, **options, _bg=True)
    return BgViewer(running)

In [None]:
# Launch spot validators for a slice of the entries under one UVVIS directory.
root = '/edr/uvvis/0750/'
validators = {}
for ix, lat_bin in enumerate(output_fs.scandir(root)):
    path = output_fs.getsyspath(fs.path.combine(root, lat_bin.name))
    # only entries 6 through 10 (in scandir order) get a validator, which
    # keeps the number of simultaneous Validate Tool instances small
    if 6 <= ix <= 10:
        validators[path] = background_spot_validator(path, max_e=5)

In [None]:
# Launch one spot validator per entry under /data/lwir, keyed by system path.
validators = {}
for lat_bin in output_fs.scandir('/data/lwir'):
    path = output_fs.getsyspath(fs.path.combine('/data/lwir', lat_bin.name))
    validators[path] = background_spot_validator(path, max_e=5)

In [None]:
# Alternative: label validation only, one validator per entry under /data/,
# keyed by system path.
validators = {}
for dataset in output_fs.scandir('/data/'):
    path = output_fs.getsyspath(fs.path.combine('/data/', dataset.name))
    validators[path] = background_no_data_validator(path)

In [None]:
# Report once no validator is running any more. The flags are built as a
# list so that every validator is polled, with no short-circuiting.
if all([
    not viewer.running_command.is_alive()
    for viewer in validators.values()
]):
    print("we're all done")

In [None]:
# number of validators launched (one entry per validated path)
len(validators)

In [None]:
# number of validators whose command is no longer running
sum(
    not viewer.running_command.is_alive()
    for viewer in validators.values()
)

In [None]:
# Gather the decoded stdout of every validator, keyed by the validated path.
# This blocks if it is run while any validators are still going.
validator_output = {
    directory: viewer.running_command.stdout.decode()
    for directory, viewer in validators.items()
}
# keep only the reports that mention 'FAIL' anywhere
failed_runs = {
    directory: report
    for directory, report in validator_output.items()
    if 'FAIL' in report
}
# stop here when any report contains a failure
assert not failed_runs

In [None]:
# Print every line containing 'FAIL' from the failed reports, followed by up
# to two lines of context.
failures = failed_runs.values()
for fail in failures:
    lines = fail.splitlines()
    for ix, line in enumerate(lines):
        if 'FAIL' in line:
            # Slice rather than index ix+1 / ix+2: a 'FAIL' on one of the
            # last two lines of a report would otherwise raise IndexError.
            print(*lines[ix:ix + 3], sep="\n")

In [None]:
# Pull out the full report text for one failed directory.
# NOTE(review): this key is under /home/ubuntu/buckets/clem_output, not under
# the output_root set earlier in this notebook -- adjust it to a path that is
# actually present in failed_runs.
failed_hires = failed_runs['/home/ubuntu/buckets/clem_output/data/hires']

In [None]:
# Cut the report into chunks at every line containing "completed".
# keep_separator=True keeps those lines in the output as chunks of their own.
# Each chunk is re-joined into one string; the result is a lazy iterator.
hires_report_lines = failed_hires.splitlines()
hires_report_chunks = split_at(
    hires_report_lines,
    lambda report_line: "completed" in report_line,
    keep_separator=True,
)
failed_hires = map("\n".join, hires_report_chunks)

In [None]:
# Save every validator report, newline-separated, to a single text file.
# The reports were decoded from bytes with the default UTF-8 codec, so they
# are written as UTF-8 explicitly instead of with the locale's default
# encoding, which could fail on characters it cannot represent.
with open("mosaic_label_validator_output.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(validator_output.values()))