# single-band DIM conversion handler

Top-level handler notebook for converting single-band DIMs to PDS4.

Note: the paths in this notebook are hard-coded; change them to reflect the actual locations of the input PDS3 products (and the desired output root) on your system.


In [None]:
import time
import warnings
from multiprocessing import Pool
from pathlib import Path

import numpy as np
import pandas as pd
from cytoolz import frequencies
from pdr.pdr import DuplicateKeyWarning

from utilz import index_breadth_first, make_edr_lidmap, print_inline
from vo_conversion import VikingDIMConverter, VikingDIMBrowseWriter

# root of the output tree; convert_dim writes converted products into
# subdirectories of this path. hard-coded: adjust for your system.
output_root = Path("/datascratch/viking/scratch_write/")

In [None]:
# index the DIM archive. index_breadth_first is a project helper; the
# resulting table carries (at least) the 'path', 'excluded', 'directory',
# 'ATIME', 'CTIME', and 'MTIME' columns used below.
dim_df = pd.DataFrame(
    index_breadth_first("/datascratch/viking/vo1_vo2-m-vis-5-dim-v1.0/")
)
# keep only entries whose path contains '/m' and ends in 'img'
# (presumably the single-band DIM image files -- confirm against the
# archive layout), renumber the rows, and discard the bookkeeping columns.
dim = (
    dim_df
    .loc[lambda df: df['path'].str.contains('/m')]
    .loc[lambda df: df['path'].str.endswith('img')]
    .reset_index(drop=True)
    .drop(columns=['excluded', 'directory', 'ATIME', 'CTIME', 'MTIME'])
    .copy()
)

In [None]:
# derive resolution codes and latitude bins from each product's filename;
# convert_dim uses them to lay out the output directory tree.

# one-letter resolution code (second character of the filename) ->
# zero-padded name of the corresponding output directory
resmap = {
    "c": "0004",
    "e": "0016",
    "g": "0064",
    "i": "0256",
    "j": "0512",
    "k": "1024",
}

# filename-derived columns: extension without its leading dot, full
# filename, and filename without extension
paths = dim['path'].map(Path)
dim['band'] = paths.map(lambda p: p.suffix[1:])
dim['name'] = paths.map(lambda p: p.name)
dim['stem'] = paths.map(lambda p: p.stem)

# fixed-position fields of the filename: character 0 is stored as 'dtype',
# character 1 as the resolution code, characters 2-3 are parsed as an
# integer latitude, and character 4 is kept as its sign/hemisphere suffix
dim['dtype'] = dim['name'].str[0:1]
dim['res'] = dim['name'].str[1:2]
latsign = dim['name'].str[4:5]
lat = dim['name'].str[2:4].astype(int)

# round the latitude down to a multiple of 10, pad it to two digits, and
# append the sign character
dim['lat_bin'] = (lat // 10 * 10).astype(str).str.zfill(2) + latsign

# path components, one column per '/'-separated segment
parts = dim['path'].str.split("/", expand=True)

In [None]:
"""
there are dupes in the set, but they are all byte-level equivalent.
run the commented-out code if you would like to verify.
"""
# verification snippet: hashes every file that shares a name with another
# and checks that all copies of a given name have the same MD5 digest.
#
# from hashlib import md5
# dupes = dim.loc[dim['name'].duplicated(keep=False)]
# for fn, group in dupes.groupby('name'):
#     hashes = []
#     for file in group['path']:
#         hasher = md5()
#         with open(file, 'rb') as stream:
#             hasher.update(stream.read())
#         hashes.append(hasher.hexdigest())
#     assert len(set(hashes)) == 1

# retain only the first occurrence of each filename
dim = dim.drop_duplicates(subset='name', keep='first')

In [None]:
# index the EDRs to associate map-projected products with their source products.
# EDR_ROOT is hard-coded; adjust for your system.
# NOTE(review): this path sits under the "data" tree of output_root --
# presumably the output of an earlier EDR conversion run; confirm.
EDR_ROOT = '/datascratch/viking/scratch_write/data/edr'
# the resulting map is handed to every VikingDIMConverter built in convert_dim
edr_lidmap = make_edr_lidmap(EDR_ROOT)

In [None]:
def convert_dim(row, write_browse=True):
    """
    handler function for converting a single product. constructs a VikingDIMConverter
    and uses it to write a PDS4 data product; then uses that converter to construct
    a VikingDIMBrowseWriter to write an associated browse product. Note that the
    same objects are used for the single-band DIMs and the DTMs (their formats
    are very similar).

    row: mapping-like record for one input product (here, a row of the `dim`
        table); its 'path', 'res', and 'lat_bin' entries are read.
    write_browse: if True, also write the browse product and its label.

    data products go to <output_root>/data/dim/<resolution>/<lat_bin>; browse
    products go to the parallel directory under <output_root>/browse.

    returns 0 on completion. exceptions (including RuntimeWarnings, which are
    promoted to errors here) propagate to the caller.
    """
    warnings.simplefilter("ignore", category=DuplicateKeyWarning)
    # treat RuntimeWarnings as failures rather than letting them pass silently
    warnings.simplefilter("error", category=RuntimeWarning)
    converter = VikingDIMConverter(row['path'], edr_lidmap)
    # the portion of the output path shared by the data and browse trees
    subtree = Path("dim", resmap[row['res']], row['lat_bin'])
    output_directory = Path(output_root, "data", subtree)
    output_directory.mkdir(parents=True, exist_ok=True)
    converter.write_file('image', output_directory)
    converter.convert_label()
    converter.write_label(output_directory)
    if write_browse is True:
        browse = VikingDIMBrowseWriter(converter)
        # build the browse path from its components rather than by
        # substituting "/data/" in the stringified data path: substitution
        # would also rewrite any "/data/" segment inside output_root, and
        # would silently match nothing if the path began with "data/".
        browse_output_directory = Path(output_root, "browse", subtree)
        browse_output_directory.mkdir(parents=True, exist_ok=True)
        browse.write_file("image", browse_output_directory)
        browse.convert_label()
        browse.write_label(browse_output_directory)
    return 0

In [None]:
# fan convert_dim out over the input products using a pool of 5 worker
# processes; each task is keyed by the product's index in `dim`
pool = Pool(5)
results = {
    ix: pool.apply_async(convert_dim, (row, True))
    for ix, row in dim.iterrows()
}
# no further tasks will be submitted
pool.close()

# poll once per second, printing "<finished>/<total>" until every task
# reports ready (whether it succeeded or raised)
ready = []
while len(ready) < len(results):
    print_inline(f"{len(ready)}/{len(results)}")
    ready = [ix for ix, task in results.items() if task.ready()]
    time.sleep(1)
print_inline(f"{len(ready)}/{len(results)}")

In [None]:
# collect the outcome of every task: convert_dim's return value on success,
# or the exception object it raised on failure
final = {}
for key, task in results.items():
    try:
        outcome = task.get()
    except Exception as ex:
        # KeyboardInterrupt is not an Exception subclass, so an interrupt
        # still propagates instead of being recorded as a failure
        outcome = ex
    final[key] = outcome
pool.terminate()

# the subset of tasks that failed
broken = {
    key: outcome
    for key, outcome in final.items()
    if isinstance(outcome, Exception)
}
# show error messages (if any) along with their frequencies of occurrence
frequencies(str(error) for error in broken.values())