# m3 document collection workspace

This is a very simple notebook that:
1. writes PDS4 versions of the calibration products from the PDS3 archive
2. moves other document collection items to their correct locations,
   and checks that LIDs and filenames match

In [None]:
import datetime as dt
import os
from collections import namedtuple

import pandas as pd
import sh

from m3_bulk import crude_time_log
from converter import place_label, enforce_name_match
from converter_utils import name_root, eqloc

In [None]:
# mount the S3 buckets with s3fs; don't bother doing this multiple times
# if you're running this notebook concurrently
# sh.s3fs(
#     'mc-al-khwarizmi',
#     './remote/m3_input'
# )
# sh.s3fs(
#     'mc-al-khwarizmi-m3-output', 
#     './remote/m3_output'
# )

In [None]:
# mappings for all document collection files
from src.m3_conversion import M3FlatFieldWriter, M3PipelineLogWriter, \
    M3BDEWriter, M3SSCWriter

# one row per document product; the cells below read its `root`, `path`,
# `newpath`, `use_local` and `label_type` columns
doc_path_df = pd.read_csv('./directories/m3/m3_document_mappings.csv')


# every 'oddball' one-off label file found anywhere under ./labels/m3/,
# tabulated as (path on disk, name_root() of that path)
label_files = [
    os.path.join(folder, label_name)
    for folder, _subfolders, label_names in os.walk('./labels/m3/')
    for label_name in label_names
]
label_df = pd.DataFrame(
    [
        {'local_path': label_path, 'filename': name_root(label_path)}
        for label_path in label_files
    ]
)

# Some products have locally-converted or edited versions (PDF compliance,
# etc.); gather everything under ./converted_oddballs/m3/ the same way.
# note: EARTH_VIEW_IMAGE isn't included in the github repo because it's large,
# but it's just a simple conversion using rasterio and converter_utils.fitsify()
oddball_files = [
    os.path.join(folder, converted_name)
    for folder, _subfolders, converted_names
    in os.walk('./converted_oddballs/m3/')
    for converted_name in converted_names
]
oddball_df = pd.DataFrame(
    [
        {'local_path': converted_path, 'filename': name_root(converted_path)}
        for converted_path in oddball_files
    ]
)

In [None]:
# mount points of the bundles: products are read from under input_dir and
# written under output_dir (the mapping's `path` / `newpath` values are
# appended directly to these strings)
input_dir, output_dir = '/home/ubuntu/m3_input', '/home/ubuntu/m3_output'

In [None]:
# sanity check: do we have an individual label for each thing we think needs one?
# The set of label names is built once up front instead of being re-derived
# from label_files for every row of the mapping.
available_label_names = set(map(name_root, label_files))
missing_labels = [
    root_name
    for root_name
    in eqloc(doc_path_df, 'label_type', 'label')['root'].values
    if root_name not in available_label_names
]
# name the offenders so a failure is actionable
assert not missing_labels, (
    "no label file found for: " + ", ".join(map(str, missing_labels))
)

In [None]:
# Writer classes for template-driven products. The keys are tried in this
# order against a product's `newpath`; the first one contained in it wins.
# note on flat fields: a handful of these are missing envi headers,
# which made rasterio choke; as all their headers (within global/target)
# are identical, we just make copies for them
_TEMPLATE_WRITERS = (
    ('reduction_pipeline', M3PipelineLogWriter),
    ('flat_field', M3FlatFieldWriter),
    ('bad_detector_element', M3BDEWriter),
    ('smooth_shape_curve', M3SSCWriter),
)

# this is just a stupid bandaid for logging: a minimal object carrying a
# pds4_label_file attribute to pass to crude_time_log
# (NOTE(review): presumably that is the only attribute it reads -- confirm)
_LogTarget = namedtuple('stupid_bandaid', 'pds4_label_file')


def _run_validate(label_path):
    """Run Validate Tool on label_path; append its stdout to validate_dump.txt."""
    validate_results = sh.validate("-t", label_path)
    with open("validate_dump.txt", "a") as dump:
        dump.write(validate_results.stdout.decode())
    print("validated successfully")


# Convert / place every product listed in the document mapping, one row at a
# time: pick the source file, create the destination directory, write the
# product + label, check names, validate, and log the elapsed time.
for product in doc_path_df.itertuples():
    # NOTE(review): earth_view_image is skipped unconditionally; presumably
    # it is handled separately -- confirm
    if product.root == 'earth_view_image':
        continue
    doc_start_time = dt.datetime.now()
    print(product.root, product.path, product.Index)

    # str() makes this test behave the same whether read_csv kept the column
    # as text or inferred booleans; a bare boolean False never equals the
    # string 'False', which would send every row down the local-file path.
    if str(product.use_local) != 'False':
        # enforce match between putative filename root and local file
        try:
            product_file = eqloc(
                oddball_df, "filename", product.root
            )["local_path"].values[0]
        except IndexError as err:
            raise ValueError(
                "missing local version for " + product.root
            ) from err
    else:
        product_file = input_dir + product.path
    sh.mkdir('-p', output_dir + product.newpath)

    if product.label_type == 'label':
        writer = place_label(product, label_df, product_file, output_dir)
        # validate every oddball; these labels were all manually
        # generated
        _run_validate(writer['label'])
    elif product.label_type == 'template':
        writer_class = next(
            (
                candidate
                for key, candidate in _TEMPLATE_WRITERS
                if key in product.newpath
            ),
            None,
        )
        if writer_class is None:
            # Without this guard the checks below would run against the
            # writer left over from an earlier row (or hit a NameError on
            # the first row), silently validating the wrong product.
            raise ValueError(
                "no template writer matches " + str(product.newpath)
            )
        writer = writer_class(product_file)
        writer.write_pds4(
            output_dir + product.newpath + "/",
            write_product_files=True
        )
        # enforce match between LID, filename, and putative product
        enforce_name_match("".join(writer.PDS4_LABEL), writer.pds4_root)
        # validate only one in 10 of these
        if product.Index % 10 == 0:
            print("0-mod-10th templated document: running Validate Tool")
            _run_validate(writer.pds4_label_file)

    # fall back to the writer's root when the mapping has no usable (string)
    # root for this row
    if isinstance(product.root, str):
        root = product.root
    else:
        root = writer.pds4_root

    # measure once so the log entry and the printed figure agree
    elapsed_seconds = str(
        (dt.datetime.now() - doc_start_time).total_seconds()
    )
    crude_time_log(
        "m3_document_conversion_log",
        _LogTarget(product.newpath + '/' + root + '.xml'),
        elapsed_seconds
    )
    print("done with this document; total seconds " + elapsed_seconds)