# m3 indexer

A small notebook for generating the collection inventory files (data,
document, and browse) for the M3 bundle. Properly speaking, it is not
a core part of the conversion process, but it is mildly useful.

In [1]:
from fs.osfs import OSFS
import pandas as pd

from converter_utils import eqloc, reloc
from m3_bulk import basenamer

In [2]:
# Filesystem object rooted at the bundle's top-level directory; the later
# cells list the bundle's files through it.
bundle_root = '/home/ubuntu/bundle'
bundle_fs = OSFS(bundle_root)

In [3]:
# Walk the whole bundle and record every file path in a one-column
# DataFrame ('path'); the later cells filter this table per collection.
bundle_file_paths = list(bundle_fs.walk.files())
out_df = pd.DataFrame(bundle_file_paths, columns=["path"])

In [4]:
# Pick out the rows of out_df for the data collection and keep only their
# 'path' column, copied so the column added below does not touch out_df.
# NOTE(review): reloc comes from converter_utils; presumably it selects rows
# whose 'path' matches the given regex -- confirm against that module.
data_path_rows = reloc(out_df, 'path', r'data/.*\.', method='match')
data_files = data_path_rows[['path']].copy()

# Product ID stem: the text from 'm3' through the first 'l<digit>' (with an
# optional trailing 'b') that is immediately followed by '.' or '_'.
stem_pattern = r'(m3.*?l\db?)[\._]'
data_files['product_id_stem'] = data_files['path'].str.extract(stem_pattern)

# Number of files per stem; meant to be inspected by eye as a sanity check.
grouped_ids = data_files['product_id_stem'].value_counts()

In [5]:
# Build one LIDVID per distinct product ID stem in the data collection.
# A PDS4 product LID has the form
#   urn:nasa:pds:<bundle>:<collection>:<product>
# so 'urn:nasa:pds:' must appear exactly once at the front; '::1.0' is the
# version ID appended to every product.
data_product_lidvids = [
    'urn:nasa:pds:ch1_m3:data:' + stem + '::1.0'
    for stem in data_files['product_id_stem'].unique()
]
# lid uniqueness check. The stems already come from unique(), so this only
# guards against a later change to how the list above is built.
assert len(data_product_lidvids) == len(set(data_product_lidvids))

In [6]:
# all our products are primary ('P'). also, readmes in this bundle will
# be added manually, not as part of this automated process.

# One inventory row per data product: member status first, then the LIDVID.
collection_data_inventory = pd.DataFrame(
    {'type': 'P', 'lidvid': data_product_lidvids}
)

# Written without header or index, with CRLF line endings.
# 'lineterminator' is the current pandas spelling of this option; the old
# 'line_terminator' keyword was deprecated in pandas 1.5 and removed in 2.0.
collection_data_inventory.to_csv(
    './directories/m3/collection_data_inventory.csv',
    header=False,
    index=False,
    lineterminator='\r\n',
)

In [7]:
# Load the document mapping table; the following cells read its 'newpath',
# 'path', 'label_type' and 'root' columns to build document LIDVIDs.
document_mappings_csv = './directories/m3/m3_document_mappings.csv'
doc_df = pd.read_csv(document_mappings_csv)

In [8]:
# Assign a LIDVID to every document row in doc_df.
#
# The document inventory is exported from the 'lidvid' column, so both the
# calibration documents and the remaining "label"-typed oddballs must be
# written there. Calibration LIDVIDs are also mirrored into 'product_id',
# the column they were originally stored in, for anything that still reads it.
#
# A PDS4 product LID has the form urn:nasa:pds:<bundle>:<collection>:<product>,
# so 'urn:nasa:pds:' appears exactly once in the prefix.
document_lid_prefix = 'urn:nasa:pds:ch1_m3:document:'

doc_df['product_id'] = ''
# Do not clobber a 'lidvid' column that may already have come from the CSV.
if 'lidvid' not in doc_df.columns:
    doc_df['lidvid'] = ''

# Maps a calibration type, as it appears in 'newpath', to the suffix appended
# to the lower-cased base name to form the product ID.
calibrations = {
    'flat_field':'_ff',
    'bad_detector':'_bde',
    'smooth_shape':'_ssc',
    'reduction_pipeline':'_log'
}

for calibration, abbreviation in calibrations.items():
    # NOTE(review): reloc presumably selects the rows whose 'newpath' matches
    # the calibration name -- confirm against converter_utils.
    cal_slice = reloc(doc_df,'newpath', calibration)
    cal_lidvids = (
        document_lid_prefix
        + cal_slice['path'].apply(basenamer).str.lower()
        + abbreviation
        + '::1.0'
    )
    doc_df.loc[cal_slice.index, 'lidvid'] = cal_lidvids
    doc_df.loc[cal_slice.index, 'product_id'] = cal_lidvids

# NOTE(review): eqloc presumably selects rows whose 'label_type' equals
# 'label'. If any of those rows is also a calibration row, the assignment
# below takes precedence for its 'lidvid' -- confirm the two sets are disjoint.
oddballs = eqloc(doc_df,'label_type','label')

doc_df.loc[
    oddballs.index, 'lidvid'
] = document_lid_prefix + oddballs['root'] + '::1.0'

In [9]:
# all our products are primary ('P'). also, explanatory documentation in this
# collection will be added to the inventory manually, not as part of this
# automated process.

doc_df['type'] = 'P'

# Inventory rows are (member status, LIDVID); copy so the export frame is
# independent of doc_df.
collection_document_inventory = doc_df[['type', 'lidvid']].copy()

# Written without header or index, with CRLF line endings.
# 'lineterminator' is the current pandas spelling of this option; the old
# 'line_terminator' keyword was deprecated in pandas 1.5 and removed in 2.0.
collection_document_inventory.to_csv(
    './directories/m3/collection_document_inventory.csv',
    header=False,
    index=False,
    lineterminator='\r\n',
)

In [18]:
# Build the browse collection inventory rows. All our products are
# primary ('P').

# NOTE(review): reloc presumably selects the rows of out_df whose 'path'
# matches the regex, i.e. the .xml files under browse/ -- confirm against
# converter_utils.
s3_browse_files = reloc(out_df,'path',r'browse/.*\.xml', method='match')[['path']].copy()
# Product ID stem: everything from 'm3' up to the first '.xml'.
s3_browse_files['product_id_stem'] = s3_browse_files['path'].str.extract(r'(m3.*?.)\.xml')
grouped_ids = s3_browse_files['product_id_stem'].value_counts() # inspect for sanity check
# A PDS4 product LID has the form urn:nasa:pds:<bundle>:<collection>:<product>,
# so 'urn:nasa:pds:' appears exactly once at the front.
s3_browse_files['lidvid'] = (
    'urn:nasa:pds:ch1_m3:browse:' + s3_browse_files['product_id_stem'] + '::1.0'
)
s3_browse_files['type'] = 'P'

In [19]:
# Inventory rows are (member status, LIDVID); copy so the export frame is
# independent of s3_browse_files.
collection_browse_inventory = s3_browse_files[['type', 'lidvid']].copy()

# Written without header or index, with CRLF line endings.
# 'lineterminator' is the current pandas spelling of this option; the old
# 'line_terminator' keyword was deprecated in pandas 1.5 and removed in 2.0.
collection_browse_inventory.to_csv(
    './directories/m3/collection_browse_inventory.csv',
    header=False,
    index=False,
    lineterminator='\r\n',
)