In [10]:
import os
import re

import django
import fs.path
import numpy as np
import pandas as pd
from django.core.exceptions import ObjectDoesNotExist
from fs.osfs import OSFS
from toolz import keyfilter

# Point Django at this project's settings module; setdefault leaves any value
# already present in the environment untouched.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mastspec.settings")
# NOTE(review): presumably required because the notebook kernel runs inside an
# event loop, which makes Django refuse synchronous ORM calls -- confirm.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Initialise Django here, ahead of the model imports that follow.
django.setup()

from plotter.models import *
from plotter_utils import modeldict

In [28]:
# Remove every existing MSpec row before re-ingesting. Rows are deleted one at
# a time through each instance's delete() rather than with a single
# queryset-level delete.
for spec in MSpec.objects.all():
    spec.delete()
# Display the queryset again as a sanity check that nothing is left.
MSpec.objects.all()

<QuerySet []>

In [4]:
# Source data directory: holds the marslab CSV files plus the DCS_ROI_images
# and RGB_images folders listed in later cells.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
input_fs = OSFS("/home/michael/Desktop/mcam_spect_data_conversion/data/mcam")
# Output filesystem rooted at the current working directory.
output_fs = OSFS('.')
# System path of the static image directory under output_fs; not referenced
# again in the cells visible here.
output_image_dir = output_fs.getsyspath("static_in_pro/our_static/img")

In [5]:
# Load the per-observation metadata table and lower-case its column names.
metaframe = pd.read_csv(input_fs.getsyspath('Metadata-marslab.csv'))
metaframe.columns = [column.lower() for column in metaframe.columns]
# add NaNs back in so we can programmatically delete them
# (reassigned rather than mutated in place so the lineage stays explicit)
metaframe = metaframe.replace('-', np.nan)
# we're turning these to ints when we ingest them,
# but python doesn't like statements like int('3.0'),
# so turn to float as an intermediate step
# (each column is listed once; converting a column twice was redundant)
for column in [
    'sol', 'site', 'drive', 'rover_elevation', 'target_elevation', 'tau',
    'focal_distance', 'incidence_angle', 'emission_angle',
    'phase_angle', 'l_s', 'lat', 'lon', 'odometry'
]:
    metaframe[column] = metaframe[column].astype('float')
# An explicit unit is required: pandas 2.x raises TypeError on the unit-less
# 'datetime64' dtype, while 'datetime64[ns]' works on old and new versions.
# NOTE(review): the displayed table shows the same calendar date on every row,
# so ltst presumably holds time-of-day strings and only the time part is
# meaningful -- confirm.
metaframe['ltst'] = metaframe['ltst'].astype('datetime64[ns]')

In [6]:
# Sequence ids excluded from ingest: lots of missing values and no spectra,
# skip for now
BAD_MCAMS = ['mcam13523']
# Names of the entries in the ROI-overlay and RGB image folders; individual
# observations are matched against these by filename prefix later on.
overlay_images = list(input_fs.listdir('DCS_ROI_images'))
rgb_images = list(input_fs.listdir('RGB_images'))

In [7]:
def get_image_ordinal(mastcam_image_fn_no_ext):
    """
    Return the ordinal digit of an image filename as a string.

    The ordinal is the single digit of a trailing ``_R<digit>`` or
    ``_L<digit>`` suffix on the extension-less filename. Names without such a
    suffix are given the ordinal ``'1'``.
    """
    suffix_match = re.search(r'_[RL](\d)$', mastcam_image_fn_no_ext)
    return suffix_match.group(1) if suffix_match else '1'

In [8]:
def _eye_from_basename(basename, righteye_pattern, lefteye_pattern):
    """
    Classify an image basename as 'righteye' or 'lefteye'.

    The right-eye pattern is tried first, then the left-eye pattern.

    Raises:
        ValueError: if neither pattern matches. Failing loudly here replaces
            the previous reliance on a NameError, which only fired when no
            earlier image had already set the eye variable; afterwards a
            non-matching image silently inherited the previous image's eye.
    """
    if re.search(righteye_pattern, basename):
        return 'righteye'
    if re.search(lefteye_pattern, basename):
        return 'lefteye'
    raise ValueError(
        "cannot determine which eye image '" + basename + "' belongs to"
    )


# make our temporary dict of 'shared' observation data
observations = {}
for ix, row in metaframe.iterrows():
    # skip observations we think are 'bad'
    if row['seq_id'] in BAD_MCAMS:
        continue
    # drop NaN-valued fields and populate observation SQL fields
    # from CSV fields
    row = row.dropna()
    obs = dict(zip(row.index, row.values))
    # this is the canonical prefix for image / spectra files
    obs_identifier = 'sol' + format(
            int(row['sol']), "0>4d"
        ) + '_' + row['seq_id']
    overlay_image_list = [
        image for image in overlay_images
        if image.startswith(obs_identifier)
    ]
    rgb_image_list = [
        image for image in rgb_images
        if image.startswith(obs_identifier)
    ]
    # associate observation with images using the convoluted decision tree
    # that appears to have been used to name the images (usually)
    for image in overlay_image_list:
        basename = fs.path.splitext(image)[0]
        image_eye = _eye_from_basename(
            basename, r'_R\d.*?_ROIs', r'_L\d.*?_ROIs'
        )
        obs[image_eye + '_roi_image_' + get_image_ordinal(basename)] = image

    # note subtle, delicious differences in RGB image naming conventions
    for image in rgb_image_list:
        basename = fs.path.splitext(image)[0]
        image_eye = _eye_from_basename(
            basename, r'R(_R\d)?$', r'L(_[LR]\d)?$'
        )
        obs[image_eye + '_rgb_image_' + get_image_ordinal(basename)] = image

    # keyed by sequence id; a repeated seq_id overwrites the earlier entry
    observations[row['seq_id']] = obs
# one row per observation, one column per metadata field / image slot
observations = pd.DataFrame(observations).T
observations

Unnamed: 0,sol,seq_id,name,rover_elevation,target_elevation,tau,ltst,focal_distance,incidence_angle,emission_angle,...,righteye_rgb_image_3,righteye_roi_image_8,righteye_roi_image_6,righteye_roi_image_7,righteye_rgb_image_5,righteye_rgb_image_7,righteye_rgb_image_6,righteye_rgb_image_8,righteye_roi_image_0,lefteye_rgb_image_4
mcam00012,13.0,mcam00012,Goulburn 2x1,-4500.97,-4502.2709,0.722,2021-04-12 13:21:56,6.882,24.7586,62.2825,...,,,,,,,,,,
mcam00014,13.0,mcam00014,Dunes+Mound 1x2,-4500.97,-4496.505,0.722,2021-04-12 13:30:30,42.572,26.5491,93.4542,...,,,,,,,,,,
mcam00119,24.0,mcam00119,Clast Survey,-4502.61,-4503.1598,0.722,2021-04-12 15:32:24,3.574,54.2805,46.7285,...,,,,,,,,,,
mcam00121,25.0,mcam00121,Fractures 2x2,-4502.61,-4500.6437,0.722,2021-04-12 12:32:08,3.574,13.8772,91.0629,...,,,,,,,,,,
mcam00126,25.0,mcam00126,Hepburn (distant),-4502.61,,0.722,2021-04-12 13:01:29,,19.0881,89.4421,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mcam14177,2705.0,mcam14177,Edinburgh Stereo,,,,NaT,,,,...,,,,,,,,,,
mcam14191,2710.0,mcam14191,Eshaness Stereo,,,,NaT,,,,...,,,,,,,,,,
mcam14203,2712.0,mcam14203,Edinburgh Stereo,-4088.69,,,2021-04-12 12:55:20,,24.9819,,...,,,,,,,,,,
mcam14264,2726.0,mcam14264,Edinburgh Dump Pile,,,,NaT,,,,...,,,,,,,,,,


In [17]:
# Debug inspection of the sequence id most recently handled by the
# spectra-ingest loop.
# NOTE(review): `seq_id` is only assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run.
seq_id

'mcam12067'

In [None]:
# Matches a sequence id such as 'mcam12067' when it is immediately followed
# by an underscore in a spectra filename.
SEQ_ID_PATTERN = r"mcam\d+(?=_)"
spec_files = [
    file for file in input_fs.listdir('') if (
        file.endswith('spectra-marslab.csv') and file.startswith('sol'))
]
for spec_file in spec_files:
    # extract sequence id from filename and associate it with rows of the conglomerate
    # metadata df
    seq_id_match = re.search(SEQ_ID_PATTERN, spec_file)
    if seq_id_match is None:
        # report and skip, like the missing-observation case below, instead
        # of dying on an AttributeError that does not name the file
        print("no sequence id in filename " + spec_file)
        continue
    seq_id = seq_id_match.group()
    try:
        observation = observations.loc[seq_id].dropna().to_dict()
    except KeyError:
        print("no observation for " + spec_file, seq_id)
        continue
    # split out observation fields that aren't image filenames...
    obs_metadata = keyfilter(lambda k: '_image_' not in k, observation)
    # ...and then also pick images probably associated with these ROIs
    # NOTE(review): spec_file always ends in 'spectra-marslab.csv', so the
    # end-anchored suffix pattern in get_image_ordinal cannot match and this
    # is always '1'. If spectra files are meant to carry their own ordinal,
    # it needs to be parsed differently -- confirm the naming convention.
    image_number = get_image_ordinal(spec_file)
    images = keyfilter(
            lambda k: ('_image_' in k) and (k.endswith(image_number)), observation
        )
    # read in ROI file itself and reformat it for multidex's needs
    frame = pd.read_csv(input_fs.getsyspath(spec_file)).drop("INSTRUMENT", axis=1)
    frame.columns = [column.lower() for column in frame.columns]
    for _, row in frame.iterrows():
        row = row.replace(['-', '', ' '], np.nan).dropna()
        # double check that seq id and sol are the same in metadata df and marslab file
        # (if not it likely implies a malformatted or misnamed file)
        assert row['sol'] == obs_metadata['sol'], (
            "sol mismatch between " + spec_file + " and metadata"
        )
        assert row['seq_id'] == obs_metadata['seq_id'], (
            "seq_id mismatch between " + spec_file + " and metadata"
        )
        # if there are missing filters anywhere in the column, including for other
        # spectra, pandas will read the column
        # as object / string, which will cause confusion when we
        # compute averaged filters, so we do it in this awkward way
        for filt in MSpec.filters:
            if filt in row.index:
                row[filt] = float(row[filt])
        # the CSV marks floats with 'Y'; anything else counts as not-float
        row['float'] = bool(row['float'] == 'Y')
        # conglomerate all fields, add filename & stringified image dict;
        # on key collisions the observation metadata wins over the spectrum row
        spectrum_dict = dict(row) | obs_metadata | {
            'filename': spec_file, 'images': str(images)
        }
        # put it in the database
        spectrum = MSpec(**spectrum_dict)
        spectrum.clean()
        spectrum.save()

In [None]:
# Debug inspection: non-image observation fields left over from the last
# spectra file handled by the ingest loop.
obs_metadata

In [None]:
# Debug inspection: image-filename fields selected for the last spectra file
# handled by the ingest loop.
images