In [21]:
import csv
import os
import re
from pathlib import Path

from astropy.io import fits as pyfits
import numpy as np
import pandas as pd
import pyarrow as pa 
from pyarrow import parquet

In [3]:
# Read the navcam manifest parquet file into a pyarrow Table.
nav = parquet.read_table('node_manifests/img_jpl_msl_navcam.parquet')

In [4]:
# Read the mastcam manifest parquet file into a pyarrow Table.
mst = parquet.read_table('node_manifests/img_jpl_msl_mastcam.parquet')

In [9]:
# Release the navcam table; the cells that follow only work with `mst`.
del nav

In [8]:
# Convert the pyarrow Table to a pandas DataFrame so the `.str` accessors
# used in the following cells are available.
mst = mst.to_pandas()

In [13]:
# Keep only the rows whose filename ends in 'IMG'.
mst = mst[mst['filename'].str.endswith('IMG')].copy()

In [15]:
# Keep only the rows whose URL contains the '/RDR/' path component.
mst = mst[mst['url'].str.contains('/RDR/')].copy()

In [18]:
# Distinct values of the first four characters of the filenames.
mst['filename'].str[:4].unique()

array(['0001', '0014', '0013', ..., '3066', '3061', '3067'], dtype=object)

In [30]:
# Preview the filenames that remain after the two filters above.
mst['filename']

3223       0001ML0000000010100001C00_DRCL.IMG
3225       0001ML0000000010100001C00_DRCX.IMG
3226       0001ML0000000010100001C00_DRXX.IMG
3227       0001ML0000000010100001C00_DRLX.IMG
3228       0001ML0000000010100001I01_DRCL.IMG
                          ...                
8048744    3067MR0160220271400054C00_DRXX.IMG
8048747    3067MR0160220271400054I01_DRCL.IMG
8048748    3067MR0160220271400054I01_DRCX.IMG
8048750    3067MR0160220271400054I01_DRXX.IMG
8048751    3067MR0160220271400054I01_DRLX.IMG
Name: filename, Length: 1237422, dtype: object

In [37]:
# prod: the single character 12 places from the end of each filename.
# proc: the four characters immediately before the final four characters
#       (the extension).
prod = mst['filename'].str[-12:-11]
proc = mst['filename'].str[-8:-4]

In [38]:
# Put the two extracted filename fragments side by side, one row per file.
# NOTE(review): `prod` and `proc` are both slices of mst['filename'], so both
# resulting columns appear to be labelled 'filename' -- rename them if `combo`
# is ever accessed by column name.
combo = pd.concat([prod, proc], axis=1)

In [29]:
# Distinct 4-character codes that sit just before the file extension.
mst['filename'].str[-8:-4].unique()

array(['DRCL', 'DRCX', 'DRXX', 'DRLX'], dtype=object)

In [22]:
# Input manifest (CSV) to convert.
csv_manifest_file = 'geomex_size_corrected.csv'
# Rows per parquet row group; this value seems a fairly good trade-off
# between load speed and memory efficiency for this case.
row_group_size = 100_000
# Output name: the CSV stem without "_size_corrected", with a .parquet suffix.
filename = Path(csv_manifest_file).stem.replace("_size_corrected", "") + '.parquet'

In [23]:
# Open the manifest for streaming with the csv module.
# newline='' lets the csv module do its own newline handling, which its
# documentation requires for file objects handed to csv.reader/DictReader.
# The handle must stay open until `reader` has been fully consumed.
f = open(csv_manifest_file, newline='')
reader = csv.DictReader(
    f,
    fieldnames=('url', 'size', 'units')
)
# Explicit fieldnames mean the header line is returned as a data row, so
# skip it. The default keeps an empty file from raising StopIteration here.
next(reader, None)
lines = []

In [24]:
# Stream every manifest row into `lines`, dropping the 'units' field.
i = 0  # stays 0 if the manifest has no data rows
try:
    for i, row in enumerate(reader, start=1):
        del row['units']
        lines.append(row)
        if i % 1000000 == 0:
            print(i)  # progress marker every million rows
finally:
    # `reader` is exhausted (or has failed) at this point, so release the
    # file handle instead of leaking it for the lifetime of the kernel.
    f.close()

1000000
2000000
3000000
4000000
5000000
6000000
7000000


In [25]:
# Number of rows read from the manifest.
len(lines)

7733800

In [26]:
# Peek at the last ten parsed rows.
lines[-10:]

[{'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/software/spicam_readpds.lbl',
  'size': '4834'},
 {'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/geometry/mars/mtp019_02285_02384/spim_0au_02380a01_s_go_01.txt',
  'size': '346608'},
 {'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/geometry/mars/mtp019_02285_02384/spim_0au_02381a02_e_go_01.txt',
  'size': '588055'},
 {'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/geometry/mars/mtp019_02285_02384/spim_0au_02382a01_e_go_01.txt',
  'size': '588055'},
 {'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/geometry/mars/mtp019_02285_02384/spim_0au_02382a02_n_go_01.txt',
  'size': '1525853'},
 {'url': 'https://pds-geosciences.wustl.edu/mex/mex-y-m-spi-2-uvedr-rawxcru-mars-v2/mexspi_2100/geometry/mars/mtp019_02285_023

In [27]:
# Spot-check a single row from the middle of the list.
lines[30000]

{'url': 'https://pds-geosciences.wustl.edu/mex/mex-m-aspera3-2-edr-els-ext1-v1/mexasp_1101/data/els_edr_l1b_2006_01/elsscih20060210429c_accs01.csv',
 'size': '27100085'}

In [28]:
import pickle

In [30]:
 # Checkpoint the parsed rows to disk so they can be restored without
 # re-reading the CSV.
 with open('temp.pkl', 'wb+') as file:
     pickle.dump(lines, file)

In [31]:
# Restore `lines` from the checkpoint file.
# NOTE: pickle.load can execute arbitrary code -- only load a temp.pkl that
# was written by this notebook.
with open('temp.pkl', 'rb') as file:
    lines = pickle.load(file)

In [32]:
# Build the manifest frame from the list of row dicts (one row per dict,
# one column per key).
df = pd.DataFrame(lines)

In [33]:
# The rows now live in `df`; free the list of dicts.
del lines

In [34]:
# NOTE: df could alternatively be loaded straight from csv_manifest_file with
# pd.read_csv(..., header=0, names=('url', 'size', 'units'),
#             dtype={'url': str, 'size': str, 'units': str}).

# Select the entries whose size is absent or was recorded as 'ErrorLogged',
# then print their URLs.
missing = df.loc[df['size'].isna() | df['size'].eq('ErrorLogged')]
print(missing['url'].values)


[]


In [35]:
# Number of entries without a usable size.
len(missing)

0

In [36]:
# Record only the entries whose size is missing (the rows selected into
# `missing`), not the full manifest, in the "missing" report file.
missing.to_csv("missing_mrd.csv", index=False)

In [37]:
# trim missing-size entries and units column
df = df.drop(missing.index)
# df = df.drop(columns='units')
df = df.loc[df['size'] != 'ErrorLogged'].reset_index(drop=True)
df['size'] = df['size'].astype(int)

In [38]:
# chop off protocol string 
df['url'] = df['url'].str.replace("http://", "")
df['url'] = df['url'].str.replace("https://", "")

In [39]:
# Checkpoint the cleaned manifest to disk (without the index column).
df.to_csv('intermediate.csv', index=False)

In [40]:
# Reload the checkpoint; column dtypes are re-inferred by read_csv.
df = pd.read_csv('intermediate.csv')

In [41]:
# split url to parts by '/'; keep the first part (domain) 
# and the last (filename) separate from the rest (url)
fn_url = df['url'].str.rsplit('/', n=1, expand=True)
domain_url = fn_url[0].str.split('/', n=1, expand=True)

In [42]:
# Column 0 of domain_url is the domain, column 1 the path below it; the
# path replaces the original full URL.
df['domain'] = domain_url[0]
df['url'] = domain_url[1]
del domain_url
# The last path component becomes its own column.
df['filename'] = fn_url[1]
del fn_url
# Fix the final column order.
df = df.reindex(columns=['domain', 'url', 'filename', 'size'])

In [43]:
# Checkpoint the split (domain / url / filename / size) table to disk.
df.to_csv('intermediate2.csv', index=False)

In [44]:
# Reload the checkpoint; column dtypes are re-inferred by read_csv.
df = pd.read_csv('intermediate2.csv')

In [45]:
# assemble a reasonable table schema
index_schema = pa.schema([
    (column, pa.string()) 
    if column != 'size'
    else (column, pa.uint64())
    for column in df.columns 
])
arrays = {
    field.name: df[field.name].astype(str(field.type)) 
    for field in index_schema
}
# dump arrays into a pyarrow table
index_table = pa.Table.from_arrays(
    list(arrays.values()), list(arrays.keys())
)
# write that table to a parquet file
parquet.write_table(
    index_table,
    Path("node_manifests", filename),
    version='2.6',
    row_group_size=row_group_size,
    use_dictionary=['domain', 'url', 'size']
)

In [None]:
STOP HERE (unless you want to combine files)

In [None]:
# Load and display one existing manifest (mahli) as a sanity check.
parquet.read_table('node_manifests/img_jpl_msl_mahli.parquet')

In [3]:
# should you wish to concatenate multiple parquet files
tabs = [
    parquet.read_table(f'node_manifests/img_jpl_msl_{ds}.parquet')
    for ds in [
        'navcam', 
        'hazcam', 
        'mahli', 
        "mastcam", 
        "mrd",
        "etc"
    ]
]
bigtab = pa.concat_tables(tabs)
bigname = 'img_jpl_msl.parquet'
parquet.write_table(
    bigtab, 
    Path("node_manifests", bigname),
    row_group_size=row_group_size,
    version = "2.6",
    use_dictionary = ['domain', 'url', 'size']
)

39.385428