In [None]:
import csv
import os
import re
from pathlib import Path

from astropy.io import fits as pyfits
import numpy as np
import pandas as pd
import pyarrow as pa 
from pyarrow import parquet

In [None]:
# --- conversion settings -------------------------------------------------
# CSV manifest to convert (columns: url, size, units)
csv_manifest_file = 'node_manifests/geomro_size_corrected.csv'

# Rows per parquet row group. This value seems fairly optimal for this
# case as a balance between load speed and memory efficiency.
row_group_size = 100_000

# Output file name: the CSV's stem with the "_size_corrected" marker
# removed, plus a ".parquet" extension.
filename = (
    Path(csv_manifest_file).stem.replace("_size_corrected", "") + ".parquet"
)

In [None]:
# Load the manifest. The column names given here replace the file's own
# header row (header=0), and every column is read as text so that
# non-numeric size markers such as 'ErrorLogged' are kept as-is.
df = pd.read_csv(
    csv_manifest_file,
    names=('url', 'size', 'units'),
    header=0,
    dtype={'url': str, 'size': str, 'units': str},
)

# Rows with no usable size: either the value is absent or it is the literal
# 'ErrorLogged' marker. Print their URLs for inspection.
missing = df.loc[df['size'].isna() | (df['size'] == 'ErrorLogged')]
print(missing['url'].values)



In [None]:
# Clean the freshly loaded manifest and split each URL into
# domain / url (intermediate path) / filename columns.
#
# This cell replaces `df`, so it is only valid on the frame exactly as the
# CSV-loading cell produced it. Running it a second time on its own output
# would split the already-split 'url' column again and corrupt the table,
# so refuse to do that before anything is modified.
if 'domain' in df.columns:
    raise RuntimeError(
        "df has already been split into domain/url/filename; "
        "re-run the CSV loading cell before running this cell again"
    )

# keep only rows with a usable size (present and not the 'ErrorLogged'
# marker), drop the units column, and renumber the rows from 0
has_size = df['size'].notna() & (df['size'] != 'ErrorLogged')
trimmed = df.loc[has_size].drop(columns='units').reset_index(drop=True)

# chop off the protocol string; the pattern is anchored so that only a
# leading "http://" or "https://" is removed, and regex=True is explicit
# because the default of this parameter differs between pandas versions
bare_url = trimmed['url'].str.replace(r'^https?://', '', regex=True)

# split url to parts by '/'; keep the first part (domain)
# and the last (filename) separate from the rest (url).
# reindex(columns=[0, 1]) guarantees both halves exist (as missing values)
# even when no row contains a separator to split on.
fn_url = bare_url.str.rsplit('/', n=1, expand=True).reindex(columns=[0, 1])
domain_url = (
    fn_url[0].str.split('/', n=1, expand=True).reindex(columns=[0, 1])
)

# bind `df` once, at the very end, so a failure above leaves it untouched
df = pd.DataFrame(
    {
        'domain': domain_url[0],
        'url': domain_url[1],
        'filename': fn_url[1],
        'size': trimmed['size'],
    },
    columns=['domain', 'url', 'filename', 'size'],
)

# release the intermediates; they can be large
del has_size
del trimmed
del bare_url
del domain_url
del fn_url

In [None]:
# assemble a reasonable table schema: every column is stored as text
# except 'size', which is stored as an unsigned 64-bit integer
index_schema = pa.schema([
    pa.field(column, pa.uint64() if column == 'size' else pa.string())
    for column in df.columns
])
# convert each column to the pandas dtype whose name matches its arrow
# type ('string' / 'uint64'); for 'size' this turns the textual values
# into integers
arrays = {
    field.name: df[field.name].astype(str(field.type))
    for field in index_schema
}
# dump arrays into a pyarrow table, then cast it to the declared schema so
# that the written column types are exactly those of index_schema rather
# than whatever pyarrow infers from the pandas dtypes
index_table = pa.Table.from_arrays(
    list(arrays.values()), names=list(arrays.keys())
).cast(index_schema)
# write that table to a parquet file; dictionary encoding is requested for
# the 'domain', 'url' and 'size' columns only ('filename' is left out)
parquet.write_table(
    index_table,
    Path("node_manifests", filename),
    version='2.6',
    row_group_size=row_group_size,
    use_dictionary=['domain', 'url', 'size']
)

In [None]:
# should you wish to concatenate multiple parquet files
#
# Optional step: merges the per-dataset LROC image manifests into a single
# parquet file. The inputs are read from node_manifests/, the same
# directory this notebook writes its parquet output to, and all three
# must already exist there.
tabs = [
    parquet.read_table(Path("node_manifests", f'img_lroc_{ds}.parquet'))
    for ds in ['cdr', 'edr', 'rdr']
]
# concat_tables stacks the tables row-wise
bigtab = pa.concat_tables(tabs)
bigname = 'img_lroc.parquet'
# same writer settings as for the individual manifests
parquet.write_table(
    bigtab,
    Path("node_manifests", bigname),
    row_group_size=row_group_size,
    version="2.6",
    use_dictionary=['domain', 'url', 'size']
)