This notebook contains a basic workflow for generating manifest Parquet files from the CSV files produced by our scrapers. It loads each CSV fully into memory, so depending on how much RAM you have available, it may not be appropriate for extremely large CSV files; those may require special handling to split them up first.

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa 
from pyarrow import parquet

In [None]:
# folder (under the current working directory) that holds both the
# scraper csv files and the manifests built from them
manifest_folder = Path.cwd() / "node_manifests"
# name of the size-corrected scraper output csv; resolved relative to
# manifest_folder
scraper_file = 'geolunar_size_corrected.csv'
# rows per parquet row group. 100000 is generally a good trade-off
# between loading speed and memory efficiency for these files; if you
# find yourself needing to tune it further, splitting the manifest into
# several files is probably the better move.
row_group_size = 100_000
# output manifest name: the csv's stem with "_size_corrected" removed,
# plus a .parquet extension
manifest_filename = (
    Path(scraper_file).stem.replace("_size_corrected", "") + ".parquet"
)

In [None]:
# read the whole scraper csv into memory. every column is kept as text,
# and header=0 together with names swaps the file's own header row for
# our column names
df = pd.read_csv(
    Path(manifest_folder, scraper_file),
    names=('url', 'size', 'units'),
    header=0,
    dtype={'url': str, 'size': str, 'units': str},
)
# pull out the rows with no usable size (either blank or the literal
# 'ErrorLogged') and print their urls. such rows may be files the
# spider/scraper didn't hit correctly, or files whose headers carried
# no size info -- usually harmless, but worth a look since in some
# cases it might not be.
missing = df.loc[(df['size'] == 'ErrorLogged') | df['size'].isna()]
print(missing['url'].values)

In [None]:
# trim missing-size entries (blank or 'ErrorLogged') and the units
# column. the row filter is rebuilt from df itself, and the trim is a
# single statement, so an accidental re-run of this cell fails on the
# already-removed 'units' column *before* df has been modified instead
# of silently dropping the wrong rows.
has_size = df['size'].notna() & (df['size'] != 'ErrorLogged')
df = df.loc[has_size].drop(columns='units').reset_index(drop=True)
del has_size
df['size'] = df['size'].astype(int)
# chop off the protocol string. the pattern is anchored to the start of
# the url so an "http://" or "https://" embedded later in the path is
# left intact, and regex=True is passed explicitly because the default
# for this argument differs between pandas versions.
df['url'] = df['url'].str.replace(r'^https?://', '', regex=True)
# split url to parts by '/'; keep the first part (domain)
# and the last (filename) separate from the rest (url)
fn_url = df['url'].str.rsplit('/', n=1, expand=True)
domain_url = fn_url[0].str.split('/', n=1, expand=True)
# reassemble all this into the search-efficient field structure
# used in the manifest files
df[['domain', 'url']] = domain_url
del domain_url
df['filename'] = fn_url[1]
del fn_url
# fix the final column order of the manifest
df = df.reindex(columns=['domain', 'url', 'filename', 'size'])

In [None]:
# write that table to a parquet file inside manifest_folder -- the same
# configured folder the scraper csv was read from -- rather than to a
# "node_manifests" path relative to whatever the working directory
# happens to be when this cell runs
parquet.write_table(
    # the dataframe index is not stored in the file
    pa.Table.from_pandas(df, preserve_index=False),
    Path(manifest_folder, manifest_filename),
    version='2.6',
    row_group_size=row_group_size,
    # dictionary encoding is requested for these columns only;
    # 'filename' is not in the list
    use_dictionary=['domain', 'url', 'size']
)

### STOP HERE (unless you want to combine parquet files)

In [None]:
# should you wish to concatenate multiple parquet files: each entry in
# the list below is a dataset suffix, read from
# manifest_folder / img_jpl_msl_<suffix>.parquet. paths are built from
# manifest_folder (not a hard-coded relative "node_manifests") so this
# cell follows the folder set in the configuration cell.
# NOTE(review): "etc" looks like a placeholder rather than a real
# dataset name -- edit this list to the files you actually have.
tabs = [
    parquet.read_table(Path(manifest_folder, f'img_jpl_msl_{ds}.parquet'))
    for ds in [
        'navcam',
        'hazcam',
        'mahli',
        "mastcam",
        "mrd",
        "etc"
    ]
]
# stack the per-dataset tables into one table
bigtab = pa.concat_tables(tabs)
# name of the combined manifest, written with the same parquet settings
# as the single-file manifests
bigname = 'img_jpl_msl.parquet'
parquet.write_table(
    bigtab,
    Path(manifest_folder, bigname),
    row_group_size=row_group_size,
    version="2.6",
    use_dictionary=['domain', 'url', 'size']
)