In [48]:
import numpy as np
import pandas as pd
import json
import netCDF4 as nc
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet as fpq
import pyarrow.feather as feather
import fastavro as avro
import uuid
import os
from time import time_ns
from enum import Enum
import matplotlib.pyplot as plt

In [49]:
# Common functions and values
def make_test_meta():
    """Build one synthetic metadata record.

    Returns:
        dict: Five entries, in this order:
            ``uuid``   - string form of a fresh ``uuid.uuid1()``,
            ``param1`` - the integer 12,
            ``param2`` - the string ``'a_string'``,
            ``param3`` - ``np.random.rand()`` scaled by 1e9,
            ``param4`` - a fixed list of three strings.
    """
    meta = {}
    meta['uuid'] = str(uuid.uuid1())
    meta['param1'] = 12
    meta['param2'] = 'a_string'
    meta['param3'] = np.random.rand() * 1e9
    meta['param4'] = ['alist', 'of', 'strings']
    return meta
    

def make_test_data(length_of_data):
    """Generate one synthetic dataset of ``length_of_data`` samples.

    Args:
        length_of_data: Number of samples per column (non-negative integer).

    Returns:
        dict: Maps ``'time'``, ``'vals'``, ``'volts'``, ``'dp'`` and ``'dr'``
        (in that order) to 1-D float arrays of length ``length_of_data``.
        ``'time'`` is the sample index scaled by 1e-9; the other four columns
        are separate draws from ``np.random.randn``.
    """
    # np.arange builds the index array natively. np.array(range(n)) first
    # iterates n Python ints, which is slow at the 1e6-row sizes used in this
    # notebook and is paid inside the timed section of the write benchmark.
    sample_index = np.arange(length_of_data)
    test_data = {'time' : sample_index * 1e-9,
                 'vals' : np.random.randn(length_of_data),
                 'volts': np.random.randn(length_of_data),
                 'dp'   : np.random.randn(length_of_data),
                 'dr'   : np.random.randn(length_of_data)}
    return test_data

def get_col_names():
    """Return a new list with the five data column names, in order."""
    column_names = ['time', 'vals', 'volts', 'dp', 'dr']
    return column_names
def round_4(val):
    """Round ``val`` to four decimal places with the built-in ``round``."""
    decimals = 4
    return round(val, decimals)


In [52]:
# Feather methods
def FE_make_meta_groups(n_to_write):
    """Serialise ``n_to_write`` synthetic metadata records into one JSON string.

    Each record returned by ``make_test_meta()`` is dumped to JSON and used as
    a key; its value is the record's position in ``range(n_to_write)``.

    Args:
        n_to_write: Number of metadata records to generate.

    Returns:
        str: JSON object mapping serialised-metadata string -> integer index.
    """
    index_by_meta = {
        json.dumps(make_test_meta()): position
        for position in range(n_to_write)
    }
    return json.dumps(index_by_meta)


def FE_update_schema_w_meta(length_of_data, n_to_write, table):
    """Return ``table`` with run metadata merged into its schema metadata.

    Two entries are added: ``row_group_meta`` (the JSON string produced by
    ``FE_make_meta_groups(n_to_write)``) and ``length_of_data`` (converted to
    ``str``).

    Args:
        length_of_data: Number of rows per generated dataset.
        n_to_write: Number of metadata records to generate.
        table: Object exposing ``schema.metadata`` and
            ``replace_schema_metadata`` (a ``pyarrow.Table`` in this notebook).

    Returns:
        The result of ``table.replace_schema_metadata(...)`` with the merged
        metadata.
    """
    row_group_mappings = FE_make_meta_groups(n_to_write)
    custom_metadata = {
        'row_group_meta': row_group_mappings,
        'length_of_data': str(length_of_data),
    }
    # ``schema.metadata`` is None when the table carries no metadata at all;
    # unpacking None with ** raises TypeError, so fall back to an empty dict.
    existing_metadata = table.schema.metadata or {}
    # Existing entries are unpacked last, so on an identical key they win.
    # NOTE(review): the custom keys are ``str`` while keys read back from a
    # schema are ``bytes`` (see the b'...' lookups in FE_load), so the two
    # never collide at the dict level - confirm how pyarrow treats that case.
    merged_metadata = {**custom_metadata, **existing_metadata}
    return table.replace_schema_metadata(merged_metadata)

def FE_write_n_to_file(length_of_data, n_to_write, fp = None):
    """Generate ``n_to_write`` synthetic datasets and write them to one Feather file.

    The timed section covers data generation, concatenation into a single
    DataFrame, conversion to an Arrow table, attaching the schema metadata and
    the Feather write itself. Removing a pre-existing file is not timed.

    Args:
        length_of_data: Number of rows in each generated dataset.
        n_to_write: Number of datasets to generate and concatenate.
        fp: Output path; ``'FETestFile1.feather'`` when ``None``.

    Returns:
        tuple: ``(elapsed_seconds, file_size)`` where ``file_size`` is the
        on-disk size in bytes divided by 1000.
    """
    target = 'FETestFile1.feather' if fp is None else fp

    # Remove any file left at the target path before writing.
    if os.path.exists(target):
        os.remove(target)

    started = time_ns()

    # One DataFrame per dataset, stacked into a single frame with a fresh index.
    frames = [
        pd.DataFrame(make_test_data(length_of_data))
        for _ in range(n_to_write)
    ]
    combined = pd.concat(frames, ignore_index=True)

    # Convert to Arrow, attach the metadata to the schema, then write to disk.
    arrow_table = pa.Table.from_pandas(combined, preserve_index=False)
    arrow_table = FE_update_schema_w_meta(length_of_data, n_to_write, arrow_table)
    feather.write_feather(arrow_table, target)

    elapsed_s = (time_ns() - started) * 1e-9
    size_kb = os.path.getsize(target) / 1000  # bytes -> kB
    return elapsed_s, size_kb


def FE_load(source):
    """Time reading the metadata and three data columns from a Feather file.

    Only the ``time``, ``vals`` and ``volts`` columns are requested. Both
    returned durations are measured from the start of the call.

    Args:
        source: Passed straight to ``feather.read_table`` (a file path in this
            notebook).

    Returns:
        tuple: ``(metadata_read, columns_read)`` in seconds.
            ``metadata_read`` - time until the table is read and both metadata
            entries are decoded.
            ``columns_read`` - time until every batch has also been converted
            to pandas.
    """
    started = time_ns()

    wanted_columns = ['time', 'vals','volts']
    loaded = feather.read_table(source, columns=wanted_columns)

    # The row-group mapping is parsed here but not used further.
    raw_group_meta = loaded.schema.metadata[b'row_group_meta']
    row_group_meta = json.loads(raw_group_meta)
    length_of_data = int(loaded.schema.metadata[b'length_of_data'])
    metadata_read = (time_ns() - started) * 1e-9

    # NOTE(review): max_chunksize is an upper bound, so batches may be smaller
    # than length_of_data if the file stores smaller chunks - confirm before
    # treating one batch as one dataset.
    for record_batch in loaded.to_batches(max_chunksize=length_of_data):
        _frame = record_batch.to_pandas()

    columns_read = (time_ns() - started) * 1e-9

    return metadata_read, columns_read



In [53]:
# Smoke test: write a single 1,000,000-row dataset, read it back, report timings.
fp = 'FETestFile1.feather'
writetime, filesize = FE_write_n_to_file(1_000_000, 1, fp)
metadata_read, columns_read = FE_load(fp)

print(
    f'time to write: {round_4(writetime)} s, '
    f'time to metaread {round_4(metadata_read)} s '
    f'and colread is {round_4(columns_read)} s, '
    f'size is {round_4(filesize)} KB'
)

time to write: 0.4696 s, time to metaread 0.0578 s and colread is 0.109 s, size is 40009.234 KB


Links to documentation/resources

Objects:
- [feather.read_table](https://arrow.apache.org/docs/python/generated/pyarrow.feather.read_table.html)
- [Table.to_batches](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_batches)
- [PyArrow Table API](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html)


Resources:
- [Feather has more stability for long-term storage](https://ursalabs.org/blog/2020-feather-v2/)
