In [2]:
import numpy as np
import pandas as pd
import json
import netCDF4 as nc
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet as fpq
import uuid
import os
from time import time_ns
from enum import Enum
import matplotlib.pyplot as plt

In [3]:
# Common functions and values
def make_test_meta():
    test_meta = {'uuid': str(uuid.uuid1()),
                'param1':12,
                'param2':'a_string',
                'param3': np.random.rand()*1e9,
                'param4':['alist','of','strings']}
    return test_meta
    

def make_test_data(length_of_data):
    test_data = {'time' :(np.array(range(length_of_data))*1e-9), 
                 'vals' : np.random.randn(length_of_data),
                 'volts': np.random.randn(length_of_data), 
                 'dp'   : np.random.randn(length_of_data),
                 'dr'   : np.random.randn(length_of_data)}
    return test_data

def get_col_names():
    return ['time', 'vals', 'volts', 'dp', 'dr']
def round_4(val):
    return round(val, 4)


In [4]:
# Parquet methods
def PQ_make_meta_groups(n_to_write):
    metadata_rows = {}
    for i in range(n_to_write):
        test_meta = json.dumps(make_test_meta())
        metadata_rows[test_meta] = i
    return json.dumps(metadata_rows)


def PQ_create_schema_w_meta(n_to_write):
    df = pd.DataFrame(make_test_data(1)) # Small dataframe to get pandas schema
    table = pa.Table.from_pandas(df, preserve_index=False)
    row_group_mappings = PQ_make_meta_groups(n_to_write)
    custom_metadata = {'row_group_meta': row_group_mappings}
    existing_metadata = table.schema.metadata
    merged_metadata = { **custom_metadata, **existing_metadata }
    table = table.replace_schema_metadata(merged_metadata)
    return table.schema

def PQ_write_n_to_file(length_of_data, n_to_write, fp = None):
    
    if fp is None:
        fp = 'PQTestFile.parquet'
        
    if os.path.exists(fp):
        os.unlink(fp)
    
    then = time_ns()
    
    my_schema = PQ_create_schema_w_meta(n_to_write)
    with pq.ParquetWriter(fp, my_schema) as writer:
        for i in range(n_to_write):
            df = pd.DataFrame(make_test_data(length_of_data))
            table = pa.Table.from_pandas(df, preserve_index=False)
            writer.write_table(table, row_group_size = length_of_data)

    now = time_ns()
    file_size = os.path.getsize(fp)/1000 #to get in kb
    return (now-then) * 1e-9, file_size


def PQ_load(source):
    then = time_ns()

    # accessing meta
    metadata_pq = pq.read_metadata(source)
    print("Row groups: ", metadata_pq.num_row_groups)
    meta = metadata_pq.metadata[b'row_group_meta']
    row_group_mappings = json.loads(meta)
    metadata_read = (time_ns() - then)*1e-9


    with pq.ParquetFile(source) as parquet_file:
        for row_num in row_group_mappings.values():
            table = parquet_file.read_row_group(row_num, columns=['time', 'vals','volts']).to_pandas()

    columns_read = (time_ns() - then)*1e-9 

    return metadata_read, columns_read



In [5]:
# small test to check functionality
fp = 'PQTestFile.parquet'
writetime, filesize = PQ_write_n_to_file(20000, 5, fp)
metadata_read, columns_read = PQ_load(fp)
# print(writetime, filesize)

print(f'time to write: {round_4(writetime)} s, time to metaread {round_4(metadata_read)} s and colread is {round_4(columns_read)} s, size is {round_4(filesize)} KB')

Row groups:  5
time to write: 0.1738 s, time to metaread 0.0156 s and colread is 0.0313 s, size is 4911.615 KB


In [8]:
# FastParquet methods
def FPQ_make_meta_groups(n_to_write):
    metadata_rows = {}
    for i in range(n_to_write):
        test_meta = json.dumps(make_test_meta())
        metadata_rows[test_meta] = i
    return json.dumps(metadata_rows)


def FPQ_create_schema_w_meta(n_to_write):
    df = pd.DataFrame(make_test_data(1)) # Small dataframe to get pandas schema
    table = pa.Table.from_pandas(df, preserve_index=False)
    row_group_mappings = PQ_make_meta_groups(n_to_write)
    custom_metadata = {'row_group_meta': row_group_mappings}
    existing_metadata = table.schema.metadata
    merged_metadata = { **custom_metadata, **existing_metadata }
    table = table.replace_schema_metadata(merged_metadata)
    return table.schema

def FPQ_write_n_to_file(length_of_data, n_to_write, fp = None):
    
    if fp is None:
        fp = 'FPQTestFile1.parq'
        
    if os.path.exists(fp):
        os.unlink(fp)
    
    then = time_ns()
    row_group_mappings = FPQ_make_meta_groups(n_to_write)
    custom_metadata = {'row_group_meta': row_group_mappings}
    all_data = []
    
    for i in range(n_to_write):
        tes = make_test_data(length_of_data)
        df = pd.DataFrame(tes)
        all_data.append(df)
#         df = pd.DataFrame(tes)
#         table = pa.Table.from_pandas(df, preserve_index=False)
#         print(i, table)
        
#         append = False if i == 0 else True
#         if i == 0:
#         fpq.write(fp, df, row_group_offsets = length_of_data, append =append, custom_metadata = custom_metadata)
#         else:
#             file = fpq.ParquetFile(fp)
#             file.write_row_groups(df, row_group_offsets = length_of_data)
#         writer.write_table(table, row_group_size = length_of_data) , append =append, custom_metadata = custom_metadata 
    
#     my_schema = PQ_create_schema_w_meta(n_to_write)
#     with pq.ParquetWriter(fp, my_schema) as writer:
#         for i in range(n_to_write):
#             df = pd.DataFrame(make_test_data(length_of_data))
#             table = pa.Table.from_pandas(df, preserve_index=False)
#             writer.write_table(table, row_group_size = length_of_data)

    df = pd.concat(all_data)
#     print(df)
#     table = pa.Table.from_pandas(df, preserve_index=False)
#     file = fpq.ParquetFile(fp)
    fpq.write(fp, df, row_group_offsets = length_of_data, custom_metadata = custom_metadata)
#     file.write_row_groups(df, row_group_offsets = length_of_data)
#     print(table)

    now = time_ns()
    file_size = os.path.getsize(fp)/1000 #to get in kb 
    return (now-then) * 1e-9,  file_size


def FPQ_load(source):
    then = time_ns()

    # accessing meta
#     with fpq.ParquetFile('myfile.parq') as file:
#     file = fpq.ParquetFile('myfile.parq')
    file = fpq.ParquetFile(source)
    print("Row groups: ", len(file.row_groups))
    row_group_mappings = json.loads(file.key_value_metadata['row_group_meta'])

    metadata_read = (time_ns() - then)*1e-9
    
    for rg in file.iter_row_groups(columns = ['time', 'vals']):
        table = rg
    # will generate every row group tho, you cant select just one

    columns_read = (time_ns() - then)*1e-9 

    return metadata_read, columns_read


In [9]:
fp = 'FPQTestFile1.parq'
writetime, filesize = FPQ_write_n_to_file(10, 2, fp)
metadata_read, columns_read = FPQ_load(fp)
# print(writetime, filesize)
print(f'time to write: {round_4(writetime)} s, time to metaread {round_4(metadata_read)} s and colread is {round_4(columns_read)} s, size is {round_4(filesize)} KB')

Row groups:  2
time to write: 0.0235 s, time to metaread 0.0064 s and colread is 0.0156 s, size is 3.64 KB


In [168]:
# with fpq.ParquetFile(fp) as file:
   
file = fpq.ParquetFile(fp)
# print(file.statistics.keys()) # -> a dictionary of the stats keys
# get a specific set of row groups
# index_list = [0,2,4], renters insurance and pge
# rows = [file.row_groups[i] for i in index_list]
# for rg in rows:
#     item = file.read_row_group_file(rg, columns = ['time', 'vals'], categories = None)
#     print(item)
# fpq.update_file_custom_metadata(fp, custom_metadata ={"trial" : "value"})
# print(file.key_value_metadata)
obj = json.loads(file.key_value_metadata['row_group_meta'])
# print(obj)

# filters=[('col3', 'in', [1, 2, 3, 4])])
cols = ['time', 'vals']
# new_file = file.to_pandas(columns = ['time', 'volts'] ,  filters=[('volts', 'in', [-2.23])])
# print(new_file)
for index, r in enumerate(file.iter_row_groups(columns = ['time', 'vals'])):
    print("num")
    print(index, r)

num
0                time      vals
index                        
0      0.000000e+00 -2.155567
1      1.000000e-09 -1.088836
2      2.000000e-09  0.604460
3      3.000000e-09 -2.034959
4      4.000000e-09  0.367580
5      5.000000e-09 -0.721668
6      6.000000e-09 -1.632854
7      7.000000e-09 -0.630668
8      8.000000e-09 -0.309671
9      9.000000e-09  0.683247
num
1                time      vals
index                        
0      0.000000e+00  0.996050
1      1.000000e-09  0.152037
2      2.000000e-09 -0.338334
3      3.000000e-09 -2.119522
4      4.000000e-09  1.521094
5      5.000000e-09  0.099200
6      6.000000e-09 -0.044801
7      7.000000e-09  1.006903
8      8.000000e-09  0.655576
9      9.000000e-09 -0.011465


In [166]:
metadata_pq = pq.read_metadata(fp).row_group(0)
print(metadata_pq.column(1))
# metadata_pq = pq.read_metadata(fp) .column(1).statistics
# print(metadata_pq.num_row_groups)

<pyarrow._parquet.ColumnChunkMetaData object at 0x000001A6E13B0D10>
  file_offset: 387519
  file_path: 
  physical_type: DOUBLE
  num_values: 20000
  path_in_schema: vals
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x000001A6E13B0950>
      has_min_max: True
      min: -4.198500011324119
      max: 3.862155340452514
      null_count: 0
      distinct_count: 0
      num_values: 20000
      physical_type: DOUBLE
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE_DICTIONARY', 'PLAIN', 'RLE')
  has_dictionary_page: True
  dictionary_page_offset: 189865
  data_page_offset: 349897
  total_compressed_size: 197654
  total_uncompressed_size: 197636


Links to documentation/resources

Objects:
- [Parquet.FileMetaData](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.FileMetaData.html#pyarrow.parquet.FileMetaData)
- [Parquet.ParquetSchema](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetSchema.html#pyarrow.parquet.ParquetSchema)
- [Pyarrow.Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table)

Resources:
- [How to define a new schema](https://mungingdata.com/pyarrow/arbitrary-metadata-parquet-table/)
- [Merge pandas schema + custom metadata](https://stackoverflow.com/questions/52122674/how-to-write-parquet-metadata-with-pyarrow)



In [167]:
# Printing functions

'''
pq.read_metadata returns an object pq.FileMetaData from footer of a single Parquet file. It contains info about num_columns,
num_rows, num_row_groups. pq_meta.metadata stores additional user entered metadata, a dict that should
have the metadata we created. It has 2 keys - custom_meta (the row group mapping), pandas (pandas metadata from dataframe)
The keys/values are stored as byte strings
pq_meta.schema will store the a pq.ParquetSchema object with the names of all the fields(columns)
'''

def print_metadata_(source):
    pq_meta = pq.read_metadata(source)
    print("FileMetaData: ", pq_meta)
    print("Schema: ", pq_meta.schema)
    
    our_meta = pq_meta.metadata
    print("Our meta keys: ",our_meta.keys())
    
    row_group_mappings = json.loads(our_meta[b'custom_meta'])
    print("Row mapping: ", row_group_mappings)
    
    return

'''

metadata_pq.row_group(int) returns a pq.RowGroupMetaData object which stores num_columns/rows and size
To access info about a specific column row_group_meta.column(<col no.>) this gives you data type, total size compr/uncompr
and stats which has min/max values 

'''

def print_row_group_meta(source):
    metadata_pq = pq.read_metadata(source)
    num_row_groups = metadata_pq.num_row_groups
    
    for i in range(num_row_groups):
        row_group_meta = metadata_pq.row_group(i)
        print(f"All row {i} group meta ", row_group_meta)
        
        col_data = row_group_meta.column(1)
        name = col_data.path_in_schema
        print(f"Column data for {name}: ", col_data)
        
        print(f"Stats for {name} row group {i}",col_data.statistics)
    
    return
        
    
   
'''
pq.read_table returns a Pyarrow.Table object, an optimized data structure developed by Apache Arrow
.the entire table with all row groups and columns. You can specify
the columns you want to get a subset of cols and apply filters to get a subset of rows. It has attributes like
column_names, num_rows/columns, schema. table.schema has the field names and types. table.schema.metadata is where the metadata 
we created is stored in a dict. You can also read the table as a pandas dataframe.

'''
def print_table(source):
    
    table = pq.read_table(source, columns=['time'])
    print("pyarrow table: ",table )
    print("pyarrow table: ",table.schema.metadata )
    row_group_mappings = json.loads(table.schema.metadata[b'custom_meta'])
    
    
    pandas_table = table.to_pandas()
    print("pandas table: ", pandas_table)
    
    return


    
