## Parquet with metadata (and pyarrow)

### Parquet

https://towardsdatascience.com/saving-metadata-with-dataframes-71f51f558d8e

https://towardsdatascience.com/parquet-best-practices-discover-your-data-without-loading-them-f854c57a45b6

### Arrow

https://github.com/apache/arrow

https://arrow.apache.org

https://arrow.apache.org/docs/python/


In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json

In [2]:
# Small sample frame: two float columns (`temp`, `rain`) on a DatetimeIndex named `date`.
df = pd.DataFrame(
    data=dict(
        temp=[12.1, 11, 13, 10, 10],
        rain=[9.2, 10.0, 2.2, 0.2, 0.4],
    ),
    index=pd.DatetimeIndex(
        ['2020-10-12', '2020-10-13', '2020-10-14', '2020-10-15', '2020-10-16'],
        name='date',
    ),
)

In [3]:
df

Unnamed: 0_level_0,temp,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-12,12.1,9.2
2020-10-13,11.0,10.0
2020-10-14,13.0,2.2
2020-10-15,10.0,0.2
2020-10-16,10.0,0.4


In [4]:
# Custom, JSON-serialisable fields to store alongside the data as metadata.
custom_meta_content = dict(
    user='M Booth',
    coord='55.9533° N, 3.1883° W',
    time='2020-10-17T03:59:59+0000',  # ISO-8601 timestamp
)

In [5]:
custom_meta_content

{'user': 'M Booth',
 'coord': '55.9533° N, 3.1883° W',
 'time': '2020-10-17T03:59:59+0000'}

In [6]:
# Key under which the custom metadata JSON is stored in the table's schema
# metadata; the same key is used later to read it back from the Parquet file.
custom_meta_key = "data_info"

In [7]:
# Convert the pandas frame to an Arrow table; the `date` index is carried over
# as a regular column and recorded in the b'pandas' schema metadata.
# TODO: check whether Polars offers an equivalent pandas -> Arrow conversion.
table = pa.Table.from_pandas(df)

In [8]:
table

pyarrow.Table
temp: double
rain: double
date: timestamp[ns]
----
temp: [[12.1,11,13,10,10]]
rain: [[9.2,10,2.2,0.2,0.4]]
date: [[2020-10-12 00:00:00.000000000,2020-10-13 00:00:00.000000000,2020-10-14 00:00:00.000000000,2020-10-15 00:00:00.000000000,2020-10-16 00:00:00.000000000]]

In [9]:
table["temp"]

<pyarrow.lib.ChunkedArray object at 0x11e28b810>
[
  [
    12.1,
    11,
    13,
    10,
    10
  ]
]

In [10]:
print(table.schema.metadata)


{b'pandas': b'{"index_columns": ["date"], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "temp", "field_name": "temp", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "rain", "field_name": "rain", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "date", "field_name": "date", "pandas_type": "datetime", "numpy_type": "datetime64[ns]", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "1.5.3"}'}


In [11]:
type(table.schema.metadata)

dict

In [12]:
import pprint

In [13]:
pprint.pprint(table.schema.metadata)

{b'pandas': b'{"index_columns": ["date"], "column_indexes": [{"name": null, "f'
            b'ield_name": null, "pandas_type": "unicode", "numpy_type": "objec'
            b't", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "t'
            b'emp", "field_name": "temp", "pandas_type": "float64", "numpy_typ'
            b'e": "float64", "metadata": null}, {"name": "rain", "field_name":'
            b' "rain", "pandas_type": "float64", "numpy_type": "float64", "met'
            b'adata": null}, {"name": "date", "field_name": "date", "pandas_ty'
            b'pe": "datetime", "numpy_type": "datetime64[ns]", "metadata": nul'
            b'l}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pa'
            b'ndas_version": "1.5.3"}'}


In [14]:
def create_combined_metadata(custom_meta_content, table, meta_key=None):
    """Merge custom metadata into a table's existing schema metadata.

    Parameters
    ----------
    custom_meta_content : JSON-serialisable object
        Serialised with ``json.dumps`` and stored as encoded bytes.
    table : object exposing ``schema.metadata``
        ``schema.metadata`` is expected to be a mapping with bytes keys and
        values, or ``None`` when the table carries no metadata.
    meta_key : str, optional
        Key to store the custom metadata under. When omitted, the
        notebook-level ``custom_meta_key`` is used.

    Returns
    -------
    dict
        Bytes-to-bytes mapping with the custom entry first, followed by every
        other existing entry. The table itself is not modified.
    """
    if meta_key is None:
        # Fall back to the notebook-level default so existing two-argument
        # calls keep working.
        meta_key = custom_meta_key

    combined_meta = {meta_key.encode(): json.dumps(custom_meta_content).encode()}

    # `or {}` covers a table whose schema has no metadata at all.
    for key, value in (table.schema.metadata or {}).items():
        # setdefault keeps the fresh custom entry when the table already holds
        # one under the same key (e.g. the cell is re-run after the table's
        # metadata was replaced), instead of letting the stale value win.
        combined_meta.setdefault(key, value)
    return combined_meta

In [15]:
combined_meta = create_combined_metadata(custom_meta_content, table)

In [16]:
combined_meta

{b'data_info': b'{"user": "M Booth", "coord": "55.9533\\u00b0 N, 3.1883\\u00b0 W", "time": "2020-10-17T03:59:59+0000"}',
 b'pandas': b'{"index_columns": ["date"], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "temp", "field_name": "temp", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "rain", "field_name": "rain", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "date", "field_name": "date", "pandas_type": "datetime", "numpy_type": "datetime64[ns]", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "1.5.3"}'}

In [17]:
# Attach the combined metadata: the result of replace_schema_metadata is bound
# back to `table`, so from here on `table` carries the custom entry as well.
table = table.replace_schema_metadata(combined_meta)

In [18]:
print(table.schema.metadata)

{b'data_info': b'{"user": "M Booth", "coord": "55.9533\\u00b0 N, 3.1883\\u00b0 W", "time": "2020-10-17T03:59:59+0000"}', b'pandas': b'{"index_columns": ["date"], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "temp", "field_name": "temp", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "rain", "field_name": "rain", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "date", "field_name": "date", "pandas_type": "datetime", "numpy_type": "datetime64[ns]", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "1.5.3"}'}


In [19]:
# Write the table, including its schema metadata, to a GZIP-compressed Parquet file.
# NOTE(review): assumes the ../data directory already exists — confirm.
pq.write_table(table, '../data/example.parquet', compression='GZIP')

In [20]:
restored_table = pq.read_table('../data/example.parquet')


In [21]:
restored_df = restored_table.to_pandas()

In [22]:
restored_df

Unnamed: 0_level_0,temp,rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-10-12,12.1,9.2
2020-10-13,11.0,10.0
2020-10-14,13.0,2.2
2020-10-15,10.0,0.2
2020-10-16,10.0,0.4


In [23]:
def read_parquet_metadata(parquet_file, custom_meta_key):
    """Return the custom metadata stored in a Parquet file's schema.

    Parameters
    ----------
    parquet_file : path or file-like object
        Parquet file to inspect; passed straight to ``pq.read_schema``.
    custom_meta_key : str
        Key under which the metadata JSON was stored.

    Returns
    -------
    object
        The value decoded from the stored JSON with ``json.loads``.

    Raises
    ------
    KeyError
        If the file's schema has no entry under ``custom_meta_key``.
    """
    # Only the schema is needed, so read just that instead of loading every
    # column with pq.read_table.
    schema_meta = pq.read_schema(parquet_file).metadata or {}

    meta_key = custom_meta_key.encode()
    if meta_key not in schema_meta:
        raise KeyError(
            f"no {custom_meta_key!r} entry in the schema metadata of {parquet_file!r}"
        )
    return json.loads(schema_meta[meta_key])



In [24]:
meta_data = read_parquet_metadata(parquet_file="../data/example.parquet", custom_meta_key=custom_meta_key)

In [25]:
meta_data

{'user': 'M Booth',
 'coord': '55.9533° N, 3.1883° W',
 'time': '2020-10-17T03:59:59+0000'}