In [1]:
import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow as pa

## First approach to append file metadata

In [2]:
# Load the pets CSV into an Arrow table; the next cell inspects its schema metadata.
table = pv.read_csv('pets1.csv')

In [3]:
# print() is deliberate here: the value is None for this table, and a bare
# None expression would produce no cell output at all.
print(table.schema.metadata)

None


In [4]:
# Schema-level key/value pairs to attach to the table; keys and values are bytes.
custom_metadata = {
    b'is_furry': b'no_fluffy',
    b'likes_cats': b'negative',
}

In [None]:
# Start from the custom pairs, then overlay whatever metadata the table already
# carries (or nothing, when it is None). On a key clash the table's existing
# value wins over the custom one.
merged_metadata = dict(custom_metadata)
merged_metadata.update(table.schema.metadata or {})

In [None]:
# Build a table whose schema metadata is merged_metadata; the result is bound
# to a new name rather than reassigning `table`.
fixed_table = table.replace_schema_metadata(merged_metadata)

In [None]:
# Persist fixed_table to Parquet; the following cells read it back to check the metadata.
pq.write_table(fixed_table, 'pets1_with_metadata.parquet')

In [None]:
# Read the file written above back into a table.
parquet_table = pq.read_table('pets1_with_metadata.parquet')

In [None]:
# Schema-level metadata as read back from the Parquet file.
parquet_table.schema.metadata

In [None]:
# Look up a single entry; the key is bytes, matching how custom_metadata was defined.
parquet_table.schema.metadata[b'is_furry']

## Make a table with defined metadata

In [5]:
# Load the movies CSV. Note: this rebinds `table`, which held the pets1 data above.
table = pv.read_csv('movies.csv')

In [6]:
# Display the table summary (column names and types).
table

pyarrow.Table
movie: string
release_year: int64

In [7]:
# Schema as read from the CSV: plain fields, no "not null" marker and no metadata yet.
table.schema

movie: string
release_year: int64

In [8]:
# Target schema: the same column names and types as the CSV table, plus
# field-level metadata on each column, schema-level metadata, and a
# non-nullable "movie" column.
my_schema = pa.schema(
    [
        pa.field("movie", "string", nullable=False, metadata={"spanish": "pelicula"}),
        pa.field("release_year", "int64", nullable=True, metadata={"portuguese": "ano"}),
    ],
    metadata={"great_music": "reggaeton"},
)

In [9]:
# Cast to the target schema so the resulting table carries its field and
# schema metadata (shown in the next cell's output).
t2 = table.cast(my_schema)

In [10]:
# The cast table's schema: nullability, field metadata and schema metadata are now present.
t2.schema

movie: string not null
  -- field metadata --
  spanish: 'pelicula'
release_year: int64
  -- field metadata --
  portuguese: 'ano'
-- schema metadata --
great_music: 'reggaeton'

In [11]:
# Persist the cast table to Parquet; the following cells read the metadata back.
pq.write_table(t2, 'movies.parquet')

In [12]:
# Read the file back and keep only its schema.
s = pq.read_table('movies.parquet').schema



In [13]:
# Schema-level metadata read back from the file (keys and values come back as bytes).
s.metadata

{b'great_music': b'reggaeton'}

In [14]:
# Single schema-level entry; the lookup key must be bytes.
s.metadata[b'great_music']

b'reggaeton'

In [17]:
# Read the same file again, this time keeping the whole result of read_table
# (the same call used for `s` above) so field-level metadata can be reached
# through its schema in the next cell.
parquet_file = pq.read_table('movies.parquet')

In [23]:
# Field-level metadata lives on the individual field, not on schema.metadata.
parquet_file.schema.field('release_year').metadata[b'portuguese']

b'ano'

## Second approach to append file metadata

In [None]:
# Load the second pets CSV. Note: this rebinds `table`, which held the movies data above.
table = pv.read_csv('pets2.csv')

In [None]:
# Display the freshly loaded table.
table

In [None]:
# Schema of the table before any metadata is attached.
table.schema

In [None]:
# Build a schema carrying the metadata {b'say_hi': b'hola'}.
# NOTE(review): with_metadata is expected to return a new schema and leave
# table.schema untouched — the next two cells display both for comparison.
s2 = table.schema.with_metadata({b'say_hi': b'hola'})

In [None]:
# The table's own schema, for comparison with s2 in the next cell.
table.schema

In [None]:
# The schema produced by with_metadata above.
s2

In [None]:
fixed_table = table.replace_schema_metadata(table.schema)

In [None]:
pq.write_table(t, 'pets2_with_metadata.parquet')

In [None]:
# Read the file written above back into a table.
pt = pq.read_table('pets2_with_metadata.parquet')

In [None]:
# Schema-level metadata as read back from the Parquet file.
pt.schema.metadata

## Writing Metadata to Columns

The next part shows how to assign metadata to individual columns.

In [None]:
# Column data of the current `table` (still the pets2 table loaded earlier).
table.columns

In [None]:
# Schema of the current `table` (still the pets2 table loaded earlier).
table.schema

## StackOverflow Question

Link: https://stackoverflow.com/questions/55546027/how-to-assign-arbitrary-metadata-to-pyarrow-table-parquet-columns

In [None]:
import pandas as pd

In [None]:
# Small two-column frame, then its Arrow equivalent.
# Note: this rebinds `table`, which held the pets2 data above.
df = pd.DataFrame(
    {
        'foo': [1, 3, 2],
        'bar': [6, 4, 5],
    }
)

table = pa.Table.from_pandas(df)

In [None]:
# Schema of the table built from the DataFrame, before the custom schema is applied.
table.schema

In [None]:
# Target schema for the DataFrame-derived table: "foo" is non-nullable and
# carries field-level metadata, "bar" is nullable with none, and the schema
# itself carries one metadata entry.
your_schema = pa.schema(
    [
        pa.field("foo", "int64", nullable=False, metadata={"crs": "4283"}),
        pa.field("bar", "int64", nullable=True),
    ],
    metadata={"diamond": "under_pressure"},
)

In [None]:
# Cast to the target schema; the result is bound to a new name so `table` stays as built from df.
table2 = table.cast(your_schema)

In [None]:
# Field-level metadata for "foo", reached through the table's schema.
table2.schema.field('foo').metadata[b'crs']

In [None]:
# Schema-level metadata entry; the lookup key is bytes.
table2.schema.metadata[b'diamond']