In [1]:
import glob

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
# Obtain the notebook's SparkSession: reuse the active one if it exists,
# otherwise create it.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Sample rows, assembled column-wise and zipped into (first_name, country) tuples.
first_names = ["camilo", "maria"]
countries = ["colombia", "colombia"]
data = list(zip(first_names, countries))

In [5]:
# Explicit schema: both columns are nullable strings, and `country` carries
# custom field-level metadata ({'model_version': 3}).
people_schema = StructType([
    StructField("first_name", StringType(), nullable=True),
    StructField("country", StringType(), nullable=True, metadata={'model_version': 3}),
])

# Build the DataFrame from the sample rows using that schema.
df = spark.createDataFrame(data, schema=people_schema)

In [6]:
# Display the DataFrame rows as a text table.
df.show()

+----------+--------+
|first_name| country|
+----------+--------+
|    camilo|colombia|
|     maria|colombia|
+----------+--------+



In [7]:
# Print the schema tree. The output lists only name, type and nullability;
# the metadata attached to `country` is not shown here.
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- country: string (nullable = true)



In [8]:
# Keep the schema's StructField objects so their metadata can be inspected below.
f = df.schema.fields

In [9]:
# Render each schema field as "name (type): metadata" to expose the
# per-field metadata dicts that printSchema() does not display.
[
    "%s (%s): %s" % (field.name, field.dataType, field.metadata)
    for field in f
]

['first_name (StringType): {}', "country (StringType): {'model_version': 3}"]

In [10]:
# Persist the DataFrame as Parquet, replacing any previous output in the
# (relative) tmp/people_with_metadata directory.
(
    df.write
    .mode('overwrite')
    .parquet('tmp/people_with_metadata')
)

In [11]:
import pyarrow.parquet as pq

In [12]:
# Locate the Parquet part file inside the directory written by the
# `df.write...parquet('tmp/people_with_metadata')` cell, rather than pinning
# an absolute, machine-specific path. The part-file name embeds a generated
# ID, so a hardcoded name stops matching once the overwrite is re-run.
# NOTE(review): assumes the notebook's working directory is the same one the
# relative write path resolved against — confirm for non-local Spark setups.
part_files = sorted(glob.glob('tmp/people_with_metadata/part-*.parquet'))
if not part_files:
    raise FileNotFoundError(
        "no Parquet part files found under tmp/people_with_metadata; "
        "run the cell that writes the DataFrame first"
    )

# Use the first part file in sorted order (part-00000-...), which is the
# part the original hardcoded path pointed at.
path = part_files[0]

In [13]:
# Load the Parquet part file with pyarrow. Despite the variable name, the
# result is a pyarrow.Table (see the printed output below), not a file handle.
parquet_file = pq.read_table(path)

In [14]:
# Print the Table summary; the schema shows both columns as string.
print(parquet_file)

pyarrow.Table
first_name: string
country: string


In [15]:
# Look up the Arrow field object for the `country` column.
parquet_file.schema.field('country')

pyarrow.Field<country: string>

In [16]:
# Inspect the field-level metadata as pyarrow sees it. Per the output, only
# b'PARQUET:field_id' is present; there is no b'model_version' entry.
parquet_file.schema.field('country').metadata

{b'PARQUET:field_id': b'2'}

In [17]:
# The 'model_version' metadata set on the Spark StructField is not exposed as
# Arrow field-level metadata, so this lookup raises KeyError. The error is the
# point of the demonstration: catch and display it so the result stays visible
# without aborting a Restart & Run All.
# NOTE(review): Spark presumably keeps its field metadata in the file-level
# key-value metadata (parquet_file.schema.metadata) instead — confirm.
try:
    parquet_file.schema.field('country').metadata[b'model_version']
except KeyError as err:
    print("KeyError: %s" % err)

KeyError: b'model_version'