In [1]:
import pandas as pd
import pyarrow as pa
from pyarrow_item_schemas import item_schema, list_of_items_schema, item, id, version, item_sub_schema

In [2]:
item_schema

id: string not null
version: string not null

In [3]:
list_of_items_schema

items: list<item: struct<id: string not null, version: string not null>> not null
  child 0, item: struct<id: string not null, version: string not null>
      child 0, id: string not null
      child 1, version: string not null

In [4]:
from pydantic import BaseModel
class Item(BaseModel):
    id: str
    version: str

raw_items = [
    Item(id=str(i), version="1")
    for i in range(4)
]
raw_items

[Item(id='0', version='1'),
 Item(id='1', version='1'),
 Item(id='2', version='1'),
 Item(id='3', version='1')]

In [5]:
df1 = pd.DataFrame([item.model_dump() for item in raw_items])
df1

Unnamed: 0,id,version
0,0,1
1,1,1
2,2,1
3,3,1


In [6]:
items1 = [item.model_dump() for item in raw_items][:2]
items2 = [item.model_dump() for item in raw_items][2:]
items1, items2

([{'id': '0', 'version': '1'}, {'id': '1', 'version': '1'}],
 [{'id': '2', 'version': '1'}, {'id': '3', 'version': '1'}])

In [7]:
import pandas as pd
df2 = pd.DataFrame(data={
    "items": [items1, items2]
})
df2

Unnamed: 0,items
0,"[{'id': '0', 'version': '1'}, {'id': '1', 'ver..."
1,"[{'id': '2', 'version': '1'}, {'id': '3', 'ver..."


In [8]:
list_of_items_schema

items: list<item: struct<id: string not null, version: string not null>> not null
  child 0, item: struct<id: string not null, version: string not null>
      child 0, id: string not null
      child 1, version: string not null

In [9]:
pa.Table.from_pandas(df1, schema=item_schema)

pyarrow.Table
id: string not null
version: string not null
----
id: [["0","1","2","3"]]
version: [["1","1","1","1"]]

In [10]:
pa.Table.from_pandas(df1, schema=item_sub_schema)

pyarrow.Table
id: string not null
----
id: [["0","1","2","3"]]

In [11]:
pa.Table.from_pandas(df2, schema=list_of_items_schema)

pyarrow.Table
items: list<item: struct<id: string not null, version: string not null>> not null
  child 0, item: struct<id: string not null, version: string not null>
      child 0, id: string not null
      child 1, version: string not null
----
items: [[    -- is_valid: all not null
    -- child 0 type: string
["0","1"]
    -- child 1 type: string
["1","1"],    -- is_valid: all not null
    -- child 0 type: string
["2","3"]
    -- child 1 type: string
["1","1"]]]

In [14]:
df2_copy = df2.copy(deep=True)
df2_copy

Unnamed: 0,items
0,"[{'id': '0', 'version': '1'}, {'id': '1', 'ver..."
1,"[{'id': '2', 'version': '1'}, {'id': '3', 'ver..."


In [15]:
df2_copy['objects'] = df2_copy.apply(lambda row: [Item.model_validate(i) for i in row['items']], axis=1)
df2_copy

Unnamed: 0,items,objects
0,"[{'id': '0', 'version': '1'}, {'id': '1', 'ver...","[id='0' version='1', id='1' version='1']"
1,"[{'id': '2', 'version': '1'}, {'id': '3', 'ver...","[id='2' version='1', id='3' version='1']"


In [16]:
df2_copy.iloc[0]['objects'][0]

Item(id='0', version='1')

In [17]:
df3 = df1.copy(deep=True)
df3

Unnamed: 0,id,version
0,0,1
1,1,1
2,2,1
3,3,1


In [29]:
df3_nulled = pd.concat([df3, pd.DataFrame([pd.Series({"id": "4", "version": None})])])
df3_nulled

Unnamed: 0,id,version
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
0,4,


In [30]:
pa.Table.from_pandas(df3_nulled, schema=item_schema)

ValueError: Field pyarrow.Field<version: string not null> was non-nullable but pandas column had 1 null values

In [33]:
items2_plus_nulled = items2 + [{"id": "4", "version": None}]
items2_plus_nulled

[{'id': '2', 'version': '1'},
 {'id': '3', 'version': '1'},
 {'id': '4', 'version': None}]

In [34]:
df2_plus_nulled = pd.DataFrame(data={
    "items": [items1, items2_plus_nulled]
})
df2_plus_nulled

Unnamed: 0,items
0,"[{'id': '0', 'version': '1'}, {'id': '1', 'ver..."
1,"[{'id': '2', 'version': '1'}, {'id': '3', 'ver..."


In [35]:
pa.Table.from_pandas(df2_plus_nulled, schema=list_of_items_schema)

pyarrow.Table
items: list<item: struct<id: string not null, version: string not null>> not null
  child 0, item: struct<id: string not null, version: string not null>
      child 0, id: string not null
      child 1, version: string not null
----
items: [[    -- is_valid: all not null
    -- child 0 type: string
["0","1"]
    -- child 1 type: string
["1","1"],    -- is_valid: all not null
    -- child 0 type: string
["2","3","4"]
    -- child 1 type: string
["1","1",null]]]

In [42]:
df2_plus_nulled.iloc[1]['items']

[{'id': '2', 'version': '1'},
 {'id': '3', 'version': '1'},
 {'id': '4', 'version': None}]

In [43]:
df3["another_column"] = "foo"

In [44]:
df3

Unnamed: 0,id,version,another_column
0,0,1,foo
1,1,1,foo
2,2,1,foo
3,3,1,foo


In [45]:
pa.Table.from_pandas(df3, schema=item_schema)

pyarrow.Table
id: string not null
version: string not null
----
id: [["0","1","2","3"]]
version: [["1","1","1","1"]]

# number schema

In [48]:
int_id = pa.field('id', pa.uint64(), nullable=False)
int_version = pa.field('version', pa.uint32(), nullable=False)
int_item_schema = pa.schema([int_id, int_version])

In [63]:
df1_int = df1.copy(deep=True)
df1_int['id'] = df1_int['id'].astype('int32')
df1_int['version'] = df1_int['version'].astype('uint32')

In [70]:
out_table = pa.Table.from_pandas(df1_int, schema=int_item_schema)
out_table

pyarrow.Table
id: uint64 not null
version: uint32 not null
----
id: [[0,1,2,3]]
version: [[1,1,1,1]]

In [71]:
df1_int.dtypes

id          int32
version    uint32
dtype: object