```bash
conda install pyiceberg
conda install sqlalchemy
```

In [32]:
import shutil
import os
def _reset_directory(path):
    """Delete ``path`` (and everything below it) if it exists, then recreate it empty."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)


# Start from an empty warehouse directory so the notebook can be re-run from scratch.
table_dir = "iceberg_warehouse"
_reset_directory(table_dir)

For the sake of demonstration, we'll configure the catalog to use the `SqlCatalog` implementation, which stores its information in a local SQLite database. We'll also configure the catalog to store data files on the local filesystem instead of in an object store. This setup should not be used in production because of its limited scalability.

In [33]:
from pyiceberg.catalog.sql import SqlCatalog

# Absolute path of the local warehouse directory; both the SQLite catalog
# database and the table files are placed underneath it.
warehouse_path = os.path.abspath("./iceberg_warehouse")

# Catalog properties are passed as plain keyword arguments:
#   uri       -> SQLAlchemy connection string for the SQLite catalog database
#   warehouse -> root location (a file:// URI) under which table files are written
catalog = SqlCatalog(
    "default",
    uri=f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
    warehouse=f"file://{warehouse_path}",
)

In [34]:
# Create the "default" namespace in the catalog; the table created later
# ("default.my_table") is registered under it.
catalog.create_namespace("default")

In [35]:
import os
from pyiceberg.catalog import Catalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, LongType
from pyiceberg.table import Table
from pyiceberg.io import FileIO
import json
import shutil

# Apache Iceberg is an open table format for huge analytic datasets. It is designed to improve on the
# performance and usability of existing table formats like Hive, Hudi, and Delta Lake.

# PyIceberg is a Python library for interacting with Apache Iceberg tables.

# In this tutorial, we will understand the metadata files of Apache Iceberg using PyIceberg and local files only.


# Step 2: Create a local directory for the Iceberg table
# NOTE(review): the catalog configured earlier writes table files under its
# own warehouse path ("iceberg_warehouse"), so this directory looks unused by
# the cells shown here - confirm before removing it.
table_dir = "iceberg_table"
os.makedirs(table_dir, exist_ok=True)

# Step 3: Initialize an Iceberg table

# Define the schema for the table: two optional columns.
# Iceberg identifies columns by field ID rather than by name, so every field
# in a schema needs its own unique field_id.
schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)

# Register the table "my_table" in the "default" namespace of the existing catalog
table = catalog.create_table("default.my_table", schema)


We have our first metadata file.

Now we can write data to the table. PyIceberg is nicely integrated with PyArrow: we create an Arrow table and append it to the Iceberg table.

In [36]:
import pyarrow as pa


# Step 4: Prepare some data for the table

# One dictionary per row, keyed by column name
data = [
    {"id": row_id, "name": row_name}
    for row_id, row_name in [(1, "Alice"), (2, "Bob")]
]

# Convert the list of row dictionaries into a PyArrow Table
arrow_table = pa.Table.from_pylist(data)

# Display the Arrow table (last expression of the cell); nothing is written
# to the Iceberg table yet - that happens in the next cell.
arrow_table

pyarrow.Table
id: int64
name: string
----
id: [[1,2]]
name: [["Alice","Bob"]]

In [37]:
# Append the rows of the Arrow table to the Iceberg table.
table.append(arrow_table)



Let's look at the metadata folder again.

We start with the manifest list.

In [39]:

from avro.datafile import DataFileReader
from avro.io import DatumReader

import glob

metadata_folder = './iceberg_warehouse/default.db/my_table/metadata'

# Manifest lists are written as snap-<snapshot id>-...avro. The snapshot id and
# the UUID in the name change on every run, so the file is looked up by
# pattern instead of hard-coding one run's file name.
manifest_list_paths = glob.glob(os.path.join(metadata_folder, 'snap-*.avro'))
if not manifest_list_paths:
    raise FileNotFoundError(
        f"No manifest list (snap-*.avro) found in {metadata_folder}"
    )

# If several snapshots exist, inspect the most recently written manifest list.
manifest_list_path = max(manifest_list_paths, key=os.path.getmtime)

# The with-block closes the file even if constructing or iterating the Avro
# reader raises.
with open(manifest_list_path, "rb") as manifest_list_file:
    reader = DataFileReader(manifest_list_file, DatumReader())
    try:
        # Each record is a dictionary describing one manifest file of the snapshot
        for manifest_entry in reader:
            print(manifest_entry)
    finally:
        reader.close()

{'manifest_path': 'file:///Users/marcosantoni/Desktop/data-lake-course/local_pyiceberg/iceberg_warehouse/default.db/my_table/metadata/e771a111-380e-49db-bb65-5420db7d0c90-m0.avro', 'manifest_length': 4367, 'partition_spec_id': 0, 'content': 0, 'sequence_number': 1, 'min_sequence_number': 1, 'added_snapshot_id': 3096979458630272547, 'added_files_count': 1, 'existing_files_count': 0, 'deleted_files_count': 0, 'added_rows_count': 2, 'existing_rows_count': 0, 'deleted_rows_count': 0, 'partitions': [], 'key_metadata': None}


Then we look at the actual manifest file.

In [38]:

from avro.datafile import DataFileReader
from avro.io import DatumReader

import glob

metadata_folder = './iceberg_warehouse/default.db/my_table/metadata'

# Manifest files are written as <uuid>-m<N>.avro. The UUID changes on every
# run, so the file is looked up by pattern instead of hard-coding one run's
# file name. Manifest lists (snap-*.avro) do not match this pattern.
manifest_paths = glob.glob(os.path.join(metadata_folder, '*-m[0-9]*.avro'))
if not manifest_paths:
    raise FileNotFoundError(
        f"No manifest file (*-m<N>.avro) found in {metadata_folder}"
    )

# If several manifests exist, inspect the most recently written one.
manifest_path = max(manifest_paths, key=os.path.getmtime)

# The with-block closes the file even if constructing or iterating the Avro
# reader raises.
with open(manifest_path, "rb") as manifest_file:
    reader = DataFileReader(manifest_file, DatumReader())
    try:
        # Each record is a dictionary describing one data file tracked by the manifest
        for data_file_entry in reader:
            print(data_file_entry)
    finally:
        reader.close()

{'status': 1, 'snapshot_id': 3096979458630272547, 'sequence_number': None, 'file_sequence_number': None, 'data_file': {'content': 0, 'file_path': 'file:///Users/marcosantoni/Desktop/data-lake-course/local_pyiceberg/iceberg_warehouse/default.db/my_table/data/00000-0-e771a111-380e-49db-bb65-5420db7d0c90.parquet', 'file_format': 'PARQUET', 'partition': {}, 'record_count': 2, 'file_size_in_bytes': 915, 'column_sizes': [{'key': 1, 'value': 118}, {'key': 2, 'value': 90}], 'value_counts': [{'key': 1, 'value': 2}, {'key': 2, 'value': 2}], 'null_value_counts': [{'key': 1, 'value': 0}, {'key': 2, 'value': 0}], 'nan_value_counts': [], 'lower_bounds': [{'key': 1, 'value': b'\x01\x00\x00\x00\x00\x00\x00\x00'}, {'key': 2, 'value': b'Alice'}], 'upper_bounds': [{'key': 1, 'value': b'\x02\x00\x00\x00\x00\x00\x00\x00'}, {'key': 2, 'value': b'Bob'}], 'key_metadata': None, 'split_offsets': [4], 'equality_ids': None, 'sort_order_id': None}}


/Users/marcosantoni/miniconda3/envs/data_file_formats/lib/python3.12/site-packages/avro/schema.py:1233: IgnoredLogicalType: Unknown map, using array.


Let's add another record.

In [40]:
# One more row, in the same list-of-dictionaries shape used for the first batch
data = [{"id": 3, "name": "Daniel"}]

# Convert the row to a PyArrow Table ...
arrow_table = pa.Table.from_pylist(data)

# ... and append it to the Iceberg table
table.append(arrow_table)





We now have quite a few additional files in the `data` and `metadata` folders.