## Parte 1

In [39]:
import shutil
import os

# Directory (relative to the working directory) that backs the local warehouse.
table_dir = "rizzoli_warehouse"

# Remove whatever a previous run left behind so every run starts empty.
stale_warehouse_present = os.path.exists(table_dir)
if stale_warehouse_present:
    shutil.rmtree(table_dir)

# Recreate the now-missing directory.
os.makedirs(table_dir, exist_ok=True)

In [40]:
from pyiceberg.catalog.sql import SqlCatalog

# Absolute path of the warehouse directory; the SQLite catalog file and the
# warehouse root are both placed under it.
warehouse_path = os.path.abspath(f"./{table_dir}")

catalog_properties = {
    "uri": f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
    "warehouse": f"file://{warehouse_path}",
}
catalog = SqlCatalog("acme_corp", **catalog_properties)

In [41]:
# Create the "registry" namespace; the table identifier used later
# ("registry.employees") is placed under it.
catalog.create_namespace("registry")

## Parte 2

In [42]:
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, LongType, DateType

# Employee table layout. Iceberg identifies columns by field ID, so each field
# must carry an ID that is unique within the schema.
schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
    NestedField(field_id=3, name="hire_date", field_type=DateType(), required=False),
)

# Create the table inside the "registry" namespace; `table` is the handle the
# following cells append to and inspect.
table = catalog.create_table("registry.employees", schema)

In [43]:
import pyarrow as pa
from datetime import date
# First batch of employees, one (id, name, hire_date) tuple per person.
first_batch = [
    (1, "Alice", date(2020, 1, 1)),
    (2, "Bob", date(2020, 1, 2)),
    (3, "Charlie", date(2020, 1, 3)),
]
data = [
    {"id": emp_id, "name": emp_name, "hire_date": hired_on}
    for emp_id, emp_name, hired_on in first_batch
]

# Build a PyArrow table from the row dictionaries.
arrow_table = pa.Table.from_pylist(data)

# Append the batch to the Iceberg table.
table.append(arrow_table)



## Parte 3

In [44]:
# Second batch of employees: pair each value tuple with the column names.
column_names = ("id", "name", "hire_date")
second_batch = (
    (4, "David", date(2020, 1, 4)),
    (5, "Eve", date(2020, 1, 5)),
)
data = [dict(zip(column_names, values)) for values in second_batch]

# Build a PyArrow table from the row dictionaries.
arrow_table = pa.Table.from_pylist(data)

# Append the batch to the Iceberg table.
table.append(arrow_table)



In [45]:
from pyiceberg.table import Table

def count_table_snapshots(table: Table) -> int:
    """Return the number of snapshots listed by ``table.snapshots()``.

    Args:
        table: PyIceberg ``Table`` to inspect.

    Returns:
        Length of the snapshot list.
    """
    snapshot_list = table.snapshots()
    return len(snapshot_list)

# Show how many snapshots the table has after the two appends above.
print(count_table_snapshots(table))

2


## Parte 4

In [46]:
def rows_at_second_last_snapshot(table: Table) -> int:
    """Count the rows visible in the snapshot just before the latest one.

    Args:
        table: PyIceberg ``Table`` to inspect.

    Returns:
        Number of rows returned by a scan pinned to the second-to-last entry
        of ``table.snapshots()``.

    Raises:
        IndexError: If the table has fewer than two snapshots.
    """
    snapshots = table.snapshots()
    if len(snapshots) < 2:
        # Same exception type as plain indexing would raise, but with a
        # message that says what is actually missing.
        raise IndexError(
            "table needs at least two snapshots to read the second last one, "
            f"found {len(snapshots)}"
        )
    # NOTE(review): relies on snapshots() listing the oldest snapshot first — confirm.
    second_last_snapshot_id = snapshots[-2].snapshot_id
    # Pin the scan to that snapshot so data added afterwards is not counted.
    return table.scan(snapshot_id=second_last_snapshot_id).to_arrow().num_rows

# Show the row count as of the snapshot before the latest one.
print(rows_at_second_last_snapshot(table))

3
