In [26]:
from pyiceberg.catalog import load_catalog
import pyarrow.parquet as pq
import os
from pathlib import Path
import geopandas as gpd
from shapely import wkb

In [29]:
def add_parquet_to_catalog(file_path, table_name, namespace="hydrofabric"):
    """Register a parquet file as an Iceberg table, reusing the table if it exists.

    Relies on the notebook-level ``catalog`` object being defined before the call.

    Parameters
    ----------
    file_path : str
        Parquet file handed to ``pq.read_table``.
    table_name : str
        Name of the table inside ``namespace``.
    namespace : str, default "hydrofabric"
        Catalog namespace that holds the table.

    Returns
    -------
    The object returned by ``catalog.load_table`` when the table already
    exists, otherwise the object returned by ``catalog.create_table`` after the
    file's contents have been appended to it.

    Raises
    ------
    Exception
        Whatever ``append`` raised. The newly created table is dropped before
        the exception propagates.
    """
    identifier = f"{namespace}.{table_name}"

    # An existing table is returned as-is; the parquet file is not read again.
    if catalog.table_exists(identifier):
        print(f"Table {table_name} already exists, loading it")
        return catalog.load_table(identifier)

    # Read the parquet file
    arrow_table = pq.read_table(file_path)

    # Create the table in the catalog using the file's own schema
    iceberg_table = catalog.create_table(identifier, schema=arrow_table.schema)

    # Append the data to the table
    try:
        iceberg_table.append(arrow_table)
    except Exception:
        # A failed load would otherwise leave an empty table behind, and the
        # next run would report it as "already exists" and return it unfilled.
        catalog.drop_table(identifier)
        raise

    print(f"Added {file_path} as table {table_name}")
    return iceberg_table

In [30]:
# Collect every *.parquet file located directly inside the data directory
parquet_dir = "../data/parquet"
parquet_files = [*Path(parquet_dir).glob("*.parquet")]

# Catalog tables keyed by table name
tables = dict()

# Register each file under its stem (the filename without the extension)
for parquet_file in parquet_files:
    table_name = parquet_file.stem
    tables[table_name] = add_parquet_to_catalog(os.fspath(parquet_file), table_name)

Added ../data/parquet/network.parquet as table network
Added ../data/parquet/nexus.parquet as table nexus
Added ../data/parquet/flowpath-attributes.parquet as table flowpath-attributes
Added ../data/parquet/divides.parquet as table divides
Added ../data/parquet/pois.parquet as table pois
Added ../data/parquet/flowpath-attributes-ml.parquet as table flowpath-attributes-ml
Added ../data/parquet/divide-attributes.parquet as table divide-attributes
Added ../data/parquet/flowpaths.parquet as table flowpaths
Added ../data/parquet/hydrolocations.parquet as table hydrolocations
Added ../data/parquet/lakes.parquet as table lakes


In [31]:
# Print one line per table identifier that the catalog lists for the
# "hydrofabric" namespace.
print("Tables in the catalog:")
for table_id in catalog.list_tables("hydrofabric"):
    print(f"- {table_id}")

Tables in the catalog:
- ('hydrofabric', 'divide-attributes')
- ('hydrofabric', 'divides')
- ('hydrofabric', 'flowpath-attributes')
- ('hydrofabric', 'flowpath-attributes-ml')
- ('hydrofabric', 'flowpaths')
- ('hydrofabric', 'hydrolocations')
- ('hydrofabric', 'lakes')
- ('hydrofabric', 'network')
- ('hydrofabric', 'nexus')
- ('hydrofabric', 'pois')


In [40]:
def create_geodataframe(df, crs="EPSG:5070"):
    """Decode a WKB ``geometry`` column and wrap the frame as a GeoDataFrame.

    The input frame is left untouched, so the function can be called again on
    the same frame (for example when the cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``geometry`` column, when present, holds values accepted by
        ``wkb.loads`` or ``None``.
    crs : str, default "EPSG:5070"
        Coordinate reference system passed to ``gpd.GeoDataFrame``.

    Returns
    -------
    A ``gpd.GeoDataFrame`` built from a copy of ``df`` with decoded geometries,
    or ``df`` itself when it has no ``geometry`` column.
    """
    if 'geometry' not in df.columns:
        return df

    # None entries are passed through rather than handed to wkb.loads.
    decoded = df['geometry'].apply(
        lambda x: wkb.loads(x) if x is not None else None
    )

    # assign() returns a new frame; writing into df['geometry'] would replace
    # the caller's raw values and make a second call fail inside wkb.loads.
    return gpd.GeoDataFrame(df.assign(geometry=decoded), geometry='geometry', crs=crs)

In [41]:
# Scan the divides table for the rows matching divide_id 'cat-5539', load them
# into pandas, then convert the result with create_geodataframe.
divides_df = (
    tables['divides']
    .scan(row_filter="divide_id == 'cat-5539'")
    .to_pandas()
)
divides_gdf = create_geodataframe(divides_df)
divides_gdf

Unnamed: 0,divide_id,toid,type,ds_id,areasqkm,vpuid,id,lengthkm,tot_drainage_areasqkm,has_flowline,geometry
0,cat-5539,nex-5540,network,,10.4985,1,wb-5539,6.449001,2247.957002,True,"POLYGON ((2059095 2991255, 2059095 2990985, 20..."
