Pull data from the `nps-public-data` [BigQuery project](https://github.com/tonymet/nps-public-data), load it into DuckDB tables, and export the database to Parquet.

In [1]:
import duckdb
from google.cloud import bigquery

# Local DuckDB database file that receives the tables copied from BigQuery.
con = duckdb.connect("../data/nps.db")

# BigQuery client whose default project is the public NPS data project.
bqclient = bigquery.Client(project="nps-public-data")

# "<project>.<dataset>" ID of the dataset to list tables from.
# Plain literal: there is nothing to interpolate, so no f-string prefix.
dataset_ref = "nps-public-data.nps_public_data"

In [2]:
# DuckDB schema that receives one table per BigQuery table.
schema = "nps_public_data"

con.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")

tables = bqclient.list_tables(dataset_ref)

# Copy each table listed in the BigQuery dataset into the DuckDB schema.
for table_item in tables:

    table_str = f"{table_item.project}.{table_item.dataset_id}.{table_item.table_id}"

    # Use the list item's own reference instead of rebuilding one from a
    # string and rebinding the loop variable.
    rows = bqclient.list_rows(table_item.reference)

    # this is a terrible pun
    # NOTE: the SQL below reads this variable by name, so keep the name
    # `arr_rows` in sync with the query text.
    arr_rows = rows.to_arrow(create_bqstorage_client=True)

    # Quote both identifiers so table IDs containing dashes or reserved
    # words still form valid SQL.
    sql_ref = f'"{schema}"."{table_item.table_id}"'

    # A single CREATE OR REPLACE replaces the previous DROP + CREATE pair.
    con.sql(f"CREATE OR REPLACE TABLE {sql_ref} AS SELECT * FROM arr_rows")

    print(table_str)

# Last expression of the cell: displays the tables now present in DuckDB.
con.sql("SHOW ALL tables")

nps-public-data.nps_public_data.activities
nps-public-data.nps_public_data.activities__parks
nps-public-data.nps_public_data.alerts
nps-public-data.nps_public_data.amenities
nps-public-data.nps_public_data.amenities__parks
nps-public-data.nps_public_data.amenities__parksplaces
nps-public-data.nps_public_data.amenities__parksvisitorcenters
nps-public-data.nps_public_data.amenities__parkvisitorcenters
nps-public-data.nps_public_data.articles
nps-public-data.nps_public_data.campgrounds
nps-public-data.nps_public_data.events
nps-public-data.nps_public_data.feespasses
nps-public-data.nps_public_data.lessonplans
nps-public-data.nps_public_data.meta
nps-public-data.nps_public_data.multimedia__audio
nps-public-data.nps_public_data.multimedia__galleries
nps-public-data.nps_public_data.multimedia__galleries__assets
nps-public-data.nps_public_data.multimedia__videos
nps-public-data.nps_public_data.newsreleases
nps-public-data.nps_public_data.parkinglots
nps-public-data.nps_public_data.parks
nps-p

┌──────────┬─────────────────┬──────────────────────┬──────────────────────┬───────────────────────────────┬───────────┐
│ database │     schema      │         name         │     column_names     │         column_types          │ temporary │
│ varchar  │     varchar     │       varchar        │      varchar[]       │           varchar[]           │  boolean  │
├──────────┼─────────────────┼──────────────────────┼──────────────────────┼───────────────────────────────┼───────────┤
│ nps      │ nps_public_data │ activities           │ [name, id]           │ [VARCHAR, VARCHAR]            │ false     │
│ nps      │ nps_public_data │ activities__parks    │ [parks, name, id]    │ [STRUCT("name" VARCHAR, ful…  │ false     │
│ nps      │ nps_public_data │ alerts               │ [relatedRoadEvents…  │ [STRUCT("type" VARCHAR, url…  │ false     │
│ nps      │ nps_public_data │ amenities            │ [categories, name,…  │ [VARCHAR[], VARCHAR, VARCHAR] │ false     │
│ nps      │ nps_public_data │ a

In [5]:
# Export the DuckDB database as Parquet files under ../data/<dbname>,
# then close the connection.
dbname = "nps"
export_dir = f"../data/{dbname}"
export_options = "FORMAT PARQUET, COMPRESSION ZSTD, ROW_GROUP_SIZE 100000"
con.sql(f"EXPORT DATABASE '{export_dir}' ({export_options});")

con.close()