In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ridership Open Lakehouse Demo (Part 1): Load data to BigQuery Iceberg tables

This notebook will demonstrate a strategy to implement an open lakehouse on GCP, using Apache Iceberg,
as an open source standard for managing data, while still leveraging GCP native capabilities. This demo will use
BigQuery Managed Iceberg Tables, Managed Apache Kafka and Apache Kafka Connect to ingest streaming data, Vertex AI for Generative AI queries on top of the data, and Dataplex to govern tables.

This notebook will load data into BigQuery, backed by Parquet files, in the Apache Iceberg specification.

All data in this notebook was prepared in the previous `part0` notebook.

## Setup the environment

In [None]:
import os
# User agent string attached to every API client created in this notebook.
USER_AGENT = "cloud-solutions/data-to-ai-nb-v3"

# IPython shell capture: `!` returns the command output as a list of lines,
# and the first line is taken as the project ID.
# NOTE(review): assumes a default project is configured in gcloud — confirm.
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
# BigQuery dataset that holds the tables created in this notebook.
BQ_DATASET = "ridership_lakehouse"
# GCS bucket name, derived from the project ID.
BUCKET_NAME = f"{PROJECT_ID}-ridership-lakehouse"
# Location used for the BigQuery client, the dataset and the connection reference.
LOCATION = "us-central1"
# Name of the BigQuery connection referenced by the CREATE TABLE statements.
BQ_CONNECTION_NAME = "cloud-resources-connection"

# Echo the resolved values as a quick sanity check.
print(PROJECT_ID)
print(BUCKET_NAME)

In [None]:
from google.cloud import bigquery, storage
from google.api_core.client_info import ClientInfo

# BigQuery client bound to the project and location, tagged with the notebook's user agent.
bigquery_client = bigquery.Client(
    project=PROJECT_ID,
    location=LOCATION,
    client_info=ClientInfo(user_agent=USER_AGENT)
)
# Cloud Storage client for the same project, tagged with the same user agent.
storage_client = storage.Client(
    project=PROJECT_ID,
    client_info=ClientInfo(user_agent=USER_AGENT)
)

# Bucket handle for BUCKET_NAME, reused by later cells.
bucket = storage_client.bucket(BUCKET_NAME)

In [None]:
# create/reference the bq dataset, and clean all tables
# `dataset_ref` is the local definition; `dataset` is the object returned by the API.
# exists_ok=True is passed so the call can be repeated when the dataset is already there.
dataset_ref = bigquery.Dataset(f"{PROJECT_ID}.{BQ_DATASET}")
dataset_ref.location = LOCATION
dataset = bigquery_client.create_dataset(dataset_ref, exists_ok=True)

# WARNING: destructive — deletes every table listed in the dataset.
for table in bigquery_client.list_tables(dataset):
    bigquery_client.delete_table(table)

In [None]:
# Some helper functions

import pandas as pd

# Do not truncate long cell values (e.g. blob names) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

def display_blobs_with_prefix(prefix: str, top=20):
  """Return a DataFrame describing the first `top` blobs under `prefix`.

  Args:
    prefix: Object-name prefix to list within BUCKET_NAME.
    top: Number of rows to keep, passed to `DataFrame.head`.

  Returns:
    A pandas DataFrame with the columns "Name", "Size", "Content Type"
    and "Updated", one row per listed blob.
  """
  columns = ["Name", "Size", "Content Type", "Updated"]
  rows = []
  for blob in storage_client.list_blobs(BUCKET_NAME, prefix=prefix):
    rows.append([blob.name, blob.size, blob.content_type, blob.updated])
  listing = pd.DataFrame(rows, columns=columns)
  return listing.head(top)

def delete_blobs_with_prefix(prefix: str):
  """Delete every blob in BUCKET_NAME whose name starts with `prefix`.

  Args:
    prefix: Object-name prefix selecting the blobs to delete.
  """
  for stale_blob in storage_client.list_blobs(BUCKET_NAME, prefix=prefix):
    stale_blob.delete()


def select_top_rows(table_name: str, num_rows: int=10):
  """Fetch up to `num_rows` rows of a table in the notebook's dataset.

  Args:
    table_name: Table ID inside `PROJECT_ID.BQ_DATASET`.
    num_rows: Value placed in the query's LIMIT clause.

  Returns:
    The query result converted to a pandas DataFrame.
  """
  sql = f"""
  SELECT *
  FROM `{PROJECT_ID}.{BQ_DATASET}.{table_name}`
  LIMIT {num_rows}
  """
  query_job = bigquery_client.query(sql)
  return query_job.to_dataframe()

## Create the tables and load data

BigQuery offers two ways to work with Apache Iceberg tables: **BigLake External Tables for Iceberg** and **BigQuery Tables for Apache Iceberg**. For most migration and native BigQuery use cases, **BigQuery Tables for Apache Iceberg (managed by BigQuery) is the strongly preferred method.**

-----

**1\. BigLake External Tables for Iceberg (Managed Externally — Read-Only)**

These tables allow BigQuery to query Iceberg data managed by external systems like Spark or Hive. They are best for hybrid setups where multiple tools need read access and an external system controls the table's lifecycle.

**SQL Example:**

```sql
CREATE OR REPLACE EXTERNAL TABLE `your-project.your_dataset.your_external_iceberg_table`
  WITH CONNECTION `your-region.your_connection_name`
  OPTIONS (
         format = 'ICEBERG',
         uris = ["gs://mybucket/mydata/mytable/metadata/iceberg.metadata.json"]
   )
```

**Key Points:**

  * **External Control:** Metadata and data managed outside BigQuery.
  * **Read-Only:** BigQuery can only query; DML operations are not supported.
  * **Hybrid Fit:** Ideal for shared access from various tools.
  * **Metadata:** Manual updates for static JSON pointers; BigLake Metastore preferred for dynamic syncing in GCP.

-----

**2\. BigQuery Tables for Apache Iceberg (Managed by BigQuery)**

**This is the recommended approach for migrating your data and integrating Iceberg within BigQuery.** These tables offer full BigQuery management of Iceberg, eliminating the need for a separate catalog.

**SQL Example:**

```sql
CREATE OR REPLACE TABLE `your-project.your_dataset.your_iceberg_table`(
    <column_definition>
)
WITH CONNECTION `your-region.your_connection_name`
OPTIONS (
    file_format = 'PARQUET',
    table_format = 'ICEBERG',
    storage_uri = 'gs://your-bucket/iceberg/your_table_name'
);
```

**Why Preferred for Migration:**

BigQuery-managed Iceberg tables unlock powerful features essential for modern data solutions:

  * **Native Integration:** Seamless experience, similar to standard BigQuery tables.
  * **Full DML Support:** Perform `INSERT`, `UPDATE`, `DELETE`, `MERGE` directly with GoogleSQL.
  * **Unified Ingestion:** Supports both batch and high-throughput streaming via the Storage Write API.
  * **Schema Evolution:** BigQuery handles schema changes (add, drop, rename columns, type changes) effortlessly.
  * **Automatic Optimization:** Benefits from BigQuery's built-in optimizations like adaptive file sizing, clustering, and garbage collection.
  * **Robust Security:** Leverage BigQuery's column-level security and data masking.
  * **Simplified Operations:** Reduced overhead by letting BigQuery manage the Iceberg table lifecycle.

This method provides a more robust, integrated, and efficient way to leverage Iceberg data within the BigQuery ecosystem.

The only downside to this method is that you cannot "import" existing Iceberg metadata; instead, you have to let BigQuery create and manage the metadata for you.

### The `bus_stations` table

This table will be loaded as a BigQuery Iceberg table (option 2): managed by BigQuery, with read-only access for other processing engines.


In [None]:
# Object-name prefix and the matching gs:// URI used as the table's storage location.
bus_stops_prefix = "iceberg_data/bus_stations"
bus_stops_uri = f"gs://{BUCKET_NAME}/{bus_stops_prefix}/"

# Clear the GCS path before creating the table, then list what is left under it
# (the listing is the cell's output).
delete_blobs_with_prefix(bus_stops_prefix)
display_blobs_with_prefix(bus_stops_prefix)

In [None]:
# create the table
# Drop any previous definition first, then issue the CREATE TABLE with the Iceberg options.
bigquery_client.query(f"DROP TABLE IF EXISTS {BQ_DATASET}.bus_stations;").result()
# NOTE(review): the column name `longtitude` looks like a misspelling of `longitude`.
# It is kept as-is because other notebooks may reference it — confirm before renaming.
query = f"""
CREATE TABLE {BQ_DATASET}.bus_stations
(
  bus_stop_id INTEGER,
  address STRING,
  school_zone BOOLEAN,
  seating BOOLEAN,
  borough STRING,
  latitude FLOAT64,
  longtitude FLOAT64
)
WITH CONNECTION `{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}`
OPTIONS (
  file_format = 'PARQUET',
  table_format = 'ICEBERG',
  storage_uri = '{bus_stops_uri}');
"""
# .result() waits for the DDL job to finish before the next cell runs.
bigquery_client.query(query).result()

In [None]:
# We can view the GCS path and see that there is now an Iceberg metadata file, but no data files yet
display_blobs_with_prefix(bus_stops_prefix)

In [None]:
# we will now load the data from the CSV in GCS

# BQ tables for Apache Iceberg do not support load with truncating, so we will truncate manually, and then load
truncate = bigquery_client.query(f"DELETE FROM {BQ_DATASET}.bus_stations WHERE TRUE")
truncate.result()

# Append-mode CSV load that skips the first row of the file (presumably the header).
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
)

# Load the staged CSV from GCS into the `bus_stations` table of the dataset.
job = bigquery_client.load_table_from_uri(
    f"gs://{BUCKET_NAME}/mta_staging_data/bus_stations.csv",
    dataset.table("bus_stations"),
    job_config=job_config,
)

# Wait for the load job to complete.
job.result()

In [None]:
# List the table's GCS path again to verify that the data was written
# in the Iceberg layout, with Parquet as the file format
display_blobs_with_prefix(bus_stops_prefix)

We can see in the output that 8 Parquet files were generated under the `iceberg_data/bus_stations/data/` folder, along with one `v0.metadata.json` file under the `iceberg_data/bus_stations/metadata/` folder.

In [None]:
# show sample rows
select_top_rows("bus_stations")

### The `bus_lines` table

For the `bus_lines` table, we want to simulate a table that is managed by Spark, where BigQuery only needs to read the table.

For that we will use the `EXTERNAL` Iceberg tables (method 1), managed by OSS engines, read-only by BigQuery.

To simulate that, we will start a PySpark process to load the data in Iceberg format, and expose the metadata to BigQuery.

In [None]:
# Object-name prefix (inside the bucket) of the Iceberg warehouse that Spark will write to
WAREHOUSE_PREFIX = "external_iceberg_warehouse"

# make sure the destination to the warehouse is empty, then list it to confirm
delete_blobs_with_prefix(WAREHOUSE_PREFIX)
display_blobs_with_prefix(WAREHOUSE_PREFIX)

In [None]:
from google.cloud.dataproc_spark_connect import DataprocSparkSession
from google.cloud.dataproc_v1 import Session

# gs:// location of the Iceberg warehouse used by the Spark catalog.
WAREHOUSE = f"gs://{BUCKET_NAME}/{WAREHOUSE_PREFIX}/"

# Dataproc session configuration; the Spark properties below are set on it.
session = Session()

# Name of the Spark catalog; the table is later addressed as `<catalog>.bus_lines`.
catalog = "buses"

# Register an Iceberg SparkCatalog of type "hadoop" under the catalog name,
# with its warehouse rooted at WAREHOUSE.
session.runtime_config.properties[f"spark.sql.catalog.{catalog}"] = "org.apache.iceberg.spark.SparkCatalog"
session.runtime_config.properties[f"spark.sql.catalog.{catalog}.type"] = "hadoop"
# Disabled alternative: a BigQuery Metastore catalog implementation class.
# session.runtime_config.properties[f"spark.sql.catalog.{catalog}.catalog-impl"] = "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog"
# NOTE(review): gcp_project / gcp_location look related to the disabled BigQuery
# Metastore catalog above — confirm whether the hadoop catalog reads them.
session.runtime_config.properties[f"spark.sql.catalog.{catalog}.gcp_project"] = f"{PROJECT_ID}"
session.runtime_config.properties[f"spark.sql.catalog.{catalog}.gcp_location"] = f"{LOCATION}"
session.runtime_config.properties[f"spark.sql.catalog.{catalog}.warehouse"] = f"{WAREHOUSE}"


# Create the Spark session. This will take some time.
spark = (
    DataprocSparkSession.builder
      .appName("mount-bus-lines")
      .dataprocSessionConfig(session)
      .getOrCreate()
)

In [None]:
# Read the staged Parquet files and write them as the Iceberg table `bus_lines`
# in the Spark catalog, using write mode "overwrite".
df = spark.read.format("parquet").load(f"gs://{BUCKET_NAME}/mta_staging_data/bus_lines/")
df.write.format("iceberg").mode("overwrite").saveAsTable(f"{catalog}.bus_lines")


In [None]:
# We'll verify the blobs were written by listing the warehouse prefix
display_blobs_with_prefix(WAREHOUSE_PREFIX)

In [None]:
# Now we will mount the iceberg data as an external, read-only, table in bigquery
bigquery_client.query(f"DROP TABLE IF EXISTS {BQ_DATASET}.bus_lines;").result()

# Resolve the latest metadata version from "version-hint.text" instead of
# hard-coding v1. Re-running the Spark write without clearing the warehouse
# produces newer metadata versions, and a fixed v1 pointer would then expose
# a stale snapshot to BigQuery.
bus_lines_metadata_prefix = f"{WAREHOUSE_PREFIX}/bus_lines/metadata"
version_hint_blob = bucket.blob(f"{bus_lines_metadata_prefix}/version-hint.text")
if version_hint_blob.exists():
    bus_lines_metadata_version = int(version_hint_blob.download_as_text().strip())
else:
    # No hint file found: fall back to the first metadata version.
    bus_lines_metadata_version = 1
bus_lines_metadata_uri = (
    f"gs://{BUCKET_NAME}/{bus_lines_metadata_prefix}/"
    f"v{bus_lines_metadata_version}.metadata.json"
)

# The external table is a static pointer to one metadata file; re-run this
# cell after the Spark side commits new data to pick up the newer version.
bigquery_client.query(f"""
CREATE OR REPLACE EXTERNAL TABLE `{BQ_DATASET}.bus_lines`
  WITH CONNECTION `{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}`
  OPTIONS (
         format = 'ICEBERG',
         uris = ["{bus_lines_metadata_uri}"]
   )
""").result()


In [None]:
# show sample rows of the external table, queried through BigQuery
select_top_rows("bus_lines")

### The `ridership` table

Lastly, the `ridership` table will be loaded just like the `bus_stations` table, but this time we will cluster the table by the timestamp.

In [None]:
# Object-name prefix and the matching gs:// URI used as the table's storage location.
ridership_prefix = "iceberg_data/ridership"
ridership_uri = f"gs://{BUCKET_NAME}/{ridership_prefix}/"

# Clear the GCS path before creating the table. Blobs are selected by
# object-name prefix: object names do not contain the "gs://<bucket>/" part,
# so a glob built from the full URI would never match and nothing would be deleted.
delete_blobs_with_prefix(ridership_prefix)

# Drop any previous definition, then create the clustered Iceberg table.
bigquery_client.query(
    f'DROP TABLE IF EXISTS {BQ_DATASET}.ridership;'
).result()
_create_table_stmt = f"""
    CREATE TABLE {BQ_DATASET}.ridership (
        transit_timestamp TIMESTAMP,
        station_id INTEGER,
        ridership INTEGER
    )
    CLUSTER BY transit_timestamp
    WITH CONNECTION `{PROJECT_ID}.{LOCATION}.{BQ_CONNECTION_NAME}`
    OPTIONS (
        file_format = 'PARQUET',
        table_format = 'ICEBERG',
        storage_uri = '{ridership_uri}'
    );
"""
# .result() waits for the DDL job to finish before the next cell runs.
bigquery_client.query(_create_table_stmt).result()

In [None]:
# Load data into the table
# Reference to the `ridership` table inside the notebook's dataset.
table_ref = dataset_ref.table("ridership")

# BQ tables for Apache Iceberg do not support load with truncating, so we will truncate manually, and then load
truncate = bigquery_client.query(f"DELETE FROM {BQ_DATASET}.ridership WHERE TRUE")
truncate.result()

# Append-mode load with Parquet as the source format.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    source_format=bigquery.SourceFormat.PARQUET,
)

# Load every staged `*.parquet` file under the ridership staging path.
job = bigquery_client.load_table_from_uri(
    f"gs://{BUCKET_NAME}/mta_staging_data/ridership/*.parquet",
    table_ref,
    job_config=job_config,
)

# Wait for the load job to complete.
job.result()

In [None]:
# show sample rows of the freshly loaded `ridership` table
select_top_rows("ridership")