In [1]:
import sys
from datetime import UTC, datetime
from pathlib import Path

import lancedb
import pyarrow as pa

In [2]:
# def fixed_vec_type(dim: int, dtype):
#     # Compatible with older pyarrow versions
#     return pa.list_(dtype, list_size=dim)

In [3]:
def upsert_config_metadata(cfg_tbl, kv: dict):
    """Insert or update key/value rows in a config table.

    Keys and values are coerced to ``str``. A key that already appears in the
    ``key`` column has its ``value`` replaced on every matching row; an unseen
    key is appended as a new row.

    The table is rewritten with a single ``add(..., mode="overwrite")`` call
    instead of "delete all rows, then add", so a failure while writing cannot
    leave the table emptied.

    Args:
        cfg_tbl: Table object exposing ``to_pandas()``, ``schema`` and
            ``add(data, mode=...)``. Its rows are expected to have exactly
            the columns ``key`` and ``value``, in that order.
        kv: Mapping of config keys to values. An empty mapping is a no-op
            (the table is not read or rewritten).
    """
    if not kv:
        return

    # Load existing config into memory (small table -> safe & fast)
    df = cfg_tbl.to_pandas()

    # Update / insert keys
    for k, v in kv.items():
        k, v = str(k), str(v)
        matches = df["key"] == k
        if matches.any():
            df.loc[matches, "value"] = v
        else:
            # Positional append: relies on the (key, value) column order.
            df.loc[len(df)] = [k, v]

    # Reuse the table's own schema so the rewrite cannot change column types.
    new_tbl = pa.Table.from_pandas(df, schema=cfg_tbl.schema, preserve_index=False)
    cfg_tbl.add(new_tbl, mode="overwrite")

In [4]:
def open_config_table(db, name):
    """Return the table called ``name`` from ``db``.

    No existence check is made here: whatever ``db.open_table`` raises for a
    missing table propagates to the caller.
    """
    table = db.open_table(name)
    return table

In [5]:
def ensure_tables(
    db_uri: str,
    image_table: str,
    patch_table: str,
    dim: int,
    vec_dtype: str,
):
    """Connect to a LanceDB database and (re)create the two embedding tables.

    Both tables are created with ``mode="overwrite"``: an existing table with
    the same name, including its rows, is replaced by a new empty table.
    This avoids the separate ``table_names()`` lookup + ``drop_table`` step,
    which misses tables when the name listing is paginated.

    Args:
        db_uri: Location handed to ``lancedb.connect``.
        image_table: Name of the per-image table (one row per image).
        patch_table: Name of the per-patch table (one row per patch).
        dim: Embedding dimension. Accepted but currently unused: the schemas
            below contain no vector column.
        vec_dtype: Embedding dtype name. Accepted but currently unused, for
            the same reason as ``dim``.

    Returns:
        Tuple ``(db, img_tbl, pat_tbl)``: the connection and the two newly
        created tables.
    """
    db = lancedb.connect(db_uri)

    # --- Image embeddings schema (1 row per image) ---
    image_schema = pa.schema(
        [
            pa.field("image_id", pa.string()),  # same as source image id
        ]
    )

    # --- Patch embeddings schema (1 row per patch) ---
    patch_schema = pa.schema(
        [
            pa.field("patch_id", pa.string()),  # f"{image_id}:{patch_index}"
            pa.field("image_id", pa.string()),  # source image id
            pa.field("patch_index", pa.int32()),
        ]
    )

    # Always start from empty tables: overwrite replaces any previous version.
    img_tbl = db.create_table(image_table, schema=image_schema, mode="overwrite")
    pat_tbl = db.create_table(patch_table, schema=patch_schema, mode="overwrite")

    return db, img_tbl, pat_tbl

In [6]:
# Repository root, derived from the kernel's working directory
# (assumes the notebook runs two directory levels below the root — TODO confirm).
PROJECT_ROOT = Path.cwd().parent.parent

# Existing source database that holds the raw images
SOURCE_URI = PROJECT_ROOT / "data" / "lancedb" / "shared_source"
IMG_RAW_TBL_NAME = "era5_sample_images"

# Experiment database and the tables this notebook (re)creates inside it
PROJECT_NAME = "dinov3"
AUTHOR = "Cherukuru. N. W"
DB_URI = PROJECT_ROOT / "data" / "lancedb" / "experiments" / "era5"
CONFIG_TBL_NAME = f"{PROJECT_NAME}_config"
IMG_EMB_TBL_NAME = f"{PROJECT_NAME}_image_embeddings"
PATCH_EMB_TBL_NAME = f"{PROJECT_NAME}_patch_embeddings"

EMB_DIM = 768

VEC_DTYPE = "float16"  # or "float32"

# Kept identical to the --model value passed to
# helpers/dino_embeddings_lancedb.py further down, so the recorded
# configuration names the model that actually produces the embeddings.
MODEL = "vit_base_patch16_dinov3"

In [7]:
# Sanity check: display the config table name derived from PROJECT_NAME
CONFIG_TBL_NAME

'dinov3_config'

In [8]:
# (Re)create the empty embedding tables in the experiment database.
db, img_tbl, patch_tbl = ensure_tables(
    DB_URI,
    image_table=IMG_EMB_TBL_NAME,
    patch_table=PATCH_EMB_TBL_NAME,
    dim=EMB_DIM,
    vec_dtype=VEC_DTYPE,
)

# Global metadata, stored as key/value string rows.
config_data = [
    {"key": "created_at", "value": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")},
    {"key": "author", "value": AUTHOR},
    {"key": "source", "value": IMG_RAW_TBL_NAME},
    {"key": "source path", "value": str(SOURCE_URI.relative_to(PROJECT_ROOT))},
    {"key": "tbl_img_emb", "value": IMG_EMB_TBL_NAME},
    {"key": "tbl_patch_emb", "value": PATCH_EMB_TBL_NAME},
]

# Create the config table.
# mode="overwrite" replaces a table left behind by an earlier run in a single
# call, so stale config rows cannot survive a re-run and no separate
# table_names() lookup / drop_table step is needed.
# LanceDB infers the simple schema (key: str, value: str) from the list of dicts.
config_table = db.create_table(CONFIG_TBL_NAME, data=config_data, mode="overwrite")
print(f"Table {CONFIG_TBL_NAME} created with global metadata.")

Table dinov3_config created with global metadata.


In [9]:
# Echo the names of the three tables created above
print("Tables ready:", *(t.name for t in (img_tbl, patch_tbl, config_table)))

Tables ready: dinov3_image_embeddings dinov3_patch_embeddings dinov3_config


In [10]:
# Schema of the image-embedding table returned by ensure_tables
img_tbl.schema

image_id: string

In [11]:
# Schema of the patch-embedding table returned by ensure_tables
patch_tbl.schema

patch_id: string
image_id: string
patch_index: int32

In [12]:
# Show the config rows. The DataFrame is the cell's last expression, so
# Jupyter renders it directly (no print needed).
df_config = config_table.to_pandas()
df_config

             key                       value
0     created_at        2026-01-13T07:57:06Z
1         author             Cherukuru. N. W
2         source          era5_sample_images
3    source path  data/lancedb/shared_source
4    tbl_img_emb     dinov3_image_embeddings
5  tbl_patch_emb     dinov3_patch_embeddings


In [13]:
!{sys.executable} ./helpers/dino_embeddings_lancedb.py \
  --db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/shared_source \
  --table era5_sample_images \
  --config_db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5 \
  --config_table dinov3_config \
  --model vit_base_patch16_dinov3 \
  --batch 512 \
  --scan_batch 2000 \
  --workers 16 \
  --shard_size 1000 \
  --save_embeddings

Processed=256  Skipped(missing_blob)=0  Skipped(decode_fail)=0
Processed=512  Skipped(missing_blob)=0  Skipped(decode_fail)=0
Processed=768  Skipped(missing_blob)=0  Skipped(decode_fail)=0
Processed=1024  Skipped(missing_blob)=0  Skipped(decode_fail)=0
Processed=1095  Skipped(missing_blob)=0  Skipped(decode_fail)=0

Done.
- run_id:        c73088df-adea-4e6d-abd6-a3038f003866
- processed:     1095
- skipped_blob:  0
- skipped_decode:0
- device:        cuda
- dtype_used:    fp16
- img_size:      256
- Embeddings:    preprocessed/embeddings
- Tensors:       (skipped)
- Config table:  dinov3_config (appended)


In [14]:
# Reopen the config table from disk to inspect what the embedding helper wrote.
# Location and name come from the config cell rather than hardcoded copies.
config_db_path = str(DB_URI)
config_table_name = CONFIG_TBL_NAME

# Connect and open table (raises if the table is missing)
db = lancedb.connect(config_db_path)
tbl = open_config_table(db, config_table_name)

# Load into pandas for easy viewing
df = tbl.to_pandas()

df

Unnamed: 0,key,value
0,author,Cherukuru. N. W
1,created_at,2026-01-13T07:57:11Z
2,source,era5_sample_images
3,source path,/glade/work/ncheruku/research/bams-ai-data-exp...
4,tbl_img_emb,/glade/work/ncheruku/research/bams-ai-data-exp...
5,tbl_patch_emb,
6,run_id,c73088df-adea-4e6d-abd6-a3038f003866
7,db_uri,/glade/work/ncheruku/research/bams-ai-data-exp...
8,img_table,era5_sample_images
9,img_blob_field,image_blob


In [15]:
# Value stored under the "tbl_img_emb" key (first matching row).
# NOTE(review): the recorded output is a filesystem path, not the table name
# written when the config table was created — confirm the helper script is
# meant to overwrite this key.
df.loc[df["key"] == "tbl_img_emb", "value"].iloc[0]

'/glade/work/ncheruku/research/bams-ai-data-exploration/notebooks/02-generate-embeddings/preprocessed/embeddings'