In [1]:
import sys
from datetime import UTC, datetime
from pathlib import Path

import lancedb
import pyarrow as pa

In [2]:
# def fixed_vec_type(dim: int, dtype):
#     # Compatible with older pyarrow versions
#     return pa.list_(dtype, list_size=dim)

In [3]:
def upsert_config_metadata(cfg_tbl, kv: dict):
    """Insert or update key/value rows in the config table.

    Every key and value is converted with ``str()``. Rows whose ``key``
    already exists in the table have their value replaced; unknown keys are
    appended as new rows.

    The write is a single ``merge_insert`` on the ``key`` column, so the table
    is never emptied first (the previous delete-then-add rewrite could lose
    the whole config if the add step failed).

    Args:
        cfg_tbl: LanceDB table with ``key`` and ``value`` columns.
        kv: Mapping of metadata keys to values.

    Returns:
        None.
    """
    # Building a dict first collapses keys that collide after str()
    # (e.g. 1 and "1"); the last one wins, matching sequential updates.
    stringified = {str(k): str(v) for k, v in kv.items()}
    if not stringified:
        # Nothing to write; leave the table untouched.
        return

    rows = [{"key": key, "value": value} for key, value in stringified.items()]
    (
        cfg_tbl.merge_insert("key")
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .execute(rows)
    )

In [4]:
def open_config_table(db, name):
    """Return the table called ``name`` from the connection ``db``.

    No existence check is made here: whatever ``db.open_table`` raises for a
    missing table propagates to the caller.
    """
    table = db.open_table(name)
    return table

In [5]:
def get_table_names(db):
    """Return the table names known to ``db`` as a plain list.

    ``db.list_tables()`` may return either an iterable of names or a response
    object that carries the names in a ``tables`` attribute. Iterating such a
    response object directly is not guaranteed to yield the names (it may
    yield its fields instead), so the attribute is preferred when present.

    Args:
        db: Connection object exposing ``list_tables()``.

    Returns:
        list: The table names.
    """
    response = db.list_tables()
    names = getattr(response, "tables", None)
    if names is None:
        # Plain list / iterable of names.
        names = response
    return list(names)

In [6]:
def ensure_tables(
    db_uri: str,
    image_table: str,
    patch_table: str,
    dim: int,
    vec_dtype: str,
    overwrite: bool = True,
):
    """Connect to the LanceDB database and return the two embedding tables.

    Args:
        db_uri: Location of the database; passed unchanged to
            ``lancedb.connect``.
        image_table: Name of the per-image table (one row per image).
        patch_table: Name of the per-patch table (one row per patch).
        dim: Currently unused; the schemas built here contain no vector
            column. Kept for interface compatibility.
        vec_dtype: Currently unused, for the same reason as ``dim``.
        overwrite: When True (the default, matching the historical
            behaviour) both tables are re-created with
            ``mode="overwrite"``, which DISCARDS any rows already stored in
            them. When False an existing table is opened as-is and a table
            is only created when it is missing.

    Returns:
        tuple: ``(db, img_tbl, pat_tbl)``.
    """
    db = lancedb.connect(db_uri)

    # --- Image embeddings schema (1 row per image) ---
    image_schema = pa.schema(
        [
            pa.field("image_id", pa.string()),  # same as source image id
        ]
    )

    # --- Patch embeddings schema (1 row per patch) ---
    patch_schema = pa.schema(
        [
            pa.field("patch_id", pa.string()),  # f"{image_id}:{patch_index}"
            pa.field("image_id", pa.string()),  # source image id
            pa.field("patch_index", pa.int32()),
        ]
    )

    def _get_table(name, schema):
        # Destructive path: always start from an empty table.
        if overwrite:
            return db.create_table(name, schema=schema, mode="overwrite")
        # Non-destructive path: open first so an existing table is never
        # validated against (or replaced by) the minimal schema above.
        # NOTE(review): the exception type raised for a missing table is
        # assumed to be FileNotFoundError or ValueError depending on the
        # LanceDB version — confirm against the installed release.
        try:
            return db.open_table(name)
        except (FileNotFoundError, ValueError):
            return db.create_table(name, schema=schema, mode="create")

    img_tbl = _get_table(image_table, image_schema)
    pat_tbl = _get_table(patch_table, patch_schema)

    return db, img_tbl, pat_tbl

In [7]:
# Repository root, taken as two levels above the kernel's working directory.
PROJECT_ROOT = Path.cwd().parent.parent

# --- Source data: existing database holding the raw images ---
SOURCE_URI = PROJECT_ROOT / "data" / "lancedb" / "shared_source"
IMG_RAW_TBL_NAME = "era5_sample_images"

# --- Experiment database and the table names derived from the project name ---
PROJECT_NAME = "dinov3"
AUTHOR = "Cherukuru. N. W"
DB_URI = PROJECT_ROOT / "data" / "lancedb" / "experiments" / "era5"
CONFIG_TBL_NAME = PROJECT_NAME + "_config"
IMG_EMB_TBL_NAME = PROJECT_NAME + "_image_embeddings"
PATCH_EMB_TBL_NAME = PROJECT_NAME + "_patch_embeddings"

# --- Embedding settings ---
EMB_DIM = 768

VEC_DTYPE = "float16"  # or "float32"

# Same model name that is passed to the embedding script via --model; the
# previous value named a DINOv2 ViT-L checkpoint, which matched neither the
# project name nor EMB_DIM.
MODEL = "vit_base_patch16_dinov3"

In [8]:
# Sanity check: display the derived config table name
CONFIG_TBL_NAME

'dinov3_config'

In [9]:
# Create the embedding tables and keep handles to the connection and tables.
db, img_tbl, patch_tbl = ensure_tables(
    db_uri=DB_URI,
    image_table=IMG_EMB_TBL_NAME,
    patch_table=PATCH_EMB_TBL_NAME,
    dim=EMB_DIM,
    vec_dtype=VEC_DTYPE,
)

# Global metadata for this experiment, stored as key/value string rows.
config_data = [
    {"key": key, "value": value}
    for key, value in (
        ("created_at", datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")),
        ("author", AUTHOR),
        ("source", IMG_RAW_TBL_NAME),
        ("source path", str(SOURCE_URI.relative_to(PROJECT_ROOT))),
        ("tbl_img_emb", IMG_EMB_TBL_NAME),
        ("tbl_patch_emb", PATCH_EMB_TBL_NAME),
    )
]

# No explicit schema is given: LanceDB infers (key: str, value: str) from the
# list of dicts.
config_table = db.create_table(CONFIG_TBL_NAME, mode="overwrite", data=config_data)
print(f"Table {CONFIG_TBL_NAME} created with global metadata.")

Table dinov3_config created with global metadata.


In [10]:
# Confirm the three table handles by printing their names
print("Tables ready:", img_tbl.name, patch_tbl.name, config_table.name)

Tables ready: dinov3_image_embeddings dinov3_patch_embeddings dinov3_config


In [11]:
# Display the image-embedding table schema
img_tbl.schema

image_id: string

In [12]:
# Display the patch-embedding table schema
patch_tbl.schema

patch_id: string
image_id: string
patch_index: int32

In [13]:
# Show the config table's schema. A bare expression that is not the last line
# of a cell is never displayed, so it is printed explicitly.
print(config_table.schema)
# Convert to Pandas DataFrame for a table print
df_config = config_table.to_pandas()
print(df_config)

             key                       value
0     created_at        2026-02-12T07:09:07Z
1         author             Cherukuru. N. W
2         source          era5_sample_images
3    source path  data/lancedb/shared_source
4    tbl_img_emb     dinov3_image_embeddings
5  tbl_patch_emb     dinov3_patch_embeddings


In [14]:
!{sys.executable} ./helpers/v3_dino_embeddings_lancedb.py \
  --db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/shared_source \
  --table era5_sample_images \
  --img_id_field id \
  --out_prefix {PROJECT_NAME} \
  --config_db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5 \
  --config_table dinov3_config \
  --model vit_base_patch16_dinov3 \
  --batch 512 \
  --scan_batch 2000 \
  --workers 16



Processed=1095  Skipped(missing_blob)=0  Skipped(decode_fail)=0

Done.
- run_id: bc657c4f-c351-4d40-b612-333a5e381970
- processed: 1095
- skipped_blob: 0
- skipped_decode: 0
- device: cuda
- dtype_used: fp16
- image_size: 256
- patch_size: 16
- tokens_total: 261
- img_emb_table: dinov3_image_embeddings
- patch_emb_table: dinov3_patch_embeddings
- config_table: dinov3_config


In [15]:
# Inspect the config table after the embedding run.
# The location and table name come from the notebook constants rather than a
# repeated absolute path.
config_db_path = str(DB_URI)
config_table_name = CONFIG_TBL_NAME

# Connect and open table
db = lancedb.connect(config_db_path)
tbl = db.open_table(config_table_name)

# Load into pandas for easy viewing
df = tbl.to_pandas()

df

Unnamed: 0,key,value
0,author,Cherukuru. N. W
1,source,era5_sample_images
2,source path,data/lancedb/shared_source
3,tbl_img_emb,dinov3_image_embeddings
4,tbl_patch_emb,dinov3_patch_embeddings
5,created_at,2026-02-12T07:09:13Z
6,run_id,bc657c4f-c351-4d40-b612-333a5e381970
7,raw_db_uri,/glade/work/ncheruku/research/bams-ai-data-exp...
8,raw_table,era5_sample_images
9,raw_img_id_field,id


In [16]:
# Look up the value stored under the "tbl_img_emb" key
df.loc[df["key"] == "tbl_img_emb", "value"].iloc[0]

'dinov3_image_embeddings'

In [17]:
import os


def dir_size_bytes(path: Path) -> int:
    """Return the combined size in bytes of all files found under ``path``.

    The tree is traversed with ``os.walk``; each file's size is read from the
    filesystem and summed.
    """
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(path)
        for filename in filenames
    )


# On-disk footprint of the patch-embedding table: the table lives in a
# "<table name>.lance" directory inside the experiment database directory.
table_path = DB_URI / f"{PATCH_EMB_TBL_NAME}.lance"
size_bytes = dir_size_bytes(table_path)

print(f"{size_bytes / 1024**2:.2f} MB")

843.99 MB


In [18]:
# Reconnect to the experiment database and re-open the patch-embedding table.
# NOTE(review): presumably done to pick up what the embedding script wrote — confirm.
db = lancedb.connect(str(DB_URI))
patch_tbl = db.open_table(PATCH_EMB_TBL_NAME)

In [19]:
# Display the schema of the re-opened patch-embedding table
patch_tbl.schema

patch_id: string
image_id: string
patch_index: int32
embedding: fixed_size_list<item: float>[768]
  child 0, item: float

In [20]:
# Fetch a single row from the patch table and display its image_id
row = patch_tbl.search().limit(1).to_pandas().iloc[0]
row["image_id"]

'2ab05a5c845ced5a7fdc93ad28f6bc73'

In [21]:
import torch

# Report the installed PyTorch version and whether CUDA is available
print(f"PyTorch Version: {torch.__version__}")
print(f"Is CUDA available? {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # The CUDA version is only printed when CUDA is reported as available
    print(f"CUDA Version: {torch.version.cuda}")

PyTorch Version: 2.9.1+cu128
Is CUDA available? True
CUDA Version: 12.8


In [22]:
# Build an IVF_PQ index with the cosine metric over the "embedding" column,
# using the CUDA accelerator.
patch_tbl.create_index(
    metric="cosine",
    index_type="IVF_PQ",
    num_partitions=128,
    num_sub_vectors=96,
    accelerator="cuda",
    vector_column_name="embedding",
)

  return torch.from_numpy(nparr)
 68%|██████▊   | 34/50 [00:10<00:04,  3.40it/s]


  0%|          | 0/280320 [00:00<?, ?it/s]

100%|██████████| 50/50 [00:08<00:00,  5.67it/s]
100%|██████████| 50/50 [00:00<00:00, 107.85it/s]
100%|██████████| 50/50 [00:00<00:00, 113.79it/s]
100%|██████████| 50/50 [00:00<00:00, 112.52it/s]
100%|██████████| 50/50 [00:00<00:00, 106.44it/s]
 96%|█████████▌| 48/50 [00:00<00:00, 102.15it/s]
100%|██████████| 50/50 [00:00<00:00, 104.47it/s]
100%|██████████| 50/50 [00:00<00:00, 110.45it/s]
100%|██████████| 50/50 [00:00<00:00, 109.65it/s]
100%|██████████| 50/50 [00:00<00:00, 110.87it/s]
100%|██████████| 50/50 [00:00<00:00, 98.65it/s] 
100%|██████████| 50/50 [00:00<00:00, 105.92it/s]
100%|██████████| 50/50 [00:00<00:00, 93.86it/s] 
 98%|█████████▊| 49/50 [00:00<00:00, 106.90it/s]
 94%|█████████▍| 47/50 [00:00<00:00, 110.90it/s]
100%|██████████| 50/50 [00:00<00:00, 112.51it/s]
100%|██████████| 50/50 [00:00<00:00, 112.02it/s]
100%|██████████| 50/50 [00:00<00:00, 114.22it/s]
100%|██████████| 50/50 [00:00<00:00, 112.06it/s]
100%|██████████| 50/50 [00:00<00:00, 106.47it/s]
100%|██████████| 50/5

  0%|          | 0/280320 [00:00<?, ?it/s]