In [1]:
import json
from datetime import datetime, timezone, UTC

import lancedb
import pyarrow as pa
from pathlib import Path

In [2]:
# def fixed_vec_type(dim: int, dtype):
#     # Compatible with older pyarrow versions
#     return pa.list_(dtype, list_size=dim)

In [3]:
def upsert_config_metadata(cfg_tbl, kv: dict):
    """Insert or update key/value rows in the config table.

    Keys and values are stored as strings. A row whose ``key`` appears in
    ``kv`` receives the new value; keys that are not present yet are appended
    in the iteration order of ``kv``. Existing rows keep their order.

    The table is rewritten with one ``add(..., mode="overwrite")`` call rather
    than a ``delete`` followed by an ``add``, so a failure while writing cannot
    leave the table emptied. Only the ``key`` and ``value`` columns are
    written; the table is assumed to consist of exactly those two columns.

    Args:
        cfg_tbl: Table object exposing ``to_pandas()`` (a frame with ``key``
            and ``value`` columns) and ``add(data, mode=...)``.
        kv: Mapping of config keys to values; both are passed through ``str``.

    Returns:
        None. An empty ``kv`` is a no-op and does not touch the table.
    """
    if not kv:
        return

    # Normalise once; if two keys collapse to the same string the last one wins.
    updates = {str(k): str(v) for k, v in kv.items()}

    existing = cfg_tbl.to_pandas()

    # Existing rows first (order preserved), with updated values where given.
    rows = [
        {"key": key, "value": updates.get(key, value)}
        for key, value in zip(existing["key"], existing["value"])
    ]

    # Then the keys that were not in the table yet.
    known = set(existing["key"])
    rows.extend(
        {"key": key, "value": value}
        for key, value in updates.items()
        if key not in known
    )

    # Small table: replacing it wholesale in a single write is cheap.
    cfg_tbl.add(rows, mode="overwrite")

In [4]:
def open_config_table(db, name):
    """Open the existing config table ``name`` on the connection ``db``.

    No existence check is done here: whatever ``db.open_table`` raises for a
    missing table propagates to the caller.
    """
    config_tbl = db.open_table(name)
    return config_tbl

In [5]:
def get_table_names(db):
    """Return the table names known to the connection ``db`` as a plain list.

    ``db.list_tables()`` may return either a plain iterable of names or a
    response object that carries the names in a ``tables`` attribute. In the
    second case the object itself must not be iterated.
    NOTE(review): ``ListTablesResponse`` appears to be a pydantic model, whose
    iteration yields ``(field, value)`` pairs rather than names - confirm
    against the installed lancedb version.
    """
    listing = db.list_tables()

    # Prefer the explicit ``tables`` field when the response object has one.
    names = getattr(listing, "tables", None)
    if names is None:
        names = listing
    return list(names)

In [6]:
def ensure_tables(
    db_uri: str | Path,
    image_table: str,
    patch_table: str,
    dim: int,
    vec_dtype: str,
    overwrite: bool = True,
):
    """Connect to a LanceDB database and create the two embedding tables.

    Both tables are created from id-only schemas; no vector column is declared
    here.

    Args:
        db_uri: Location of the LanceDB database, passed to ``lancedb.connect``.
        image_table: Name of the per-image table (one row per image).
        patch_table: Name of the per-patch table (one row per patch).
        dim: Embedding dimension. Currently unused by this function.
        vec_dtype: Vector dtype name. Currently unused by this function.
        overwrite: When True (the default, matching the previous behaviour)
            both tables are re-created with ``mode="overwrite"``, which
            DISCARDS any rows they already hold. When False the tables are
            created only if missing and otherwise opened (``exist_ok=True``).
            NOTE(review): with ``exist_ok=True`` LanceDB may validate the
            existing table against the schema given here - confirm this is
            acceptable if another process has added columns.

    Returns:
        Tuple ``(db, img_tbl, pat_tbl)``: the connection and both table handles.
    """
    db = lancedb.connect(db_uri)

    # --- Image embeddings schema (1 row per image) ---
    image_schema = pa.schema([
        pa.field("image_id", pa.string()),     # same as source image id
    ])

    # --- Patch embeddings schema (1 row per patch) ---
    patch_schema = pa.schema([
        pa.field("patch_id", pa.string()),     # f"{image_id}:{patch_index}"
        pa.field("image_id", pa.string()),     # source image id
        pa.field("patch_index", pa.int32()),
    ])

    # Either replace unconditionally, or create-if-missing / open-if-present.
    create_kwargs = {"mode": "overwrite"} if overwrite else {"exist_ok": True}

    img_tbl = db.create_table(image_table, schema=image_schema, **create_kwargs)
    pat_tbl = db.create_table(patch_table, schema=patch_schema, **create_kwargs)

    return db, img_tbl, pat_tbl

In [7]:
# Repository root: two levels above the directory the notebook runs from.
PROJECT_ROOT = Path.cwd().parent.parent

# Existing database directory holding the source image table.
SOURCE_URI = PROJECT_ROOT / "data" / "lancedb" / "shared_source"
IMG_RAW_TBL_NAME = "era5_sample_images"

# Experiment identity; PROJECT_NAME prefixes every table created below.
PROJECT_NAME = "dinov3"
AUTHOR = "Cherukuru. N. W"
DB_URI = PROJECT_ROOT / "data" / "lancedb" / "experiments" / "era5"
CONFIG_TBL_NAME = PROJECT_NAME + "_config"
IMG_EMB_TBL_NAME = PROJECT_NAME + "_image_embeddings"
PATCH_EMB_TBL_NAME = PROJECT_NAME + "_patch_embeddings"

# NOTE(review): 768 is the ViT-Base embedding width - confirm it matches MODEL.
EMB_DIM = 768

VEC_DTYPE = "float16"  # or "float32"

# Same model name that is passed to the helper script via --model. The former
# value named a DINOv2 ViT-Large checkpoint, which matched neither
# PROJECT_NAME nor the helper invocation.
MODEL = "vit_base_patch16_dinov3"




In [8]:
# Show the derived config-table name (PROJECT_NAME + "_config").
CONFIG_TBL_NAME

'dinov3_config'

In [9]:
# Create both embedding tables and keep the connection for the config table.
db, img_tbl, patch_tbl = ensure_tables(
    DB_URI,
    image_table=IMG_EMB_TBL_NAME,
    patch_table=PATCH_EMB_TBL_NAME,
    dim=EMB_DIM,
    vec_dtype=VEC_DTYPE,
)

# Global run metadata; every entry becomes one (key, value) string row.
created_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
source_rel_path = str(SOURCE_URI.relative_to(PROJECT_ROOT))
metadata = {
    "created_at": created_at,
    "author": AUTHOR,
    "source": IMG_RAW_TBL_NAME,
    "source path": source_rel_path,
    "tbl_img_emb": IMG_EMB_TBL_NAME,
    "tbl_patch_emb": PATCH_EMB_TBL_NAME,
}
config_data = [{"key": key, "value": value} for key, value in metadata.items()]

# The list of dicts is passed as-is, so LanceDB infers the simple
# (key: str, value: str) schema on its own.
config_table = db.create_table(CONFIG_TBL_NAME, data=config_data, mode="overwrite")
print(f"Table {CONFIG_TBL_NAME} created with global metadata.")

Table dinov3_config created with global metadata.


[90m[[0m2026-01-14T01:39:31Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5/dinov3_image_embeddings.lance, it will be created
[90m[[0m2026-01-14T01:39:31Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5/dinov3_patch_embeddings.lance, it will be created


In [10]:

# Confirm the three table handles created above by printing their names.
print("Tables ready:", img_tbl.name, patch_tbl.name, config_table.name)

Tables ready: dinov3_image_embeddings dinov3_patch_embeddings dinov3_config


In [11]:
# Inspect the image table schema (only `image_id` is declared in ensure_tables).
img_tbl.schema


image_id: string

In [12]:
# Inspect the patch table schema (patch_id, image_id, patch_index as declared in ensure_tables).
patch_tbl.schema


patch_id: string
image_id: string
patch_index: int32

In [13]:
# NOTE: only a cell's last expression is displayed, so this bare `.schema`
# lookup produces no output in this cell.
config_table.schema
# Convert to Pandas DataFrame for a table print
df_config = config_table.to_pandas()
print(df_config)

             key                       value
0     created_at        2026-01-14T01:39:31Z
1         author             Cherukuru. N. W
2         source          era5_sample_images
3    source path  data/lancedb/shared_source
4    tbl_img_emb     dinov3_image_embeddings
5  tbl_patch_emb     dinov3_patch_embeddings


In [20]:
import sys

!{sys.executable} ./helpers/v3_dino_embeddings_lancedb.py \
  --db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/shared_source \
  --table era5_sample_images \
  --img_id_field id \
  --out_prefix {PROJECT_NAME} \
  --config_db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5 \
  --config_table dinov3_config \
  --model vit_base_patch16_dinov3 \
  --batch 512 \
  --scan_batch 2000 \
  --workers 16


Traceback (most recent call last):
  File [35m"/glade/work/ncheruku/research/bams-ai-data-exploration/notebooks/02-generate-embeddings/./helpers/v3_dino_embeddings_lancedb.py"[0m, line [35m444[0m, in [35m<module>[0m
    [31mmain[0m[1;31m()[0m
    [31m~~~~[0m[1;31m^^[0m
  File [35m"/glade/work/ncheruku/research/bams-ai-data-exploration/notebooks/02-generate-embeddings/./helpers/v3_dino_embeddings_lancedb.py"[0m, line [35m175[0m, in [35mmain[0m
    dummy = torch.zeros(1, 3, target_size, target_size, device=[1;31mdevice[0m, dtype=model_dtype)
                                                               [1;31m^^^^^^[0m
[1;35mUnboundLocalError[0m: [35mcannot access local variable 'device' where it is not associated with a value[0m


In [15]:
# Path where your config DB lives
# Location of the config DB and the name of its config table, taken from the
# notebook's config constants rather than repeated as literals.
config_db_path = str(DB_URI)
config_table_name = CONFIG_TBL_NAME

# Connect and open table
db = lancedb.connect(config_db_path)
tbl = db.open_table(config_table_name)

# Load into pandas for easy viewing
df = tbl.to_pandas()

df

Unnamed: 0,key,value
0,created_at,2026-01-14T01:39:31Z
1,author,Cherukuru. N. W
2,source,era5_sample_images
3,source path,data/lancedb/shared_source
4,tbl_img_emb,dinov3_image_embeddings
5,tbl_patch_emb,dinov3_patch_embeddings


In [16]:
# Value stored under the "tbl_img_emb" key (first matching row).
df.loc[df["key"] == "tbl_img_emb", "value"].iloc[0]

'dinov3_image_embeddings'

In [17]:
import os
def dir_size_bytes(path: Path) -> int:
    """Return the summed size in bytes of every file found under ``path``.

    The tree is traversed with ``os.walk``; each file's size comes from
    ``stat().st_size``. A tree that yields no files gives 0.
    """
    return sum(
        (Path(dirpath) / filename).stat().st_size
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )

# On-disk size of the patch-embeddings table; the table lives in a
# "<table name>.lance" directory under the database directory.
# NOTE: dir_size_bytes walks with os.walk, which yields nothing for a missing
# directory, so "0.00 MB" can also mean the table directory does not exist.
table_path = DB_URI / f"{PATCH_EMB_TBL_NAME}.lance"
size_bytes = dir_size_bytes(table_path)

print(f"{size_bytes / 1024**2:.2f} MB")

0.00 MB


In [18]:
# Reconnect to the experiment DB and reopen the patch-embeddings table.
# NOTE(review): in the recorded run this raised
# "Table 'dinov3_patch_embeddings' was not found"; the cells below rely on
# `patch_tbl` and will not work until the table exists again.
db = lancedb.connect(str(DB_URI))
patch_tbl = db.open_table(PATCH_EMB_TBL_NAME)

ValueError: Table 'dinov3_patch_embeddings' was not found

In [None]:
# Inspect the schema of the reopened patch table.
patch_tbl.schema

In [None]:
# Fetch a single row from the patch table and show its image_id.
# NOTE: `.iloc[0]` raises IndexError when the table holds no rows.
row = patch_tbl.search().limit(1).to_pandas().iloc[0]
row["image_id"]

In [None]:
# Report the PyTorch build and whether CUDA is usable; the index cell below
# requests accelerator="cuda".
import torch

print(f"PyTorch Version: {torch.__version__}")
print(f"Is CUDA available? {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")

In [None]:
# Build an IVF_PQ vector index (cosine metric) on the patch table.
# NOTE(review): the patch schema declared in ensure_tables has no "embedding"
# column; presumably the helper script adds it - confirm before indexing.
# NOTE(review): 96 sub-vectors divides EMB_DIM = 768 evenly (8 dims each),
# assuming the stored vectors really are EMB_DIM wide - confirm.
# NOTE(review): accelerator="cuda" is hard-coded; presumably this fails on a
# node without CUDA (see the availability check in the previous cell).
patch_tbl.create_index(
    metric="cosine",
    index_type="IVF_PQ",
    num_partitions=256,
    num_sub_vectors=96,
    accelerator="cuda",  # GPU acceleration requires PyTorch > 2.0
    vector_column_name="embedding"
)