In [1]:
from datetime import UTC, datetime
from pathlib import Path

import lancedb
import pyarrow as pa

In [2]:
# def fixed_vec_type(dim: int, dtype):
#     # Compatible with older pyarrow versions
#     return pa.list_(dtype, list_size=dim)

In [3]:
def upsert_config_metadata(cfg_tbl, kv: dict):
    # Load existing config into memory
    df = cfg_tbl.to_pandas()

    # Update / insert keys
    for k, v in kv.items():
        k, v = str(k), str(v)
        if k in df["key"].values:
            df.loc[df["key"] == k, "value"] = v
        else:
            df.loc[len(df)] = [k, v]

    # Rewrite table (small table â†’ safe & fast)
    new_tbl = pa.Table.from_pandas(df, preserve_index=False)
    cfg_tbl.delete("true")  # delete all rows
    cfg_tbl.add(new_tbl)

In [4]:
def open_config_table(db, name):
    # Will throw e if missing
    return db.open_table(name)

In [5]:
def get_table_names(db):
    t = db.list_tables()
    return list(t)  # works for ListTablesResponse and normal lists

In [6]:
def ensure_tables(
    db_uri: str,
    image_table: str,
    patch_table: str,
    dim: int,
    vec_dtype: str,
):
    db = lancedb.connect(db_uri)

    # dtype = pa.float16() if vec_dtype.lower() in ("fp16", "float16") else pa.float32()
    # vec = pa.list_(dtype)

    # --- Image embeddings schema (1 row per image) ---
    image_schema = pa.schema(
        [
            pa.field("image_id", pa.string()),  # same as source image id
        ]
    )

    # --- Patch embeddings schema (1 row per patch) ---
    patch_schema = pa.schema(
        [
            pa.field("patch_id", pa.string()),  # f"{image_id}:{patch_index}"
            pa.field("image_id", pa.string()),  # source image id
            pa.field("patch_index", pa.int32()),
        ]
    )

    # Create if missing
    # tables = get_table_names(db)

    img_tbl = db.create_table(image_table, schema=image_schema, mode="overwrite")

    pat_tbl = db.create_table(patch_table, schema=patch_schema, mode="overwrite")

    return db, img_tbl, pat_tbl

In [7]:
PROJECT_ROOT = Path.cwd().parent.parent

# Point to your existing database directory

SOURCE_URI = PROJECT_ROOT / "data" / "lancedb" / "shared_source"
IMG_RAW_TBL_NAME = "CLEVR_val_cropped"

PROJECT_NAME = "clevr_dinov3"
AUTHOR = "Cherukuru. N. W"
DB_URI = PROJECT_ROOT / "data" / "lancedb" / "experiments" / "era5"
CONFIG_TBL_NAME = PROJECT_NAME + "_config"
IMG_EMB_TBL_NAME = PROJECT_NAME + "_image_embeddings"
PATCH_EMB_TBL_NAME = PROJECT_NAME + "_patch_embeddings"

EMB_DIM = 768

VEC_DTYPE = "float16"  # or "float32"

MODEL = "vit_large_patch14_dinov2.lvd142m"

In [8]:
CONFIG_TBL_NAME

'clevr_dinov3_config'

In [9]:
db, img_tbl, patch_tbl = ensure_tables(
    DB_URI,
    image_table=IMG_EMB_TBL_NAME,
    patch_table=PATCH_EMB_TBL_NAME,
    dim=EMB_DIM,
    vec_dtype=VEC_DTYPE,
)


config_data = [
    {"key": "created_at", "value": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")},
    {"key": "author", "value": AUTHOR},
    {"key": "source", "value": IMG_RAW_TBL_NAME},
    {"key": "source path", "value": str(SOURCE_URI.relative_to(PROJECT_ROOT))},
    {"key": "tbl_img_emb", "value": IMG_EMB_TBL_NAME},
    {"key": "tbl_patch_emb", "value": PATCH_EMB_TBL_NAME},
]


# Note: We let LanceDB infer the simple schema (key: str, value: str) automatically
# by passing the list of dicts directly.
config_table = db.create_table(CONFIG_TBL_NAME, data=config_data, mode="overwrite")
print(f"Table {CONFIG_TBL_NAME} created with global metadata.")

Table clevr_dinov3_config created with global metadata.


In [10]:
print("Tables ready:", img_tbl.name, patch_tbl.name, config_table.name)

Tables ready: clevr_dinov3_image_embeddings clevr_dinov3_patch_embeddings clevr_dinov3_config


In [11]:
img_tbl.schema

image_id: string

In [12]:
patch_tbl.schema

patch_id: string
image_id: string
patch_index: int32

In [13]:
config_table.schema
# Convert to Pandas DataFrame for a table print
df_config = config_table.to_pandas()
print(df_config)

             key                          value
0     created_at           2026-02-12T05:08:28Z
1         author                Cherukuru. N. W
2         source              CLEVR_val_cropped
3    source path     data/lancedb/shared_source
4    tbl_img_emb  clevr_dinov3_image_embeddings
5  tbl_patch_emb  clevr_dinov3_patch_embeddings


In [14]:
!{sys.executable} ./helpers/v3_dino_embeddings_lancedb.py \
  --db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/shared_source \
  --table CLEVR_val_cropped \
  --img_id_field id \
  --out_prefix {PROJECT_NAME} \
  --config_db /glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/clevr \
  --config_table clevr_dinov3_config \
  --model vit_base_patch16_dinov3 \
  --batch 512 \
  --scan_batch 2000 \
  --workers 16



Processed=1628  Skipped(missing_blob)=0  Skipped(decode_fail)=0

Done.
- run_id: d9f3d571-d167-406e-8007-70e42cb678d9
- processed: 1628
- skipped_blob: 0
- skipped_decode: 0
- device: cuda
- dtype_used: fp16
- image_size: 256
- patch_size: 16
- tokens_total: 261
- img_emb_table: clevr_dinov3_image_embeddings
- patch_emb_table: clevr_dinov3_patch_embeddings
- config_table: clevr_dinov3_config


In [15]:
# Path where your config DB lives
config_db_path = "/glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/clevr"

# Name of your config table
config_table_name = "clevr_dinov3_config"

# Connect and open table
db = lancedb.connect(config_db_path)
tbl = db.open_table(config_table_name)

# Load into pandas for easy viewing
df = tbl.to_pandas()

df

Unnamed: 0,key,value
0,created_at,2026-02-12T05:09:38Z
1,run_id,d9f3d571-d167-406e-8007-70e42cb678d9
2,raw_db_uri,/glade/work/ncheruku/research/bams-ai-data-exp...
3,raw_table,CLEVR_val_cropped
4,raw_img_id_field,id
5,raw_img_blob_field,image_blob
6,model_name,vit_base_patch16_dinov3
7,image_size_used,256
8,patch_size,16
9,embedding_dim,768


In [16]:
df[df.key == "img_emb_table_current"].value.iloc[0]

'clevr_dinov3_image_embeddings'

In [17]:
import os


def dir_size_bytes(path: Path) -> int:
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            total += (Path(root) / f).stat().st_size
    return total


# table_path = db_dir / "era5_sample_images.lance"


size_bytes = dir_size_bytes("/glade/work/ncheruku/research/bams-ai-data-exploration/data/lancedb/experiments/era5/dinov3_patch_embeddings.lance")

# size_bytes = dir_size_bytes(table_path)


print(f"{size_bytes / 1024**2:.2f} MB")

872.92 MB


In [18]:
db = lancedb.connect(str(DB_URI))
patch_tbl = db.open_table(PATCH_EMB_TBL_NAME)

In [19]:
patch_tbl.schema

patch_id: string
image_id: string
patch_index: int32

In [21]:
row = patch_tbl.search().limit(1).to_pandas().iloc[0]
row["image_id"]

# # Check the total number of rows in the table
# print(len(patch_tbl))

# # OR check the dataframe directly
# df = patch_tbl.search().limit(1).to_pandas()
# print(df.empty)  # This will likely print 'True'


IndexError: single positional indexer is out-of-bounds

In [29]:
import torch

print(f"PyTorch Version: {torch.__version__}")
print(f"Is CUDA available? {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")

PyTorch Version: 2.9.1+cu128
Is CUDA available? True
CUDA Version: 12.8


In [None]:
patch_tbl.create_index(metric="cosine", index_type="IVF_PQ", num_partitions=128, num_sub_vectors=96, accelerator="cuda", vector_column_name="embedding")