In [1]:
import json
from datetime import datetime, timezone, UTC

import lancedb
import pyarrow as pa
from pathlib import Path

In [2]:
def fixed_vec_type(dim: int, dtype):
    """Build the Arrow type used for one embedding-vector column.

    Args:
        dim: Forwarded to ``pa.list_`` as ``list_size``.
        dtype: Arrow type of each vector element (e.g. ``pa.float16()``).

    Returns:
        The result of ``pa.list_(dtype, list_size=dim)``.
    """
    # The original author notes that this spelling of the fixed-size list
    # type is compatible with older pyarrow versions.
    vector_type = pa.list_(dtype, list_size=dim)
    return vector_type

In [3]:
def upsert_config_metadata(cfg_tbl, kv: dict):
    """Insert or update key/value rows in the config table.

    Every key and value is converted with ``str()`` before it is stored.
    Rows whose ``key`` already exists get their ``value`` replaced; all other
    pairs are appended. Rows for keys not mentioned in ``kv`` are untouched.

    Args:
        cfg_tbl: LanceDB table with string columns ``key`` and ``value``.
        kv: Settings to store. An empty mapping is a no-op.

    Returns:
        None.
    """
    # Stringify first. Building a dict collapses keys that become equal after
    # str() (e.g. 1 and "1"); the last one wins, as in a sequential update.
    settings = {str(k): str(v) for k, v in kv.items()}
    if not settings:
        return  # nothing to write; leave the table exactly as it is

    new_rows = [{"key": key, "value": value} for key, value in settings.items()]

    # One merge keyed on "key" replaces the previous "delete every row, then
    # add them all back" rewrite, so there is no window in which the table
    # has been emptied but not yet refilled.
    (
        cfg_tbl.merge_insert("key")
        .when_matched_update_all()
        .when_not_matched_insert_all()
        .execute(new_rows)
    )

In [4]:
def open_config_table(db, name):
    """Open the config table called ``name`` on the connection ``db``.

    No existence check is made here: if the table is missing, whatever
    ``db.open_table`` raises reaches the caller unchanged.
    """
    config_tbl = db.open_table(name)
    return config_tbl

In [5]:
def ensure_tables(
    db_uri: str,
    image_table: str,
    patch_table: str,
    dim: int,
    vec_dtype: str,
    overwrite: bool = True,
):
    """Connect to a LanceDB database and create the two embedding tables.

    Args:
        db_uri: Database location, forwarded unchanged to ``lancedb.connect``.
        image_table: Name of the table holding one row per image.
        patch_table: Name of the table holding one row per patch. Must
            differ from ``image_table``.
        dim: Length of every embedding vector; must be positive.
        vec_dtype: ``"fp16"``/``"float16"`` or ``"fp32"``/``"float32"``
            (case-insensitive).
        overwrite: When True (the default, matching the previous behaviour)
            a table that already exists under either name is replaced by a
            new empty one, discarding its rows. When False an existing table
            is opened and kept (``exist_ok=True``).

    Returns:
        Tuple ``(db, img_tbl, pat_tbl)``: the connection and the two tables.

    Raises:
        ValueError: If the two table names are equal, ``dim`` is not
            positive, or ``vec_dtype`` is not one of the accepted names.
    """
    # Validate everything before touching the database so that a bad
    # argument can never cause an existing table to be replaced.
    if image_table == patch_table:
        raise ValueError(
            f"image_table and patch_table must differ, both are {image_table!r}"
        )
    if dim <= 0:
        # pyarrow uses list_size=-1 to mean "variable-length list", so a
        # non-positive dim would not produce a fixed-size vector column.
        raise ValueError(f"dim must be a positive integer, got {dim!r}")

    dtype_name = vec_dtype.lower()
    if dtype_name in ("fp16", "float16"):
        use_half = True
    elif dtype_name in ("fp32", "float32"):
        use_half = False
    else:
        # Previously any unrecognised name silently fell back to float32.
        raise ValueError(
            f"vec_dtype must be 'fp16'/'float16' or 'fp32'/'float32', got {vec_dtype!r}"
        )

    db = lancedb.connect(db_uri)

    dtype = pa.float16() if use_half else pa.float32()
    vec = fixed_vec_type(dim, dtype=dtype)

    # --- Image embeddings schema (1 row per image) ---
    image_schema = pa.schema([
        pa.field("image_id", pa.string()),
        pa.field("vector", vec),
        pa.field("pooling", pa.string()),           # "cls"
        pa.field("model", pa.string()),             # timm model name
    ])

    # --- Patch embeddings schema (1 row per patch) ---
    patch_schema = pa.schema([
        pa.field("patch_id", pa.string()),          # f"{image_id}:{patch_index}"
        pa.field("image_id", pa.string()),          # source image id
        pa.field("patch_index", pa.int32()),
        pa.field("vector", vec),
        pa.field("grid_w", pa.int16()),
        pa.field("grid_h", pa.int16()),
        pa.field("model", pa.string()),
    ])

    def _create(name, schema):
        """Create table ``name``; replace or reuse an existing one per ``overwrite``."""
        if overwrite:
            # One call instead of "list tables, drop if present, create".
            return db.create_table(name, schema=schema, mode="overwrite")
        return db.create_table(name, schema=schema, exist_ok=True)

    img_tbl = _create(image_table, image_schema)
    pat_tbl = _create(patch_table, patch_schema)

    return db, img_tbl, pat_tbl

In [6]:
# --- Filesystem locations ---------------------------------------------------
# Project root: two directory levels above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent

# Point to your existing database directory (where this experiment's tables live).
DB_URI = PROJECT_ROOT.joinpath("data", "lancedb", "experiments", "era5")

# Database directory recorded in the config table as the "source path".
SOURCE_URI = PROJECT_ROOT.joinpath("data", "lancedb", "shared_source")

# --- Table names ------------------------------------------------------------
CONFIG_TBL_NAME = "_config_dinov3"
IMG_RAW_TBL_NAME = "era5_sample_images"
IMG_EMB_TBL_NAME = "image_embeddings_dinov3"
PATCH_EMB_TBL_NAME = "patch_embeddings_dinov3"

# --- Embedding settings -----------------------------------------------------
# NOTE(review): ViT-Large backbones normally emit 1024-d features; confirm
# that 768 is the intended dimension for MODEL below.
EMB_DIM = 768

VEC_DTYPE = "float16"  # or "float32"

# NOTE(review): the table names above say "dinov3" while this names a dinov2
# checkpoint - confirm which one is intended.
MODEL = "vit_large_patch14_dinov2.lvd142m"




In [7]:
# Connect to the experiment database and create the image- and
# patch-embedding tables.
db, img_tbl, patch_tbl = ensure_tables(
    DB_URI,
    image_table=IMG_EMB_TBL_NAME,
    patch_table=PATCH_EMB_TBL_NAME,
    dim=EMB_DIM,
    vec_dtype=VEC_DTYPE,
)

# Global metadata stored as string key/value rows.
config_data = [
    {"key": "created_at", "value": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")},
    {"key": "author", "value": "cherukuru"},
    {"key": "project", "value": "era5 data"},
    {"key": "image_width", "value": "224"},
    {"key": "image_height", "value": "224"},
    {"key": "thumb_width", "value": "64"},
    # NOTE(review): probably meant "thumb_height" (pairs with "thumb_width");
    # left unchanged because other readers of this table may rely on the key.
    {"key": "height", "value": "64"},
    {"key": "source", "value": IMG_RAW_TBL_NAME},
    # NOTE(review): the only key containing a space; confirm before renaming.
    {"key": "source path", "value": str(SOURCE_URI.relative_to(PROJECT_ROOT))},
    {"key": "tbl_img_emb", "value": IMG_EMB_TBL_NAME},
    {"key": "tbl_patch_emb", "value": PATCH_EMB_TBL_NAME},
]

# Create the config table.
# mode="overwrite" replaces any table left over from a previous run in a
# single call, so re-running this cell never keeps stale config rows and no
# separate "does it exist? then drop it" step is needed.
# Note: We let LanceDB infer the simple schema (key: str, value: str)
# automatically by passing the list of dicts directly.
config_table = db.create_table(CONFIG_TBL_NAME, data=config_data, mode="overwrite")
print(f"Table {CONFIG_TBL_NAME} created with global metadata.")

Table _config_dinov3 created with global metadata.


In [8]:

# Confirm that all three table handles exist by echoing their names.
print(f"Tables ready: {img_tbl.name} {patch_tbl.name} {config_table.name}")

Tables ready: image_embeddings_dinov3 patch_embeddings_dinov3 _config_dinov3


In [9]:
# Display the schema of the image-embedding table (rendered because it is the
# last expression in the cell).
img_tbl.schema


image_id: string
vector: fixed_size_list<item: halffloat>[768]
  child 0, item: halffloat
pooling: string
model: string

In [10]:
# Display the schema of the patch-embedding table (rendered because it is the
# last expression in the cell).
patch_tbl.schema


patch_id: string
image_id: string
patch_index: int32
vector: fixed_size_list<item: halffloat>[768]
  child 0, item: halffloat
grid_w: int16
grid_h: int16
model: string

In [11]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the schema has to be printed explicitly to show up in the output.
print(config_table.schema)

# Convert to a pandas DataFrame for a tabular print-out of the stored settings.
df_config = config_table.to_pandas()
print(df_config)

              key                       value
0      created_at        2026-01-01T00:11:32Z
1          author                   cherukuru
2         project                   era5 data
3     image_width                         224
4    image_height                         224
5     thumb_width                          64
6          height                          64
7          source          era5_sample_images
8     source path  data/lancedb/shared_source
9     tbl_img_emb     image_embeddings_dinov3
10  tbl_patch_emb     patch_embeddings_dinov3
