In [10]:
import warnings
import os
import json
from transformers import AutoTokenizer, AutoModel
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
other_models = [x for x in os.listdir('ChemRxivRetrieval') if not x.startswith('run')]

In [3]:
closed_sources = [
    'bedrock__cohere-embed-english-v3',
    'bedrock__amazon-titan-embed-text-v1',
    'bedrock__amazon-titan-embed-text-v2',
    'bedrock__cohere-embed-multilingual-v3',
    'openai__text-embedding-3-large',
    'openai__text-embedding-ada-002',
    'openai__text-embedding-3-small',
]

In [4]:
def read_model_meta(base_dir, model_name):
    rev = os.path.join(base_dir, model_name, model_name)
    rev = os.listdir(rev)[0]
    json_path = os.path.join(base_dir, model_name, model_name, rev, 'model_meta.json')
    with open(json_path, 'r') as f:
        meta = json.load(f)
    return meta

In [5]:
read_model_meta('ChemRxivRetrieval', 'nomic-ai__nomic-embed-text-v1')

{'name': 'nomic-ai__nomic-embed-text-v1',
 'revision': 'no_revision_available',
 'release_date': None,
 'languages': ['eng-Latn'],
 'n_parameters': None,
 'memory_usage_mb': None,
 'max_tokens': 8192.0,
 'embed_dim': 768,
 'license': 'cc-by-nc-4.0',
 'open_weights': True,
 'public_training_code': None,
 'public_training_data': None,
 'framework': ['Sentence Transformers', 'PyTorch'],
 'reference': None,
 'similarity_fn_name': 'cosine',
 'use_instructions': True,
 'training_datasets': None,
 'adapted_from': None,
 'superseded_by': None,
 'is_cross_encoder': None,
 'modalities': ['text'],
 'loader': 'BiEncoderWrapper'}

Find models without embedding dims present in their model_meta.json

In [6]:
no_embed_dim_models = []
for model in other_models:
    meta = read_model_meta('ChemRxivRetrieval', model)
    if meta['embed_dim'] is None:
        no_embed_dim_models.append(model)
        print(model)

In [7]:
def get_embed_dim(model_name: str) -> int:
    """
    Loads a Hugging Face model and returns its embedding dimension, with GPU support
    and special handling for certain Sentence-Transformers repos.

    1. Replaces '__' with '/' in the model name.
    2. If the base name matches a known Sentence-Transformer, prefix with 'sentence-transformers/'.
    3. Attempts to load the corresponding tokenizer; falls back to 'bert-base-uncased' if unavailable.
    4. Loads the model (using GPU if available).
       - For repos requiring remote code (e.g. nomic-ai), sets trust_remote_code=True.
    5. Tokenizes a dummy string and runs a forward pass on the correct device.
    6. Returns the hidden-state size (embedding dimension) as an integer.
    """
    SENTENCE_TRANSFORMER_MODELS = {
        "all-MiniLM-L6-v2",
        "all-mpnet-base-v2",
        "multi-qa-mpnet-base-dot-v1",
        "all-MiniLM-L12-v2",
    }

    # Normalize repository path
    repo = model_name.replace("__", "/")
    base = repo.split("/")[-1]

    # Handle Sentence-Transformer shortcuts
    if base in SENTENCE_TRANSFORMER_MODELS:
        repo = f"sentence-transformers/{base}"

    # Detect device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load tokenizer with fallback
    try:
        tokenizer = AutoTokenizer.from_pretrained(repo)
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Prepare dummy input
    inputs = tokenizer("This is a dummy string", return_tensors="pt").to(device)

    # Load model, with remote code trust if needed
    trust_remote = repo.startswith("nomic-ai/")
    model = AutoModel.from_pretrained(repo, trust_remote_code=trust_remote).to(device)

    # Forward pass
    outputs = model(**inputs)

    # Extract embedding dimension
    embed_dim = outputs.last_hidden_state.shape[-1]
    return int(embed_dim)

In [8]:
for model in no_embed_dim_models:
    try:
        dim = get_embed_dim(model)
        print(f"Embed dimension for '{model}': {dim}")
        print(30 * '*')
    except Exception as e:
        print(f"Error for '{model}': {e}")
        print(30 * '*')

In [19]:
embed_dim_updated = {
  "answerdotai__ModernBERT-large": 1024,
  "BAAI__bge-large-en-v1.5": 1024,
  "BAAI__bge-base-en-v1.5": 768,
  "all-MiniLM-L6-v2": 384,
  "all-mpnet-base-v2": 768,
  "multi-qa-mpnet-base-dot-v1": 768,
  "nomic-ai__nomic-embed-text-v2-moe": 768,
  "answerdotai__ModernBERT-base": 768,
  "BAAI__bge-small-en": 384,
  "all-MiniLM-L12-v2": 384,
  "nomic-ai__nomic-bert-2048": 768,
  "BAAI__bge-base-en": 768,
  "recobo__chemical-bert-uncased": 768,
  "google-bert__bert-base-uncased": 768,
  "BAAI__bge-large-en": 1024,
  "allenai__scibert_scivocab_uncased": 768,
  "BAAI__bge-small-en-v1.5": 384,
  'm3rg-iitd__matscibert': 768
}

In [20]:
assert len(embed_dim_updated) == len(no_embed_dim_models)

In [12]:
def update_model_meta(base_dir, model_name, key_to_update, new_value):
    """
    Reads the model_meta.json for the given model,
    updates a specific key with new_value,
    and writes the updated JSON back without modifying other entries.
    """
    # Determine the revision directory
    model_root = os.path.join(base_dir, model_name, model_name)
    rev = os.listdir(model_root)[0]
    json_path = os.path.join(model_root, rev, 'model_meta.json')

    # Read existing metadata
    with open(json_path, 'r') as f:
        meta = json.load(f)

    # Update the specified key
    meta[key_to_update] = new_value

    # Write it back
    with open(json_path, 'w') as f:
        json.dump(meta, f, indent=2)

    print(f"Updated '{key_to_update}' to '{new_value}' in {json_path}")

In [22]:
for model in embed_dim_updated:
    update_model_meta('ChemRxivRetrieval', model, 'embed_dim', embed_dim_updated[model])

Updated 'embed_dim' to '1024' in ChemRxivRetrieval/answerdotai__ModernBERT-large/answerdotai__ModernBERT-large/45bb4654a4d5aaff24dd11d4781fa46d39bf8c13/model_meta.json
Updated 'embed_dim' to '1024' in ChemRxivRetrieval/BAAI__bge-large-en-v1.5/BAAI__bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/model_meta.json
Updated 'embed_dim' to '768' in ChemRxivRetrieval/BAAI__bge-base-en-v1.5/BAAI__bge-base-en-v1.5/a5beb1e3e68b9ab74eb54cfd186867f64f240e1a/model_meta.json
Updated 'embed_dim' to '384' in ChemRxivRetrieval/all-MiniLM-L6-v2/all-MiniLM-L6-v2/8b3219a92973c328a8e22fadcfa821b5dc75636a/model_meta.json
Updated 'embed_dim' to '768' in ChemRxivRetrieval/all-mpnet-base-v2/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_meta.json
Updated 'embed_dim' to '768' in ChemRxivRetrieval/multi-qa-mpnet-base-dot-v1/multi-qa-mpnet-base-dot-v1/3af7c6da5b3e1bea796ef6c97fe237538cbe6e7f/model_meta.json
Updated 'embed_dim' to '768' in ChemRxivRetrieval/nomic-ai__nomic-embed-text-v

### Num Parameters

In [13]:
no_n_param_models = []
for model in other_models:
    meta = read_model_meta('ChemRxivRetrieval', model)
    if meta['n_parameters'] is None and model not in closed_sources:
        no_n_param_models.append(model)
        print(model)

m3rg-iitd__matscibert
answerdotai__ModernBERT-large
BAAI__bge-large-en-v1.5
BAAI__bge-base-en-v1.5
all-MiniLM-L6-v2
all-mpnet-base-v2
nomic-ai__nomic-embed-text-v1-unsupervised
multi-qa-mpnet-base-dot-v1
nomic-ai__nomic-embed-text-v2-moe
answerdotai__ModernBERT-base
BAAI__bge-small-en
nomic-ai__nomic-embed-text-v1
all-MiniLM-L12-v2
nomic-ai__nomic-bert-2048
BAAI__bge-base-en
recobo__chemical-bert-uncased
google-bert__bert-base-uncased
BAAI__bge-large-en
allenai__scibert_scivocab_uncased
BAAI__bge-small-en-v1.5


In [14]:
def get_num_parameters(model_name: str) -> int:
    """
    Loads a Hugging Face model and returns its total number of parameters.
    
    Steps:
    1. Replace '__' with '/' in the model identifier.
    2. Prefix known Sentence-Transformer models with 'sentence-transformers/'.
    3. Detect and use GPU if available.
    4. Load the model (trusting remote code for nomic-ai repos).
    5. Sum up all parameters.
    
    Args:
        model_name (str): The HF repo name, e.g. 'BAAI__bge-large-en'.
    
    Returns:
        int: Total number of parameters (in millions if `in_millions=True`, else exact).
    """
    SENTENCE_TRANSFORMER_MODELS = {
        "all-MiniLM-L6-v2",
        "all-mpnet-base-v2",
        "multi-qa-mpnet-base-dot-v1",
        "all-MiniLM-L12-v2",
    }
    # Normalize repo path
    repo = model_name.replace("__", "/")
    base = repo.split("/")[-1]
    if base in SENTENCE_TRANSFORMER_MODELS:
        repo = f"sentence-transformers/{base}"

    # Choose device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model (no tokenizer needed here)
    trust_remote = repo.startswith("nomic-ai/")
    model = AutoModel.from_pretrained(repo, trust_remote_code=trust_remote).to(device)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    return total_params

In [15]:
for model in no_n_param_models:
    try:
        dim = get_num_parameters(model)
        print(f"Num parameters for '{model}': {dim}")
        print(30 * '*')
    except Exception as e:
        print(f"Error for '{model}': {e}")
        print(30 * '*')

Some weights of BertModel were not initialized from the model checkpoint at m3rg-iitd/matscibert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num parameters for 'm3rg-iitd__matscibert': 109918464
******************************


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Num parameters for 'answerdotai__ModernBERT-large': 394781696
******************************
Num parameters for 'BAAI__bge-large-en-v1.5': 335141888
******************************
Num parameters for 'BAAI__bge-base-en-v1.5': 109482240
******************************
Num parameters for 'all-MiniLM-L6-v2': 22713216
******************************
Num parameters for 'all-mpnet-base-v2': 109486464
******************************


<All keys matched successfully>


Num parameters for 'nomic-ai__nomic-embed-text-v1-unsupervised': 136731648
******************************
Num parameters for 'multi-qa-mpnet-base-dot-v1': 109486464
******************************
Num parameters for 'nomic-ai__nomic-embed-text-v2-moe': 475292928
******************************
Num parameters for 'answerdotai__ModernBERT-base': 149014272
******************************
Num parameters for 'BAAI__bge-small-en': 33360000
******************************


<All keys matched successfully>


Num parameters for 'nomic-ai__nomic-embed-text-v1': 136731648
******************************
Num parameters for 'all-MiniLM-L12-v2': 33360000
******************************


<All keys matched successfully>


Num parameters for 'nomic-ai__nomic-bert-2048': 136731648
******************************


Some weights of BertModel were not initialized from the model checkpoint at recobo/chemical-bert-uncased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num parameters for 'BAAI__bge-base-en': 109482240
******************************
Num parameters for 'recobo__chemical-bert-uncased': 109918464
******************************
Num parameters for 'google-bert__bert-base-uncased': 109482240
******************************
Num parameters for 'BAAI__bge-large-en': 335141888
******************************
Num parameters for 'allenai__scibert_scivocab_uncased': 109918464
******************************
Num parameters for 'BAAI__bge-small-en-v1.5': 33360000
******************************


In [16]:
len(no_n_param_models)

20

In [19]:
n_params_updated = {
    "m3rg-iitd__matscibert": 109918464,
    "answerdotai__ModernBERT-large": 394781696,
    "BAAI__bge-large-en-v1.5": 335141888,
    "BAAI__bge-base-en-v1.5": 109482240,
    "all-MiniLM-L6-v2": 22713216,
    "all-mpnet-base-v2": 109486464,
    "nomic-ai__nomic-embed-text-v1-unsupervised": 136731648,
    "multi-qa-mpnet-base-dot-v1": 109486464,
    "nomic-ai__nomic-embed-text-v2-moe": 475292928,
    "answerdotai__ModernBERT-base": 149014272,
    "BAAI__bge-small-en": 33360000,
    "nomic-ai__nomic-embed-text-v1": 136731648,
    "all-MiniLM-L12-v2": 33360000,
    "nomic-ai__nomic-bert-2048": 136731648,
    "BAAI__bge-base-en": 109482240,
    "recobo__chemical-bert-uncased": 109918464,
    "google-bert__bert-base-uncased": 109482240,
    "BAAI__bge-large-en": 335141888,
    "allenai__scibert_scivocab_uncased": 109918464,
    "BAAI__bge-small-en-v1.5": 33360000
}

In [18]:
assert len(n_params_updated) == len(no_n_param_models)

In [21]:
for model in n_params_updated:
    update_model_meta('ChemRxivRetrieval', model, 'n_parameters', n_params_updated[model])

Updated 'n_parameters' to '109918464' in ChemRxivRetrieval/m3rg-iitd__matscibert/m3rg-iitd__matscibert/ced9d8f5f208712c4a90f98a246fe32155b29995/model_meta.json
Updated 'n_parameters' to '394781696' in ChemRxivRetrieval/answerdotai__ModernBERT-large/answerdotai__ModernBERT-large/45bb4654a4d5aaff24dd11d4781fa46d39bf8c13/model_meta.json
Updated 'n_parameters' to '335141888' in ChemRxivRetrieval/BAAI__bge-large-en-v1.5/BAAI__bge-large-en-v1.5/d4aa6901d3a41ba39fb536a557fa166f842b0e09/model_meta.json
Updated 'n_parameters' to '109482240' in ChemRxivRetrieval/BAAI__bge-base-en-v1.5/BAAI__bge-base-en-v1.5/a5beb1e3e68b9ab74eb54cfd186867f64f240e1a/model_meta.json
Updated 'n_parameters' to '22713216' in ChemRxivRetrieval/all-MiniLM-L6-v2/all-MiniLM-L6-v2/8b3219a92973c328a8e22fadcfa821b5dc75636a/model_meta.json
Updated 'n_parameters' to '109486464' in ChemRxivRetrieval/all-mpnet-base-v2/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_meta.json
Updated 'n_parameters' to '136731648'