In [1]:
import torch
import torch.nn.functional as F

def load_embeddings(path):
    """Read a saved embeddings file and return its two main entries.

    The file is deserialized with ``torch.load`` and every tensor in it is
    mapped onto the CPU, so no GPU is needed to read it.

    Args:
        path: Location of the saved file, in any form ``torch.load`` accepts.

    Returns:
        A ``(embeddings, db_id_to_index)`` tuple taken from the loaded
        object's ``"embeddings"`` and ``"db_id_to_index"`` entries. The
        original annotations describe these as an ``(N, D)`` tensor and a
        mapping from db_id to row index; neither is validated here.
    """
    checkpoint = torch.load(path, map_location="cpu")
    # "embeddings" is looked up first, then "db_id_to_index".
    return checkpoint["embeddings"], checkpoint["db_id_to_index"]

def cosine_similarity_between_ids(embedding_path, db_id1, db_id2):
    """Compute the cosine similarity between the embeddings of two db_ids.

    The embeddings file is re-read through ``load_embeddings`` on every call.

    Args:
        embedding_path: Path passed straight to ``load_embeddings``.
        db_id1: First identifier; must be a key of the loaded mapping.
        db_id2: Second identifier; must be a key of the loaded mapping.

    Returns:
        The similarity as a plain Python number (extracted with ``.item()``).

    Raises:
        ValueError: If either identifier is absent from the mapping.
    """
    vectors, index_of = load_embeddings(embedding_path)

    both_known = db_id1 in index_of and db_id2 in index_of
    if not both_known:
        raise ValueError(f"One or both db_ids not found: {db_id1}, {db_id2}")

    # Add a leading axis to each selected row so that the similarity is
    # reduced over dim=1 and yields a single-element result.
    first_row = vectors[index_of[db_id1]].unsqueeze(0)
    second_row = vectors[index_of[db_id2]].unsqueeze(0)

    score = F.cosine_similarity(first_row, second_row, dim=1)
    return score.item()


In [3]:
# Absolute location of the saved embeddings file; adjust for your environment.
embedding_path = "/hpctmp/e1351271/wkdbs/out/col_matcher_bge-m3_database/weights/finetuned_bge_m3_softmax_lr1e-05/embeddings/all_embeddings.pt"

# The two identifiers to compare, given as strings.
db_id1, db_id2 = "35179", "80846"

similarity = cosine_similarity_between_ids(
    embedding_path,
    db_id1,
    db_id2,
)
# Report the score rounded to four decimal places.
print(f"Cosine similarity between {db_id1} and {db_id2}: {similarity:.4f}")


Cosine similarity between 35179 and 80846: -0.1363
