In [1]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('PubChemLite_CCSbase_20251128.csv')

In [3]:
df.columns

Index(['Identifier', 'FirstBlock', 'PubMed_Count', 'Patent_Count',
       'Related_CIDs', 'Synonym', 'MolecularFormula', 'SMILES', 'InChI',
       'InChIKey', 'MonoisotopicMass', 'XLogP', 'CompoundName',
       'AnnoTypeCount', 'AgroChemInfo', 'BioPathway', 'DrugMedicInfo',
       'FoodRelated', 'PharmacoInfo', 'SafetyInfo', 'ToxicityInfo', 'KnownUse',
       'DisorderDisease', 'Identification', 'ChemClass', 'pred_CCS_A2_[M+H]+',
       'pred_CCS_A2_[M+Na]+', 'pred_CCS_A2_[M-H]-', 'pred_CCS_A2_[M+NH4]+',
       'pred_CCS_A2_[M+K]+', 'pred_CCS_A2_[M+H-H2O]+', 'pred_CCS_A2_[M+HCOO]-',
       'pred_CCS_A2_[M+CH3COO]-', 'pred_CCS_A2_[M+Na-2H]-', 'pred_CCS_A2_[M]+',
       'pred_CCS_A2_[M]-'],
      dtype='object')

In [4]:
df[df["Identifier"] == 10294]

Unnamed: 0,Identifier,FirstBlock,PubMed_Count,Patent_Count,Related_CIDs,Synonym,MolecularFormula,SMILES,InChI,InChIKey,...,pred_CCS_A2_[M+Na]+,pred_CCS_A2_[M-H]-,pred_CCS_A2_[M+NH4]+,pred_CCS_A2_[M+K]+,pred_CCS_A2_[M+H-H2O]+,pred_CCS_A2_[M+HCOO]-,pred_CCS_A2_[M+CH3COO]-,pred_CCS_A2_[M+Na-2H]-,pred_CCS_A2_[M]+,pred_CCS_A2_[M]-
5585,10294,JSYUKXVWRPINHC,0,1150,23690449 25203568,Salirgan; Salurin; Salyrgan,C13H16HgNO5,COC(CNC(=O)C1=CC=CC=C1OCC(=O)O)C[Hg],InChI=1S/C13H16NO5.Hg/c1-9(18-2)7-14-13(17)10-...,JSYUKXVWRPINHC-UHFFFAOYSA-N,...,204.0,201.5,213.7,202.2,192.1,220.0,208.0,198.5,204.6,204.6


In [5]:
syn_counts = (
    df["Synonym"]
    .fillna("")
    .astype(str)
    .str.split(";")
    .apply(lambda x: len([s for s in x if s.strip()]))
)

num_multi_name = (syn_counts > 1).sum()
print("Molecules with multiple names:", num_multi_name)

df_multi = df[syn_counts > 1]

rows = []

for _, row in df_multi.iterrows():
    cid = row["Identifier"]   # don't force int unless required
    synonyms = str(row["Synonym"]).split(";")
    
    for name in synonyms:
        name = name.strip().lower()
        if name:
            rows.append((name, cid))

name_df = pd.DataFrame(rows, columns=["name", "cid"])
print("Total names (only multi-synonym molecules):", len(name_df))
name_df.head()

Molecules with multiple names: 216
Total names (only multi-synonym molecules): 497


Unnamed: 0,name,cid
0,salirgan,10294
1,salurin,10294
2,salyrgan,10294
3,chloro(methyl)palladium,12378
4,cyclooctane,12378


In [7]:
pairs = []
labels = []
cid_groups = name_df.groupby("cid")["name"].apply(list)

for names in cid_groups:
    if len(names) > 1:
        for i in range(min(3, len(names)-1)):
            a, b = random.sample(names, 2)
            pairs.append((a, b))
            labels.append(1)

all_names = name_df["name"].tolist()
for _ in range(len(pairs)):
    a, b = random.sample(all_names, 2)
    pairs.append((a, b))
    labels.append(0)

print("Training pairs:", len(pairs))

Training pairs: 562


In [8]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
train_examples = [
    InputExample(texts=[a, b], label=float(label))
    for (a, b), label in zip(pairs, labels)
]

train_dataloader = DataLoader(train_examples, batch_size=64, shuffle=True)
train_loss = losses.ContrastiveLoss(model)

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=5, warmup_steps=100)



Step,Training Loss


In [10]:
dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(dim)   
print("Flat index ready")

Flat index ready


In [11]:
batch_size = 1024
names = name_df["name"].tolist()

for i in tqdm(range(0, len(names), batch_size)):
    batch_names = names[i:i+batch_size]
    batch_embeddings = model.encode(
        batch_names,
        batch_size=256,
        show_progress_bar=False
    )
    
    batch_embeddings = np.array(batch_embeddings).astype("float32")
    index.add(batch_embeddings)

100%|██████████| 1/1 [00:36<00:00, 36.26s/it]


In [12]:
def synonymize(query, k=5):
    query = query.lower()
    qvec = model.encode([query]).astype("float32")
    distances, idxs = index.search(qvec, k)
    results = []
    
    for i in idxs[0]:
        results.append((
            name_df.iloc[i]["name"],
            name_df.iloc[i]["cid"]
        ))
    return results

synonymize("salirgan", k=5)

[('salirgan', 10294),
 ('salyrgan', 10294),
 ('salurin', 10294),
 ('dichloromanganese', 129907266),
 ('dichloromanganese', 122388791)]

In [13]:
def evaluate_accuracy(k=5, trials=1000):
    correct = 0
    
    for _ in range(trials):
        row = name_df.sample(1).iloc[0]
        query = row["name"]
        true_cid = row["cid"]
        results = synonymize(query, k)
        retrieved_cids = [cid for _, cid in results]
        
        if true_cid in retrieved_cids:
            correct += 1
    
    return correct / trials

for k in [1, 3, 5, 10]:
    print(f"Top-{k} Accuracy:", evaluate_accuracy(k))

Top-1 Accuracy: 0.616
Top-3 Accuracy: 0.75
Top-5 Accuracy: 0.792
Top-10 Accuracy: 0.895
