In [None]:
# Optional one-time setup: uncomment to install the packages this notebook imports.
# NOTE(review): inside Jupyter, `%pip` installs into the running kernel's environment;
# consider using it (with pinned versions) instead of `!pip`.
# !pip install -q sentence-transformers torch tqdm pandas numpy faiss-cpu


In [None]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the training table with the tolerant pure-Python parser (rows that cannot
# be parsed are skipped instead of aborting the read), then normalise the names:
# lower-case, turn bracket/comma/hyphen characters into spaces, collapse runs of
# whitespace, and trim both ends.
# NOTE(review): `.astype(str)` turns missing names into the literal string "nan".
trainDF = pd.read_csv(
    "data/finalData.csv",
    encoding="utf-8",
    engine="python",
    on_bad_lines="skip",
).assign(
    ascii_name_clean=lambda frame: (
        frame["ascii_name_clean"]
        .astype(str)
        .str.lower()
        .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
)

(387828, 2)


In [3]:
# Sanity check of the loaded training data: (rows, columns), then the first rows.
print("Training Dataframe shape:", trainDF.shape)
display("Head of the Train Dataframe", trainDF.head())

Training Dataframe shape: (387828, 2)


'Head of the Train Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,3,r 3 hydroxybutanoyl n 2
1,7,s + 3 carene
2,7,1s 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene
3,7,+ 3 carene
4,7,1s 6r 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene


In [4]:
# Load the held-out query set with pandas' default parser settings.
# NOTE(review): unlike the training names, `ascii_name_clean` is not re-normalised
# after loading — presumably testData.csv is stored pre-cleaned; confirm, because the
# exact-name filter in the metric functions compares the two columns verbatim.
testDF = pd.read_csv("data/testData.csv")

In [5]:
# Sanity check of the loaded test data: (rows, columns), then the first rows.
# The label names the test frame so it cannot be mistaken for the training frame.
print("Test Dataframe shape:", testDF.shape)
display("Head of the Test Dataframe", testDF.head())

Training Dataframe shape: (25000, 2)


'Head of the Test Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,7,1alpha 6alpha car 3 ene
1,20,1r 4s 2 2 dimethyl 3 methylenebicyclo 2.2.1 he...
2,31,+ menthone
3,38,+ o methylthalicberine
4,39,dextropimaricacid


In [None]:
# Token budget per compound name; longer inputs are truncated before encoding.
MAX_SEQ_LENGTH = 32

#NOTE Untuned models
MODEL_NAME = "all-mpnet-base-v2"
#MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(
    MODEL_NAME,
    tokenizer_kwargs={"model_max_length": MAX_SEQ_LENGTH},
)
# Also set the limit on the model itself: SentenceTransformer truncates inputs to
# `max_seq_length`, and a pretrained checkpoint may ship its own (larger) value.
# This keeps the untuned baseline on the same token budget as the tuned path below.
model.max_seq_length = MAX_SEQ_LENGTH

#NOTE loading and eval tuned models
# MODEL_PATH = ""
# model = SentenceTransformer(MODEL_PATH)
# model.max_seq_length = MAX_SEQ_LENGTH

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Training names as plain Python strings, in DataFrame row order.
train_names = trainDF["ascii_name_clean"].astype(str).tolist()

# Encode every name in batches; the encoder returns unit-length vectors
# (normalize_embeddings=True), which are then cast to float32.
train_embeddings = model.encode(
    train_names, batch_size=512, show_progress_bar=True, normalize_embeddings=True
)
train_embeddings = train_embeddings.astype("float32")

In [None]:
# Build a flat inner-product FAISS index over the training embeddings.
# The embeddings were encoded with normalize_embeddings=True, so ranking by
# inner product is ranking by cosine similarity.
# `dim` is the embedding width (second axis of the embedding matrix).
dim = train_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)   
index.add(train_embeddings)
# `ntotal` reports how many vectors the index now holds.
print("FAISS index size:", index.ntotal)

In [None]:
# Query names as plain Python strings, in DataFrame row order.
test_names = testDF["ascii_name_clean"].astype(str).tolist()

# Encode the queries with the same settings used for the training names:
# batches of 512, unit-length output vectors, float32 storage.
test_embeddings = model.encode(
    test_names, batch_size=512, show_progress_bar=True, normalize_embeddings=True
)
test_embeddings = test_embeddings.astype("float32")

print("Test embeddings:", test_embeddings.shape)

In [None]:
def recall_at_k_faiss(
    index, test_embeddings,
    train_names, train_cids,
    test_names, test_cids,
    k=5, buffer=10):
    """Fraction of queries whose top-k neighbours contain the correct compound.

    Neighbours whose name is identical to the query name are ignored and do not
    use up one of the k slots, so the metric measures synonym retrieval rather
    than exact string matching.

    Args:
        index: Object exposing ``search(queries, n) -> (scores, ids)`` in the
            style of a FAISS index.
        test_embeddings: Query vectors, one row per entry of ``test_names``.
        train_names: Name of each indexed item, addressed by the returned ids.
        train_cids: Compound id of each indexed item, aligned with ``train_names``.
        test_names: Name of each query.
        test_cids: Correct compound id of each query.
        k: Number of (non-identical) neighbours that count towards a hit.
        buffer: Extra neighbours to fetch so that k candidates remain after
            exact-name matches are dropped.

    Returns:
        Hits divided by the number of queries; ``0.0`` when there are no queries.
    """
    n_queries = len(test_names)
    if n_queries == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # Over-fetch: some of the nearest neighbours may be the query string itself.
    _, I = index.search(test_embeddings, k + buffer)

    hits = 0
    for i in tqdm(range(n_queries), desc=f"Recall@{k}"):
        true_cid = test_cids[i]
        query_name = test_names[i]

        considered = 0  # non-identical candidates inspected so far
        for j in I[i]:
            if j < 0:
                # FAISS pads with -1 when fewer neighbours exist; -1 must not
                # be used as an index (it would wrap to the last training row).
                continue
            if train_names[j] == query_name:
                continue

            if train_cids[j] == true_cid:
                hits += 1
                break

            considered += 1
            if considered >= k:
                # Only the first k valid candidates count; the buffer exists
                # solely to replace filtered-out exact matches.
                break

    return hits / n_queries


In [None]:
def precision_at_k_and_mrr_faiss(
    index, test_embeddings,
    train_names, train_cids,
    test_names, test_cids,
    ks=(1, 5, 10), buffer=20
):
    """Mean Precision@k for several cutoffs, plus MRR, from one index search.

    For every query the nearest neighbours are fetched once, neighbours whose
    name equals the query name are dropped, and the first ``max(ks)`` remaining
    candidates form the ranked list used for all metrics.

    Args:
        index: Object exposing ``search(queries, n) -> (scores, ids)`` in the
            style of a FAISS index.
        test_embeddings: Query vectors, one row per entry of ``test_names``.
        train_names: Name of each indexed item, addressed by the returned ids.
        train_cids: Compound id of each indexed item, aligned with ``train_names``.
        test_names: Name of each query.
        test_cids: Correct compound id of each query.
        ks: Cutoffs at which precision is reported.
        buffer: Extra neighbours to fetch so that ``max(ks)`` candidates remain
            after exact-name matches are dropped.

    Returns:
        ``(mean_precisions, mrr)`` where ``mean_precisions`` maps each k to the
        mean of (relevant items in the top k) / k, and ``mrr`` is the mean
        reciprocal rank of the first relevant item within the ranked list
        (0 for a query with none). With no queries, every value is ``0.0``.
    """
    max_k = max(ks)
    n_queries = len(test_names)
    if n_queries == 0:
        # Nothing to evaluate; avoid averaging empty lists (NaN + warning).
        return {k: 0.0 for k in ks}, 0.0

    precisions = {k: [] for k in ks}
    reciprocal_ranks = []

    # Over-fetch: some of the nearest neighbours may be the query string itself.
    _, I = index.search(test_embeddings, max_k + buffer)

    for i in tqdm(range(n_queries), desc="Precision/MRR"):
        true_cid = test_cids[i]
        query_name = test_names[i]

        # Relevance flag of each kept candidate, in rank order.
        relevant = []
        for j in I[i]:
            if j < 0:
                # FAISS pads with -1 when fewer neighbours exist; -1 must not
                # be used as an index (it would wrap to the last training row).
                continue
            if train_names[j] == query_name:
                continue
            relevant.append(bool(train_cids[j] == true_cid))
            if len(relevant) >= max_k:
                break

        # Precision always divides by k, even if fewer candidates were available.
        for k in ks:
            precisions[k].append(sum(relevant[:k]) / k)

        # Reciprocal rank of the first relevant candidate, or 0 if there is none.
        rr = next(
            (1.0 / rank for rank, hit in enumerate(relevant, start=1) if hit),
            0.0,
        )
        reciprocal_ranks.append(rr)

    mean_precisions = {k: np.mean(v) for k, v in precisions.items()}
    mrr = np.mean(reciprocal_ranks)

    return mean_precisions, mrr

In [None]:
# Compound-id labels as arrays, in the same row order as the corresponding name lists.
train_cids, test_cids = (
    frame["compound_id"].values for frame in (trainDF, testDF)
)

In [None]:
# Positional inputs shared by every recall evaluation below.
recall_inputs = (
    index, test_embeddings,
    train_names, train_cids,
    test_names, test_cids,
)

# Evaluate the cutoffs in report order: k = 1, then 5, then 10.
recall_1, recall_5, recall_10 = (
    recall_at_k_faiss(*recall_inputs, k=cutoff) for cutoff in (1, 5, 10)
)

print("")
print(f"Recall@1:  {recall_1:.4f}")
print(f"Recall@5:  {recall_5:.4f}")
print(f"Recall@10: {recall_10:.4f}")

In [None]:
# Precision@k for k in {1, 5, 10} plus MRR; the helper runs a single index search
# and derives every metric from it.
# Note: the helper's ranked list is capped at max(ks) candidates, so MRR here is
# the reciprocal rank within the top 10 non-identical neighbours (0 if none match).
precisions, mrr = precision_at_k_and_mrr_faiss(
    index, test_embeddings,
    train_names, train_cids,
    test_names, test_cids,
    ks=(1, 5, 10)
)

# `precisions` is keyed by k; values are means over all test queries.
print(f"Precision@1:  {precisions[1]:.4f}")
print(f"Precision@5:  {precisions[5]:.4f}")
print(f"Precision@10: {precisions[10]:.4f}")
print(f"MRR:          {mrr:.4f}")