In [13]:
from rapidfuzz.distance import Levenshtein
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:
# Load the training table and normalise the name column in one chain.
# `on_bad_lines="skip"` drops rows the parser cannot handle instead of raising,
# so the resulting row count may be lower than the number of lines in the file.
trainDF = pd.read_csv(
    "data/finalData.csv",
    engine="python",
    on_bad_lines="skip",
    encoding="utf-8",
).assign(
    # Normalisation steps, in order: cast to str, lower-case, replace
    # ( ) [ ] , - with a space, collapse whitespace runs, trim the ends.
    # NOTE(review): astype(str) turns missing values into the literal "nan".
    ascii_name_clean=lambda frame: (
        frame["ascii_name_clean"]
        .astype(str)
        .str.lower()
        .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
)

In [15]:
# Sanity check of the loaded training table: dimensions first, then a preview.
print("Training Dataframe shape:", trainDF.shape)
display("Head of the Train Dataframe")
display(trainDF.head())

Training Dataframe shape: (387828, 2)


'Head of the Train Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,3,r 3 hydroxybutanoyl n 2
1,7,s + 3 carene
2,7,1s 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene
3,7,+ 3 carene
4,7,1s 6r 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene


In [16]:
# Load the held-out query table and normalise its names with the same rules
# used for trainDF["ascii_name_clean"]. Edit distance compares the two columns
# character by character, so both sides must share casing, punctuation and
# whitespace conventions. The steps are idempotent: if the file is already
# clean this leaves the names unchanged.
testDF = pd.read_csv("data/testData.csv").assign(
    ascii_name_clean=lambda frame: (
        frame["ascii_name_clean"]
        .astype(str)
        .str.lower()
        .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
)

In [17]:
# Sanity check of the loaded test table. The label previously read
# "Training Dataframe shape:" although the value printed is testDF.shape.
print("Test Dataframe shape:", testDF.shape)
display("Head of the Test Dataframe", testDF.head())

Training Dataframe shape: (25000, 2)


'Head of the Test Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,7,1alpha 6alpha car 3 ene
1,20,1r 4s 2 2 dimethyl 3 methylenebicyclo 2.2.1 he...
2,31,+ menthone
3,38,+ o methylthalicberine
4,39,dextropimaricacid


In [None]:
def edit_distance_recall_at_k_train_test(
    train_names, train_cids,
    test_names, test_cids,
    k=5, max_candidates=4000,
    length_window=5,
    rng=None
):
    """Hit rate at ``k`` of a Levenshtein nearest-neighbour baseline.

    For every test name, the training names are ranked by edit distance and
    the query counts as a hit when at least one of the ``k`` closest training
    names carries the same compound id as the query.

    Candidate selection, per query:
      * only training names whose length differs from the query's by at most
        ``length_window`` characters are considered;
      * if more than ``max_candidates`` remain, a random subset of that size is
        drawn without replacement;
      * training names identical to the query string are skipped.

    Parameters
    ----------
    train_names, train_cids : indexable sequences of equal length
        Training names and the compound id of each name.
    test_names, test_cids : indexable sequences of equal length
        Query names and the compound id each query should retrieve.
    k : int
        Number of top-ranked candidates inspected per query.
    max_candidates : int
        Upper bound on the number of candidates scored per query.
    length_window : int
        Maximum allowed absolute length difference between query and candidate.
    rng : optional
        Object exposing ``choice(a, size, replace=...)`` (for example
        ``np.random.default_rng(seed)``). When omitted, NumPy's global random
        state is used, so results vary between runs unless it is seeded.

    Returns
    -------
    float
        ``hits / number_of_queries``, or ``0.0`` when there are no queries.
    """
    # Default to the global NumPy state so existing callers behave as before.
    sampler = np.random if rng is None else rng

    # Training-name lengths do not depend on the query: measure them once
    # instead of once per test query.
    train_lengths = np.array([len(name) for name in train_names], dtype=np.int64)

    hits, total = 0, 0

    for i in tqdm(range(len(test_names)), desc="Edit-distance eval"):
        # Bind the query before measuring it; computing len(q) in the same
        # tuple assignment that defines q reads q before it exists.
        q = test_names[i]
        qlen = len(q)
        target_cid = test_cids[i]

        # Ascending indices of training names inside the length window.
        candidates = np.flatnonzero(
            np.abs(train_lengths - qlen) <= length_window
        )

        if len(candidates) > max_candidates:
            candidates = sampler.choice(
                candidates, max_candidates, replace=False
            )

        scored = []
        for j in candidates:
            if train_names[j] == q:
                continue
            scored.append((j, Levenshtein.distance(q, train_names[j])))

        # sorted() is stable, so ties keep their candidate order.
        topk = sorted(scored, key=lambda x: x[1])[:k]

        if any(train_cids[j] == target_cid for j, _ in topk):
            hits += 1

        total += 1

    return hits / total if total > 0 else 0.0

In [21]:
def edit_distance_ranked_candidates(
    query, train_names,
    max_candidates=4000, length_window=5
):
    """Rank training names by Levenshtein distance to ``query``.

    Only training names whose length is within ``length_window`` characters of
    the query's length are considered. If more than ``max_candidates`` qualify,
    a random subset of that size is drawn without replacement from NumPy's
    global random state, so repeated calls may rank different subsets.
    Training names equal to ``query`` are left out of the result.

    Returns a list of ``(train_index, distance)`` tuples ordered by ascending
    distance; ties keep the order in which candidates were scored. Indices are
    NumPy integers when subsampling took place, plain ints otherwise.
    """
    query_len = len(query)

    # Positions of training names that pass the length pre-filter.
    pool = []
    for idx, candidate_name in enumerate(train_names):
        if abs(len(candidate_name) - query_len) <= length_window:
            pool.append(idx)

    # Cap the amount of distance computations per query.
    if len(pool) > max_candidates:
        pool = np.random.choice(pool, max_candidates, replace=False)

    ranked = [
        (idx, Levenshtein.distance(query, train_names[idx]))
        for idx in pool
        if train_names[idx] != query
    ]

    # Stable sort on the distance component only.
    return sorted(ranked, key=lambda pair: pair[1])

In [None]:
def precision_at_k_edit_distance(
    train_names, train_cids,
    test_names, test_cids,
    k=5, max_candidates=4000,
    length_window=5
):
    """Mean precision@k of the edit-distance baseline over all test queries.

    Each test name is ranked against the training names with
    ``edit_distance_ranked_candidates``. The per-query precision is the number
    of the ``k`` closest training names sharing the query's compound id,
    divided by ``k`` (also when fewer than ``k`` candidates were returned).
    A query with no candidates at all contributes ``0.0``.

    Parameters
    ----------
    train_names, train_cids : indexable sequences of equal length
        Training names and the compound id of each name.
    test_names, test_cids : indexable sequences of equal length
        Query names and the compound id each query should retrieve.
    k : int
        Cut-off rank; must be non-zero whenever a query has candidates.
    max_candidates, length_window : int
        Forwarded to ``edit_distance_ranked_candidates``; the defaults match
        that function's own defaults.

    Returns
    -------
    float
        Average per-query precision, or ``0.0`` when there are no queries
        (previously ``nan`` with a NumPy RuntimeWarning).
    """
    precisions = []

    for i in tqdm(range(len(test_names)), desc=f"P@{k}"):
        ranked = edit_distance_ranked_candidates(
            test_names[i],
            train_names,
            max_candidates=max_candidates,
            length_window=length_window,
        )

        if not ranked:
            precisions.append(0.0)
            continue

        target_cid = test_cids[i]
        relevant = sum(
            1 for j, _ in ranked[:k]
            if train_cids[j] == target_cid
        )
        precisions.append(relevant / k)

    # np.mean([]) is nan; report 0.0 for an empty evaluation set instead, in
    # line with edit_distance_recall_at_k_train_test.
    if not precisions:
        return 0.0
    return float(np.mean(precisions))

In [None]:
def mrr_edit_distance(
    train_names, train_cids,
    test_names, test_cids,
    max_candidates=4000, length_window=5
):
    """Mean reciprocal rank of the edit-distance baseline.

    Each test name is ranked against the training names with
    ``edit_distance_ranked_candidates``. Its reciprocal rank is ``1 / r``,
    where ``r`` is the 1-based position of the first ranked training name that
    shares the query's compound id; the whole returned ranking is searched,
    not just a top-k prefix. Queries with no matching candidate score ``0.0``.

    Parameters
    ----------
    train_names, train_cids : indexable sequences of equal length
        Training names and the compound id of each name.
    test_names, test_cids : indexable sequences of equal length
        Query names and the compound id each query should retrieve.
    max_candidates, length_window : int
        Forwarded to ``edit_distance_ranked_candidates``; the defaults match
        that function's own defaults.

    Returns
    -------
    float
        Average reciprocal rank, or ``0.0`` when there are no queries
        (previously ``nan`` with a NumPy RuntimeWarning).
    """
    rr = []

    for i in tqdm(range(len(test_names)), desc="MRR"):
        ranked = edit_distance_ranked_candidates(
            test_names[i],
            train_names,
            max_candidates=max_candidates,
            length_window=length_window,
        )

        target_cid = test_cids[i]

        # First position whose compound id matches, else 0.0.
        reciprocal = next(
            (
                1.0 / rank
                for rank, (j, _) in enumerate(ranked, start=1)
                if train_cids[j] == target_cid
            ),
            0.0,
        )
        rr.append(reciprocal)

    # np.mean([]) is nan; report 0.0 for an empty evaluation set instead, in
    # line with edit_distance_recall_at_k_train_test.
    if not rr:
        return 0.0
    return float(np.mean(rr))

In [None]:
# Pull the name and compound-id columns out of each frame once, so the
# evaluation loops index plain Python lists / NumPy arrays rather than pandas
# objects.
train_names, train_cids = (
    trainDF["ascii_name_clean"].astype(str).tolist(),
    trainDF["compound_id"].values,
)
test_names, test_cids = (
    testDF["ascii_name_clean"].astype(str).tolist(),
    testDF["compound_id"].values,
)

In [20]:
# Report the baseline's hit rate at several cut-offs. Each call re-scores the
# whole test set from scratch, so the cost is paid once per value of k.
for k in (1, 5, 10):
    r = edit_distance_recall_at_k_train_test(
        train_names=train_names,
        train_cids=train_cids,
        test_names=test_names,
        test_cids=test_cids,
        k=k,
    )
    print(f"Edit distance Recall@{k}: {r:.4f}")

Edit-distance eval: 100%|██████████| 25000/25000 [33:14<00:00, 12.53it/s]  


Edit distance Recall@1: 0.0413


Edit-distance eval: 100%|██████████| 25000/25000 [32:33<00:00, 12.80it/s] 


Edit distance Recall@5: 0.0608


Edit-distance eval: 100%|██████████| 25000/25000 [36:43<00:00, 11.35it/s] 

Edit distance Recall@10: 0.0653





In [None]:
# Ranking-quality metrics for the edit-distance baseline: MRR over the full
# ranking first, then precision at several cut-offs. Every metric call
# re-ranks the whole test set.
print("Edit-distance baseline metrics:\n")

print("MRR:", mrr_edit_distance(
    train_names=train_names,
    train_cids=train_cids,
    test_names=test_names,
    test_cids=test_cids,
))

for k in (1, 5, 10):
    p = precision_at_k_edit_distance(
        train_names=train_names,
        train_cids=train_cids,
        test_names=test_names,
        test_cids=test_cids,
        k=k,
    )
    print(f"Precision@{k}: {p:.4f}")

Edit-distance baseline metrics:



MRR: 100%|██████████| 25000/25000 [51:09<00:00,  8.14it/s]  


MRR: 0.05108707899510577


P@1: 100%|██████████| 25000/25000 [1:20:57<00:00,  5.15it/s]


Precision@1: 0.0425


P@5: 100%|██████████| 25000/25000 [32:55<00:00, 12.65it/s]


Precision@5: 0.0133


P@10: 100%|██████████| 25000/25000 [45:49<00:00,  9.09it/s]  

Precision@10: 0.0068



