In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from tqdm import tqdm
import pandas as pd
import numpy as np
import faiss

In [None]:
# Load the training names. engine="python" with on_bad_lines="skip" drops
# malformed CSV rows instead of aborting the whole load.
trainDF = (
    pd.read_csv(
        "data/finalData.csv",
        engine="python",
        on_bad_lines="skip",
        encoding="utf-8",
    )
    # A missing name would become the literal string "nan" under astype(str)
    # below and then be indexed as if it were a real compound name.
    .dropna(subset=["ascii_name_clean"])
    .reset_index(drop=True)
)

# Normalise the names: lower-case, turn brackets / commas / hyphens into
# spaces, collapse whitespace runs, and trim both ends.
trainDF["ascii_name_clean"] = (
    trainDF["ascii_name_clean"]
    .astype(str)
    .str.lower()
    .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [3]:
# Sanity check on the training frame: its dimensions, then a captioned preview
# of the first rows.
print("Training Dataframe shape:", trainDF.shape)
display("Head of the Train Dataframe")
display(trainDF.head())

Training Dataframe shape: (387828, 2)


'Head of the Train Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,3,r 3 hydroxybutanoyl n 2
1,7,s + 3 carene
2,7,1s 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene
3,7,+ 3 carene
4,7,1s 6r 3 7 7 trimethylbicyclo 4.1.0 hept 3 ene


In [4]:
# Load the held-out queries. Rows without a name cannot be vectorised (the
# TF-IDF transform rejects NaN documents), so they are dropped up front.
testDF = (
    pd.read_csv("data/testData.csv")
    .dropna(subset=["ascii_name_clean"])
    .reset_index(drop=True)
)

# Apply exactly the same normalisation as the training names so queries and
# indexed names live in the same character n-gram space: lower-case, turn
# brackets / commas / hyphens into spaces, collapse whitespace, trim.
testDF["ascii_name_clean"] = (
    testDF["ascii_name_clean"]
    .astype(str)
    .str.lower()
    .str.replace(r"[\(\)\[\],\-]", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [5]:
# Sanity check on the test frame: its dimensions, then a captioned preview of
# the first rows. The label names the test set (it previously said "Training").
print("Test Dataframe shape:", testDF.shape)
display("Head of the Test Dataframe", testDF.head())

Training Dataframe shape: (25000, 2)


'Head of the Test Dataframe'

Unnamed: 0,compound_id,ascii_name_clean
0,7,1alpha 6alpha car 3 ene
1,20,1r 4s 2 2 dimethyl 3 methylenebicyclo 2.2.1 he...
2,31,+ menthone
3,38,+ o methylthalicberine
4,39,dextropimaricacid


In [None]:
# Character n-gram TF-IDF over the cleaned names.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),      # character 3-, 4- and 5-grams
    min_df=3,                # ignore n-grams seen in fewer than 3 names
    max_df=0.9,              # ignore n-grams seen in more than 90% of names
    max_features=200_000,    # cap the vocabulary size
    norm="l2",               # rows come out unit-length; no extra normalize() needed
    dtype=np.float32,
)

# Fit the vocabulary / IDF weights on the training names only, then project
# the test names into that same feature space.
X_train = vectorizer.fit_transform(trainDF["ascii_name_clean"])
X_test = vectorizer.transform(testDF["ascii_name_clean"])

In [None]:
# Reduce the sparse TF-IDF matrix to 256 dense components; the fixed seed keeps
# the randomized solver reproducible.
svd = TruncatedSVD(n_components=256, random_state=42)

# Fit the projection on the training matrix only, reuse it for the test matrix,
# and rescale every projected row to unit L2 norm.
X_train_svd = normalize(svd.fit_transform(X_train))
X_test_svd = normalize(svd.transform(X_test))

In [8]:
# FAISS works on C-contiguous float32 matrices, and normalize_L2 rescales its
# argument in place. Cast once *before* normalising so the call cannot fail on
# a float64 array and the indexed vectors are exactly the normalised ones.
X_train_svd = np.ascontiguousarray(X_train_svd, dtype=np.float32)
X_test_svd = np.ascontiguousarray(X_test_svd, dtype=np.float32)

faiss.normalize_L2(X_train_svd)
faiss.normalize_L2(X_test_svd)

# Exact inner-product index; on unit-length vectors the inner product equals
# cosine similarity.
dim = X_train_svd.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(X_train_svd)

In [9]:
def recall_at_k_faiss(
    index, X_test_dense,
    train_names, train_cids,
    test_names, test_cids,
    k=5, buffer=5
):
    """Recall@k of a nearest-neighbour index, ignoring exact-name matches.

    For each query the index is asked for ``k + buffer`` neighbours. Neighbours
    whose training name equals the query name are discarded, and the query
    counts as a hit when any of the first ``k`` remaining neighbours has the
    query's compound id.

    Args:
        index: Object exposing ``search(X, n) -> (distances, indices)``.
        X_test_dense: Query matrix, one row per query (cast to float32).
        train_names: Training names, indexable by the returned indices.
        train_cids: Training compound ids, aligned with ``train_names``.
        test_names: Query names, aligned with the rows of ``X_test_dense``.
        test_cids: Query compound ids, aligned with ``test_names``.
        k: Number of retained neighbours inspected per query.
        buffer: Extra neighbours fetched to make up for discarded
            exact-name matches.

    Returns:
        Fraction of queries with a hit, or ``0.0`` when there are no queries.
    """
    n_queries = len(test_names)
    if n_queries == 0:
        return 0.0

    _, I = index.search(X_test_dense.astype("float32"), k + buffer)

    hits = 0
    for i in tqdm(range(n_queries)):
        cid = test_cids[i]
        query_name = test_names[i]

        kept = 0
        for j in I[i]:
            # Only the first k retained neighbours count; the buffer exists
            # solely to replace skipped entries.
            if kept >= k:
                break
            # The index pads with -1 when it has fewer results than requested;
            # a negative index would otherwise wrap to the last training row.
            if j < 0:
                continue
            if train_names[j] == query_name:
                continue
            kept += 1
            if train_cids[j] == cid:
                hits += 1
                break

    return hits / n_queries

In [None]:
def precision_at_k_and_mrr_faiss(
    index, X_test_dense,
    train_names, train_cids,
    test_names, test_cids,
    ks=(1, 5, 10),
    buffer=10
):
    """Precision@k and mean reciprocal rank, ignoring exact-name matches.

    For each query the index is asked for ``max(ks) + buffer`` neighbours.
    Neighbours whose training name equals the query name are discarded and the
    first ``max(ks)`` remaining ones form the ranked list. Precision@k is the
    share of the top ``k`` entries carrying the query's compound id (always
    divided by ``k``, even if fewer entries are available). The reciprocal
    rank is ``1 / rank`` of the first such entry, or ``0`` if there is none.

    Args:
        index: Object exposing ``search(X, n) -> (distances, indices)``.
        X_test_dense: Query matrix, one row per query (cast to float32).
        train_names: Training names, indexable by the returned indices.
        train_cids: Training compound ids, aligned with ``train_names``.
        test_names: Query names, aligned with the rows of ``X_test_dense``.
        test_cids: Query compound ids, aligned with ``test_names``.
        ks: Cut-offs at which precision is reported.
        buffer: Extra neighbours fetched to make up for discarded
            exact-name matches.

    Returns:
        ``(precision_at_k, mrr)`` where ``precision_at_k`` maps each cut-off to
        its mean precision and ``mrr`` is the mean reciprocal rank. Both are
        ``0.0`` when there are no queries.

    Raises:
        ValueError: If ``ks`` is empty.
    """
    ks = tuple(ks)
    if not ks:
        raise ValueError("ks must contain at least one cut-off")
    max_k = max(ks)

    n_queries = len(test_names)
    if n_queries == 0:
        return {k: 0.0 for k in ks}, 0.0

    precision_hits = {k: [] for k in ks}
    reciprocal_ranks = []

    _, I = index.search(X_test_dense.astype("float32"), max_k + buffer)

    for i in tqdm(range(n_queries), desc="Precision/MRR"):
        true_cid, query_name = test_cids[i], test_names[i]

        # Relevance flags of the ranked list, best neighbour first.
        relevant = []
        for j in I[i]:
            # The index pads with -1 when it has fewer results than requested;
            # a negative index would otherwise wrap to the last training row.
            if j < 0 or train_names[j] == query_name:
                continue
            relevant.append(bool(train_cids[j] == true_cid))
            if len(relevant) >= max_k:
                break

        for k in ks:
            precision_hits[k].append(sum(relevant[:k]) / k)

        rr = 0.0
        for rank, is_relevant in enumerate(relevant, start=1):
            if is_relevant:
                rr = 1.0 / rank
                break
        reciprocal_ranks.append(rr)

    precision_at_k = {k: float(np.mean(v)) for k, v in precision_hits.items()}
    mrr = float(np.mean(reciprocal_ranks))

    return precision_at_k, mrr


In [None]:
# Name and compound-id columns as arrays, so the neighbour indices returned by
# the index can be used for direct positional lookup.
train_names = trainDF["ascii_name_clean"].values
test_names = testDF["ascii_name_clean"].values
train_cids = trainDF["compound_id"].values
test_cids = testDF["compound_id"].values

# Same evaluation at three cut-offs; only k changes between the runs.
recall_1, recall_5, recall_10 = (
    recall_at_k_faiss(
        index, X_test_svd,
        train_names, train_cids,
        test_names, test_cids,
        k=cutoff,
    )
    for cutoff in (1, 5, 10)
)

print(f"Recall@1:  {recall_1:.4f}")
print(f"Recall@5:  {recall_5:.4f}")
print(f"Recall@10: {recall_10:.4f}")

In [None]:
# Rebuild the name lookups as plain Python lists of str; the compound-id
# arrays (train_cids / test_cids) are reused from the recall cell.
test_names = testDF["ascii_name_clean"].astype(str).tolist()
train_names = trainDF["ascii_name_clean"].astype(str).tolist()

# One pass over the neighbours yields precision at every cut-off plus the MRR.
precisions, mrr = precision_at_k_and_mrr_faiss(
    index,
    X_test_svd,
    train_names,
    train_cids,
    test_names,
    test_cids,
    ks=(1, 5, 10),
    buffer=10,
)

print(f"Precision@1:  {precisions[1]:.4f}")
print(f"Precision@5:  {precisions[5]:.4f}")
print(f"Precision@10: {precisions[10]:.4f}")
print(f"MRR:          {mrr:.4f}")

Precision/MRR: 100%|██████████| 25000/25000 [00:01<00:00, 24353.27it/s]

Precision@1:  0.1843
Precision@5:  0.0866
Precision@10: 0.0539
MRR:          0.2330



