# Benchmarking a Classical Machine Learning Model on AptaBench
This notebook benchmarks a **traditional ML model** (LightGBM) on the AptaBench dataset, focusing on feature engineering, cross-validation, and baseline predictive performance.

## 1. Imports & Config

In [1]:
!pip install huggingface_hub[hf_xet]



In [2]:
# Show the installed PyTorch version; the bare trailing expression is rendered
# as the cell output.
import torch
torch.__version__

'2.6.0+cu124'

In [3]:
# Make the project's top-level "src" package importable no matter which
# sub-directory the notebook is started from.
import sys
import warnings
from pathlib import Path
from typing import Optional


def find_project_root(start: Path, marker: str = "src") -> Optional[Path]:
    """Locate the closest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Directory where the upward search begins.
        marker: Name of the file or directory that identifies the project root.

    Returns:
        The first matching directory, or ``None`` when the search reaches the
        filesystem root without finding ``marker``.
    """
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None


root = find_project_root(Path.cwd())
if root is None:
    # Nothing found: leave sys.path untouched (putting the filesystem root on
    # it would not help) and say so, so that a later "import src" failure is
    # easy to diagnose.
    warnings.warn(
        f'No "src" directory found in {Path.cwd()} or any of its parents; '
        "sys.path was left unchanged.",
        stacklevel=2,
    )
    root = Path.cwd()
else:
    root_entry = str(root)
    # Skip the insert when the root is already first, so re-running this cell
    # does not stack duplicate entries on sys.path.
    if not sys.path or sys.path[0] != root_entry:
        sys.path.insert(0, root_entry)

In [4]:
# === Imports ===
import os
import pandas as pd
from src.models.screening import screen_lgbm_optuna, load_splits
# --- Aptamer encoders ---
from src.encoders.aptamer_encoders import (
    onehot_with_type_bit,
    kmer_freq_with_type_bit,
    gena_embed,
    dnabert2_embed
)

# --- Molecule encoders ---
from src.encoders.molecule_encoders import (
    morgan_fp,
    maccs_fp,
    physchem_descriptors,
    chemberta_embed,
    unimol_embed,
    molformer_embed
)

# Paths
# Both are relative to the kernel's working directory: the "dataset" folder is
# expected to sit next to it ("../dataset").
# DATASET_PATH: CSV file with the benchmark table.
# SPLITS_DIR: directory with the split definition files (appears to hold one
# JSON file per split mode — confirm against src.models.screening).
DATASET_PATH = "../dataset/AptaBench_dataset.csv" 
SPLITS_DIR = "../dataset/splits"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the benchmark table and show its size plus the first rows as a quick
# sanity check.
df = pd.read_csv(DATASET_PATH)
print("Dataset loaded:", df.shape)
display(df.head())

# List the files found in SPLITS_DIR. os.listdir() returns names in arbitrary
# order, so sort them to keep the cell output stable across runs and platforms.
print("Available splits:", sorted(os.listdir(SPLITS_DIR)))

Dataset loaded: (2001, 8)


Unnamed: 0,type,sequence,canonical_smiles,pKd_value,label,buffer,origin,source
0,DNA,GGGAGAATTCCCGCGGCAGAAGCCCACCTGGCTTTGAACTCTATGT...,Nc1c(S(=O)(=O)O)cc(Nc2ccc(Nc3nc(Cl)nc(Nc4ccccc...,4.0,0,,,RSAPred
1,RNA,GGGAGAAUUCCCGCGGCGUUGGCCCAGGAUAAUAGGACGAAAUCCG...,Nc1c(S(=O)(=O)O)cc(Nc2ccc(S(=O)(=O)O)c(Nc3nc(C...,3.221849,0,,,RSAPred
2,RNA,GGGAAGGGAAGAAACUGCGGCUUCGGCCGGCUUCCC,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,5.39794,1,,,RSAPred
3,RNA,GGGAAGGGAAGAAACUGCGGCUUCGGCCGGCUUCCC,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,8.301026,1,,,RSAPred
4,RNA,GGCGUGUAGGAUAUGCUUCGGCAGAAGGACACGCC,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(CC(O)C(O)C(O)COP(...,4.638272,1,,,RSAPred


Available splits: ['disjoint_aptamer.json', 'disjoint_molecule.json', 'stratified.json']


## 2. Encoder configurations

In [6]:
# Aptamer-side encoders to screen. Every entry carries a display name and the
# encoder callable; the k-mer entries additionally carry keyword arguments
# (presumably forwarded to the encoder by the screening routine — confirm in
# src.models.screening).
apt_cfgs = [
    {"name": "OneHot", "func": onehot_with_type_bit},
    *(
        {"name": label, "func": kmer_freq_with_type_bit, "kwargs": {"k": size}}
        for label, size in (("Kmer3", 3), ("Kmer4", 4))
    ),
    {"name": "GENA", "func": gena_embed},
    {"name": "DNABERT2", "func": dnabert2_embed},
]

# Small-molecule encoders to screen. Every entry pairs a display name with its
# encoder callable; none of them is given extra keyword arguments here.
mol_cfgs = [
    {"name": label, "func": encoder}
    for label, encoder in (
        ("MorganFP", morgan_fp),
        ("MACCS", maccs_fp),
        ("PhysChem", physchem_descriptors),
        ("ChemBERTa", chemberta_embed),
        ("UniMol", unimol_embed),
        ("MolFormer", molformer_embed),
    )
]

## 3. Encoder screening with LightGBM + Optuna

In [7]:
# Run the LightGBM/Optuna screening over the aptamer and molecule encoder
# configurations on the stratified split. The resulting table has one row per
# (split, aptamer encoder, molecule encoder) with ROC-AUC / MCC mean and std
# and the best hyper-parameters.
# NOTE: this is a long-running cell; in the recorded output the Uni-Mol
# conformer generation alone took about 20 minutes.
results_df = screen_lgbm_optuna(
    df,
    apt_cfgs,
    mol_cfgs,
    split_modes=("stratified",),
    n_trials=20,  # presumably Optuna trials per encoder pair — confirm in src.models.screening
    metric="mcc",
    splits_dir=SPLITS_DIR,  # reuse the config constant instead of repeating the path literal
)


The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-09-25 05:56:56 | unimol_tools\models\unimol.py | 167 | INFO | Uni-Mol Tools | Loading pretrained weights from c:\Users\m19er\AptaBench\.venv\lib\site-packages\unimol_tools\weights\mol_pre_all_h_220816.pt
2025-09-25 05:57:00 | unimol_tools\data\conformer.py | 182 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 2001/2001 [19:48<00:00,  1.68it/s]  
2025-09-25 06:16:49 | unimol_tools\data\conformer.py | 197 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 98.40% of molecules.
2025-09-25 06:16:49 | unimol_tools\data\conformer.

In [9]:
# Show the screening results table (rendered as the cell output).
results_df

Unnamed: 0,split,aptamer_encoder,molecule_encoder,ROC-AUC mean,ROC-AUC std,MCC mean,MCC std,best_params
0,stratified,OneHot,MorganFP,0.899689,0.0078,0.562367,0.081698,"{'n_estimators': 517, 'learning_rate': 0.14086..."
1,stratified,OneHot,MACCS,0.886771,0.022856,0.546582,0.130153,"{'n_estimators': 203, 'learning_rate': 0.06728..."
2,stratified,OneHot,PhysChem,0.898136,0.027495,0.581691,0.051765,"{'n_estimators': 989, 'learning_rate': 0.08373..."
3,stratified,OneHot,ChemBERTa,0.904229,0.027178,0.500618,0.035603,"{'n_estimators': 280, 'learning_rate': 0.01226..."
4,stratified,OneHot,UniMol,0.88725,0.026944,0.520593,0.08171,"{'n_estimators': 391, 'learning_rate': 0.09670..."
5,stratified,OneHot,MolFormer,0.902276,0.026715,0.566682,0.029224,"{'n_estimators': 388, 'learning_rate': 0.16154..."
6,stratified,Kmer3,MorganFP,0.883951,0.01695,0.562293,0.101515,"{'n_estimators': 947, 'learning_rate': 0.13600..."
7,stratified,Kmer3,MACCS,0.882725,0.024857,0.524364,0.110421,"{'n_estimators': 698, 'learning_rate': 0.09269..."
8,stratified,Kmer3,PhysChem,0.913853,0.00983,0.603771,0.050112,"{'n_estimators': 339, 'learning_rate': 0.08436..."
9,stratified,Kmer3,ChemBERTa,0.911728,0.01941,0.564449,0.057261,"{'n_estimators': 797, 'learning_rate': 0.19459..."


In [None]:
# Reshape the long results table into a grid: one row per aptamer encoder,
# one column per molecule encoder, each cell holding the mean MCC.
# NOTE: pivot requires every (aptamer_encoder, molecule_encoder) pair to be
# unique; results covering several split modes would have to be filtered on
# the "split" column first.
pivot_df = pd.pivot(
    results_df,
    index="aptamer_encoder",
    columns="molecule_encoder",
    values="MCC mean",
)

In [13]:
# Display the aptamer-encoder × molecule-encoder grid of mean MCC values.
pivot_df

molecule_encoder,ChemBERTa,MACCS,MolFormer,MorganFP,PhysChem,UniMol
aptamer_encoder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DNABERT2,0.550144,0.421883,0.591592,0.507712,0.486427,0.517372
GENA,0.507226,0.464385,0.602932,0.504775,0.543626,0.506365
Kmer3,0.564449,0.524364,0.605067,0.562293,0.603771,0.561703
Kmer4,0.546185,0.550691,0.614702,0.576576,0.6144,0.562158
OneHot,0.500618,0.546582,0.566682,0.562367,0.581691,0.520593


In [15]:
import seaborn as sns