# Benchmarking a Classical Machine Learning Model on AptaBench
This notebook benchmarks a **traditional ML model** (LightGBM) on the AptaBench dataset, focusing on feature engineering, cross-validation, and baseline predictive performance.

## 1. Imports & Config

In [1]:
# Show the installed PyTorch version so the environment used for this run is recorded.
import torch
torch.__version__

'2.6.0+cu124'

In [2]:
# Make the repository's "src" package importable from this notebook.
import sys
from pathlib import Path

# Visit the working directory and then each of its ancestors, stopping at the
# first one that contains "src". If none matches, the loop runs to completion
# and `root` is left pointing at the filesystem root.
for root in (Path.cwd(), *Path.cwd().parents):
    if (root / "src").exists():
        break

sys.path.insert(0, str(root))

In [3]:
# === Imports ===
import os
import pandas as pd
from src.models.screening import screen_lgbm_optuna, load_splits
# --- Aptamer encoders ---
from src.encoders.aptamer_encoders import (
    onehot_with_type_bit,
    kmer_freq_with_type_bit,
    gena_embed,
    dnabert2_embed
)

# --- Molecule encoders ---
from src.encoders.molecule_encoders import (
    morgan_fp,
    maccs_fp,
    physchem_descriptors,
    chemberta_embed,
    unimol_embed,
    molformer_embed
)

# Paths
# Both locations are relative, so they resolve against the notebook's current
# working directory rather than against the notebook file or the project root.
DATASET_PATH = "../dataset/AptaBench_dataset_v2.csv" 
SPLITS_DIR = "../dataset/splits"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the benchmark table and preview its first rows
df = pd.read_csv(DATASET_PATH)
print("Dataset loaded:", df.shape)
display(df.head())

# Check available splits.
# os.listdir returns entries in arbitrary, platform-dependent order, so the
# names are sorted to keep this cell's output reproducible across machines.
print("Available splits:", sorted(os.listdir(SPLITS_DIR)))

Dataset loaded: (6413, 8)


Unnamed: 0,type,sequence,canonical_smiles,pKd_value,label,buffer,origin,source
0,DNA,GGGAGAATTCCCGCGGCAGAAGCCCACCTGGCTTTGAACTCTATGT...,Nc1c(S(=O)(=O)O)cc(Nc2ccc(Nc3nc(Cl)nc(Nc4ccccc...,4.0,0,,,RSAPred
1,RNA,GGGAGAAUUCCCGCGGCGUUGGCCCAGGAUAAUAGGACGAAAUCCG...,Nc1c(S(=O)(=O)O)cc(Nc2ccc(S(=O)(=O)O)c(Nc3nc(C...,3.221849,0,,,RSAPred
2,RNA,GGGAAGGGAAGAAACUGCGGCUUCGGCCGGCUUCCC,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)OP(=O)(O)OP(=O)(O...,5.39794,1,,,RSAPred
3,RNA,GGGAAGGGAAGAAACUGCGGCUUCGGCCGGCUUCCC,Nc1ncnc2c1ncn2C1OC(COP(=O)(O)O)C(O)C1O,8.301026,1,,,RSAPred
4,RNA,GGCGUGUAGGAUAUGCUUCGGCAGAAGGACACGCC,Cc1cc2nc3c(=O)[nH]c(=O)nc-3n(CC(O)C(O)C(O)COP(...,4.638272,1,,,RSAPred


Available splits: ['disjoint_aptamer.json', 'disjoint_molecule.json', 'stratified.json']


## 2. Encoder configurations

In [6]:
# Aptamer (sequence) encoder configurations to screen. Each entry has a display
# name and the encoder callable; "kwargs", when present, holds extra keyword
# arguments (presumably forwarded to the callable — confirm in the screening code).
apt_cfgs = [
    dict(name="OneHot", func=onehot_with_type_bit),
    dict(name="Kmer3", func=kmer_freq_with_type_bit, kwargs=dict(k=3)),
    dict(name="Kmer4", func=kmer_freq_with_type_bit, kwargs=dict(k=4)),
    dict(name="GENA", func=gena_embed),
    dict(name="DNABERT2", func=dnabert2_embed),
]

# Small-molecule encoder configurations to screen, in the same entry format.
mol_cfgs = [
    dict(name="MorganFP", func=morgan_fp),
    dict(name="MACCS", func=maccs_fp),
    dict(name="PhysChem", func=physchem_descriptors),
    dict(name="ChemBERTa", func=chemberta_embed),
    dict(name="UniMol", func=unimol_embed),
    dict(name="MolFormer", func=molformer_embed),
]

## 3. Encoder screening with LightGBM + Optuna

In [8]:
# Screen the aptamer and molecule encoder configurations with the LightGBM +
# Optuna routine, on each of the three predefined split modes.
# NOTE: long-running — in the recorded run, Uni-Mol conformer generation alone
# took about 21 minutes for the 6413 rows.
results_df = screen_lgbm_optuna(
    df,
    apt_cfgs,
    mol_cfgs,
    split_modes=("stratified", "disjoint_molecule", "disjoint_aptamer"),
    n_trials=20,  # presumably the Optuna trial budget per configuration — confirm
    metric="mcc",
    # Reuse the config-cell constant instead of repeating the literal path, so
    # the split listing above and this call always point at the same directory.
    splits_dir=SPLITS_DIR,
)


Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-25 17:52:16 | unimol_tools\models\unimol.py | 167 | INFO | Uni-Mol Tools | Loading pretrained weights from c:\Users\m19er\AptaBench\.venv\lib\site-packages\unimol_tools\weights\mol_pre_all_h_220816.pt
2025-11-25 17:52:21 | unimol_tools\data\conformer.py | 182 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 6413/6413 [21:07<00:00,  5.06it/s]   
2025-11-25 18:13:29 | unimol_tools\data\conformer.py | 197 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 99.50% of molecules.
2025-11-25 18:13:29 | unimol_tools\data\conformer.py | 206 | INFO | Uni-Mol Tools | Failed conformers indices: [1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 14

In [10]:
# Persist the screening results next to the notebook (path is relative to the
# current working directory). With pandas' default index=True the DataFrame
# index is written as an unnamed first column.
results_df.to_csv('lgbm_results_screening.csv')

In [None]:
# Reshape the results into an aptamer-encoder × molecule-encoder grid of the
# "MCC mean" values.
# NOTE(review): DataFrame.pivot raises ValueError when an (index, columns) pair
# occurs more than once. If results_df holds one row per split mode for each
# encoder pair, it must be filtered to a single split (or aggregated) before
# this call — confirm against the schema returned by screen_lgbm_optuna.
pivot_df = results_df.pivot(
    index="aptamer_encoder", columns="molecule_encoder", values="MCC mean"
)

In [13]:
# Display the grid: rows are aptamer encoders, columns are molecule encoders,
# cells are the "MCC mean" values.
pivot_df

molecule_encoder,ChemBERTa,MACCS,MolFormer,MorganFP,PhysChem,UniMol
aptamer_encoder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DNABERT2,0.550144,0.421883,0.591592,0.507712,0.486427,0.517372
GENA,0.507226,0.464385,0.602932,0.504775,0.543626,0.506365
Kmer3,0.564449,0.524364,0.605067,0.562293,0.603771,0.561703
Kmer4,0.546185,0.550691,0.614702,0.576576,0.6144,0.562158
OneHot,0.500618,0.546582,0.566682,0.562367,0.581691,0.520593


In [15]:
import seaborn as sns