In [1]:
import MDRMF as mf
from MDRMF import Model
from MDRMF.models import RFModeller
from matplotlib import pyplot as plt

In [2]:
# Read the molecule table from 10K.csv: SMILES strings come from the "SMILES"
# column and scores from "r_i_docking_score". `.df` is the loaded table
# (presumably a pandas DataFrame -- it is indexed by column name further down).
data = mf.MoleculeLoader(
    datafile="10K.csv",
    smi_col="SMILES",
    scores_col="r_i_docking_score",
).df

# Featurize every molecule as a Morgan fingerprint (radius 2, 256 bits); the
# Dataset repr shown later reports 256 feature columns.
feat = mf.Featurizer(data)
features = feat.featurize(
    "morgan",
    radius=2,
    nBits=256,
)

In [3]:
# Modelling inputs: the fingerprint features, the docking-score column as the
# target, and the SMILES column as per-molecule identifiers.
X, y, ids = features, data['r_i_docking_score'], data['SMILES']

# Bundle the three into a single MDRMF Dataset; the evaluator and the modeller
# are both built from this object.
dataset_model = mf.Dataset(X=X, y=y, ids=ids)

In [4]:
# Sanity check: show the Dataset repr (shapes of X, y and w, plus a truncated
# preview of the SMILES ids).
dataset_model

<Dataset X.shape: (9898, 256), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [5]:
# Evaluation setup: the 'top-k' metric with k given as the string '100'; the
# training log reports it under the key 'top-100'.
metrics, k_values = ['top-k'], ['100']

# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the
# session. The RFModeller cell reads this name (evaluator=eval), so any rename
# has to change both places together.
eval = mf.Evaluator(
    dataset_model,
    metrics,
    k_values,
)

In [6]:
# Configure the random-forest modeller on the Dataset built earlier in the
# notebook. `dataset_model` is reused as-is instead of being rebuilt from
# X / y / ids: the rebuild produced an identical object, and the display after
# training shows the Dataset handed in here still has all 9898 rows, so no
# fresh copy is needed before each run.
#
# Labelling budget implied by the settings below:
#   initial_sample_size + iterations * acquisition_size = 30 + 60 * 30 = 1830
# molecules, which matches the 9898 -> 8068 row difference seen on the
# modeller's own dataset after training.
rf_model = RFModeller(
    dataset=dataset_model,
    evaluator=eval,  # the mf.Evaluator instance created above (not the built-in eval)
    iterations=60,
    initial_sample_size=30,
    acquisition_size=30,
    acquisition_method="greedy",
    n_estimators=50
)

In [7]:
# Wrap the configured RFModeller in the generic Model front end and train it.
# The run logs one line per iteration with the current 'top-100' result.
model = Model(model=rf_model)
model.train()

Iteration 1, Results: {'top-100': 0.06}
Iteration 2, Results: {'top-100': 0.06}
Iteration 3, Results: {'top-100': 0.07}
Iteration 4, Results: {'top-100': 0.1}
Iteration 5, Results: {'top-100': 0.1}
Iteration 6, Results: {'top-100': 0.12}
Iteration 7, Results: {'top-100': 0.13}
Iteration 8, Results: {'top-100': 0.13}
Iteration 9, Results: {'top-100': 0.14}
Iteration 10, Results: {'top-100': 0.14}
Iteration 11, Results: {'top-100': 0.14}
Iteration 12, Results: {'top-100': 0.16}
Iteration 13, Results: {'top-100': 0.16}
Iteration 14, Results: {'top-100': 0.16}
Iteration 15, Results: {'top-100': 0.16}
Iteration 16, Results: {'top-100': 0.18}
Iteration 17, Results: {'top-100': 0.2}
Iteration 18, Results: {'top-100': 0.21}
Iteration 19, Results: {'top-100': 0.21}
Iteration 20, Results: {'top-100': 0.23}
Iteration 21, Results: {'top-100': 0.23}
Iteration 22, Results: {'top-100': 0.25}
Iteration 23, Results: {'top-100': 0.26}
Iteration 24, Results: {'top-100': 0.29}
Iteration 25, Results: {'top

In [8]:
# Re-display the Dataset that was handed to RFModeller: after training it still
# reports all 9898 rows, i.e. training did not modify this object.
dataset_model

<Dataset X.shape: (9898, 256), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [9]:
# The Dataset held by the trained modeller itself. It reports 8068 rows, 1830
# fewer than `dataset_model`, which equals 30 + 60 * 30 (initial sample plus
# 60 acquisitions of 30) -- presumably the acquired molecules were removed from
# the modeller's internal copy; confirm against the MDRMF source.
model.model.dataset

<Dataset X.shape: (8068, 256), y.shape: (8068,), w.shape: (8068,), ids: ['O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'CS(=O)(=O)c1ccc(F)c(C(=O)Nc2ccc(-c3nc(-c4ccccc4)n[nH]3)cc2)c1'
 'NC(=O)[C@H]1CCCN(c2ccc(C(=O)N3CCC(c4cc5ccccc5[nH]4)CC3)cc2)C1' ...
 'Cc1cnn(C)c1S(=O)(=O)NC[C@@H](Cc1ccccn1)C(=O)[O-]'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>