In [1]:
import MDRMF as mf
import numpy as np

In [2]:
data = mf.MoleculeLoader("10K.csv", "SMILES", "r_i_docking_score").df

In [3]:
feat = mf.Featurizer(data)
features = feat.featurize("morgan", radius=2, nBits=1024)

In [4]:
X = features
y = data['scores']
ids = data['SMILES']

dataset = mf.Dataset(X=X, y=y, ids=ids)
dataset

<Dataset X.shape: (9898, 1024), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [5]:
dataset.ids.shape

(9898,)

In [6]:
from MDRMF.dataset import Dataset

class Modeller:

    def __init__(self, dataset, iterations=10, initial_sample_size=10, acquisition_size=10, acquisition_method="greedy") -> None:
        
        self.dataset = dataset
        self.iterations = iterations
        self.initial_sample_size = initial_sample_size
        self.acquisition_size = acquisition_size
        self.acquisition_method = acquisition_method

    def initial_sampler(self):
        # Select random points in the dataset
        random_indices = np.random.choice(len(self.dataset.X), size=self.initial_sample_size, replace=False)

        # Select random points
        X_samples = self.dataset.X[random_indices]
        y_samples = self.dataset.y[random_indices]
        ids_samples = self.dataset.ids[random_indices]
        w_samples = self.dataset.w[random_indices]

        random_points = Dataset(X=X_samples, y=y_samples, ids=ids_samples, w=w_samples)

        # Delete selected points from dataset
        mask = np.ones(len(self.dataset.X), dtype=bool)
        mask[random_indices] = False
        self.dataset.X = self.dataset.X[mask]
        self.dataset.y = self.dataset.y[mask]
        self.dataset.ids = self.dataset.ids[mask]
        self.dataset.w = self.dataset.w[mask]

        return random_points

    def acquisition():
        pass
    
    def fit():
        pass

    def predict():
        pass

In [11]:
model = Modeller(dataset)
sample_dataset = model.initial_sampler()
sample_dataset

<Dataset X.shape: (10, 1024), y.shape: (10,), w.shape: (10,), ids: ['Cc1noc2ncnc(N[C@H](CO)c3ccccc3OC(F)(F)F)c12'
 'COCC1CCN(C(=O)c2ccc3ccc(OC)cc3c2)CC1'
 'CC1(C)OCC[C@@H]1NC(=O)NCc1ccc(C(F)F)cc1'
 'Cc1ccc([C@@H]2COCCN2C(=O)c2c[nH]c(=O)c3ccccc23)o1'
 'COc1cccc(CCC(=O)N2CCNC(=O)C[C@@H]2c2ccccc2)c1OC'
 'O=C(NCCc1cn2ccccc2n1)N[C@]12C[C@H]3C[C@H](C[C@H](C3)C1)C2'
 'O=C(CCC1CCCCCC1)N1CCC(c2cnco2)CC1'
 'O=C(Nc1[nH]ncc1-c1ccccc1)[C@H]1C[C@@H]2C=C[C@H]1CC2'
 'COc1c(C)cnc(Cn2nnc(C(C)(C)S(C)(=O)=O)n2)c1C'
 'COc1ccc(-c2nnc3n2CCCCCC3)c(OC)c1']>

In [12]:
model.dataset

<Dataset X.shape: (9868, 1024), y.shape: (9868,), w.shape: (9868,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [20]:
model = Modeller(dataset)
x, y, ids, w = model.initial_sampler()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [7]:
class RFModeller(Modeller):

    pass



In [8]:
model = RFModeller(dataset=dataset)

model.fit()

TypeError: fit() takes 0 positional arguments but 1 was given