In [1]:
import MDRMF as mf
import numpy as np

In [2]:
# Load "10K.csv" through MDRMF's MoleculeLoader and keep the loader's `.df` attribute.
# NOTE(review): the two string arguments look like the SMILES column and the
# docking-score column names — confirm against MoleculeLoader's signature.
data = mf.MoleculeLoader("10K.csv", "SMILES", "r_i_docking_score").df

In [3]:
# Build a featurizer over the loaded molecule table and compute "morgan" features
# with radius=2 and nBits=1024 (the Dataset repr further down shows 1024 columns).
feat = mf.Featurizer(data)
features = feat.featurize("morgan", radius=2, nBits=1024)

In [4]:
# Assemble the inputs for the MDRMF Dataset: the featurizer output as X, the
# 'scores' column as the target y, and the SMILES strings as per-row identifiers.
X = features
y = data['scores']
ids = data['SMILES']

# No weights are passed here; the repr below nevertheless reports a w.shape.
dataset = mf.Dataset(X=X, y=y, ids=ids)
# Bare last expression so the notebook displays the Dataset repr.
dataset

<Dataset X.shape: (9898, 1024), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [6]:
# Check which container type Dataset uses for y (the output below shows numpy.ndarray);
# the Modeller code relies on integer-array and boolean-mask indexing of this attribute.
type(dataset.y)

numpy.ndarray

In [7]:
from MDRMF.dataset import Dataset

class Modeller:
    """Base class for an iterative sample -> fit -> acquire loop over a dataset.

    Holds the candidate pool (``dataset``) together with the loop settings.
    Subclasses provide the actual model by overriding ``fit`` and ``predict``.

    Note:
        ``_initial_sampler`` and ``_acquisition`` remove the chosen points from
        ``self.dataset`` in place. ``self.dataset`` is the very object passed to the
        constructor, so the caller's dataset shrinks as points are drawn.
    """

    def __init__(self, dataset, iterations=10, initial_sample_size=10, acquisition_size=10, acquisition_method="greedy") -> None:
        """Store the candidate pool and the loop settings.

        Args:
            dataset: Pool of candidates; must expose ``X``, ``y``, ``ids`` and ``w``
                attributes plus ``get_points`` / ``remove_points`` methods.
            iterations: Number of acquisition rounds a subclass should run.
            initial_sample_size: Number of points drawn at random to start with.
            acquisition_size: Number of points acquired per round.
            acquisition_method: Acquisition strategy; only ``"greedy"`` is implemented.
        """
        self.dataset = dataset
        self.iterations = iterations
        self.initial_sample_size = initial_sample_size
        self.acquisition_size = acquisition_size
        self.acquisition_method = acquisition_method

    def _initial_sampler(self):
        """Draw ``initial_sample_size`` random points and remove them from the pool.

        Uses NumPy's global random state (``np.random.choice``), so seed it beforehand
        for reproducible draws.

        Returns:
            A new ``Dataset`` holding the sampled rows of ``X``, ``y``, ``ids`` and ``w``.
        """
        # Sample row positions without replacement so no point is picked twice.
        random_indices = np.random.choice(len(self.dataset.X), size=self.initial_sample_size, replace=False)

        # Copy the selected rows out into their own dataset.
        X_samples = self.dataset.X[random_indices]
        y_samples = self.dataset.y[random_indices]
        ids_samples = self.dataset.ids[random_indices]
        w_samples = self.dataset.w[random_indices]

        random_points = Dataset(X=X_samples, y=y_samples, ids=ids_samples, w=w_samples)

        # Drop the selected rows from the pool with a boolean keep-mask.
        mask = np.ones(len(self.dataset.X), dtype=bool)
        mask[random_indices] = False
        self.dataset.X = self.dataset.X[mask]
        self.dataset.y = self.dataset.y[mask]
        self.dataset.ids = self.dataset.ids[mask]
        self.dataset.w = self.dataset.w[mask]

        return random_points

    def _acquisition(self, model):
        """Pick the next batch of points from the pool and remove them from it.

        With the ``"greedy"`` method, the ``acquisition_size`` points with the smallest
        predicted values are taken. If the pool holds no more than ``acquisition_size``
        points, the whole remaining pool is taken.

        Args:
            model: Fitted estimator exposing ``predict(X)``.

        Returns:
            Whatever ``self.dataset.get_points`` returns for the selected indices.

        Raises:
            ValueError: If ``acquisition_method`` is not a supported strategy.
        """
        if self.acquisition_method != "greedy":
            # Fail loudly instead of silently returning None to the caller.
            raise ValueError(
                f"Unknown acquisition_method {self.acquisition_method!r}; only 'greedy' is supported"
            )

        # Predict on everything still left in the pool.
        preds = model.predict(self.dataset.X)

        pool_size = len(preds)
        if self.acquisition_size >= pool_size:
            # np.argpartition requires kth < len(preds); when the request covers the
            # whole pool there is nothing to partition, so take every remaining point.
            indices = np.arange(pool_size)
        else:
            # Indices of the acquisition_size smallest predictions (order unspecified).
            indices = np.argpartition(preds, self.acquisition_size)[:self.acquisition_size]

        # Extract the chosen points first, then drop them from the pool.
        acq_dataset = self.dataset.get_points(indices)
        self.dataset.remove_points(indices)

        return acq_dataset

    def fit(self, *args, **kwargs):
        """Train the model; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("fit() must be defined in child classes")

    def predict(self, *args, **kwargs):
        """Predict with the trained model; must be overridden by subclasses.

        Raises:
            NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError("predict() must be defined in child classes")

In [8]:
# model = Modeller(dataset)
# sample_dataset = model._initial_sampler()
# sample_dataset

In [9]:
# model.dataset

In [10]:
from sklearn.ensemble import RandomForestRegressor

class RFModeller(Modeller):
    """``Modeller`` backed by scikit-learn's ``RandomForestRegressor``.

    Extra keyword arguments given to the constructor are kept in ``self.kwargs`` and
    forwarded to every ``RandomForestRegressor`` this class instantiates. Only ``fit``
    is overridden here; ``predict`` is inherited unchanged from ``Modeller``.
    """

    def __init__(self, dataset, iterations=10, initial_sample_size=10, acquisition_size=10, acquisition_method="greedy", **kwargs) -> None:
        """Set up the loop settings via ``Modeller`` and create the first forest.

        Args:
            dataset: Candidate pool handed to ``Modeller``.
            iterations: Number of acquisition rounds run by ``fit``.
            initial_sample_size: Size of the random starting sample.
            acquisition_size: Number of points acquired per round.
            acquisition_method: Acquisition strategy name passed to ``Modeller``.
            **kwargs: Forwarded verbatim to ``RandomForestRegressor``.
        """
        super().__init__(
            dataset,
            iterations,
            initial_sample_size,
            acquisition_size,
            acquisition_method,
        )
        self.kwargs = kwargs
        self.model = RandomForestRegressor(**self.kwargs)

    def fit(self):
        """Run the sample -> fit -> acquire -> refit loop.

        Starts from a random sample, then for each of ``self.iterations`` rounds
        acquires new points with the current model, adds them to the training set,
        and trains a fresh forest on the enlarged set.

        Returns:
            The ``RandomForestRegressor`` trained in the last round (or on the initial
            sample alone when ``iterations`` is 0).
        """
        # The training set starts out as the random initial sample.
        training_set = self._initial_sampler()
        self.model.fit(training_set.X, training_set.y)

        for _ in range(self.iterations):
            # Let the current model choose the next batch from the remaining pool.
            newly_acquired = self._acquisition(self.model)

            # Grow the training set with the new batch.
            training_set = self.dataset.merge_datasets([training_set, newly_acquired])

            # Train a brand-new forest on everything gathered so far.
            self.model = RandomForestRegressor(**self.kwargs)
            self.model.fit(training_set.X, training_set.y)

        return self.model
        

In [11]:
# Seed both sources of randomness so Restart & Run All reproduces the same run:
# NumPy's global RNG drives the initial random sample inside the modeller
# (np.random.choice), and random_state is forwarded through **kwargs to every
# RandomForestRegressor the modeller creates.
SEED = 42
np.random.seed(SEED)

# NOTE: fit() removes the sampled and acquired points from `dataset` in place, so
# re-running this cell without re-creating `dataset` continues from a smaller pool.
RF_mod = RFModeller(dataset, random_state=SEED)
RF_mod.fit()

In [12]:
# Inspect the pool left inside the modeller after fitting: the initial sample and
# every acquired batch have been removed from it (see the reduced shapes below).
RF_mod.dataset

<Dataset X.shape: (9788, 1024), y.shape: (9788,), w.shape: (9788,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [13]:
# m_model = RandomForestRegressor()

# m_model.fit(sample_dataset.X, sample_dataset.y)

In [14]:
# m_preds= m_model.predict(sample_dataset.X)