In [1]:
import logging
import pickle
import os
import numpy as np
from sklearn.neural_network import MLPRegressor
from MDRMF.models.modeller import Modeller
from MDRMF.dataset import Dataset

class MLPModeller(Modeller):
    """Active-learning modeller that uses scikit-learn's ``MLPRegressor``.

    The acquisition loop itself (initial sampling, acquisition, evaluation
    callbacks) is delegated to helpers accessed through ``self`` and
    presumably inherited from ``Modeller``; this class only decides which
    regressor is trained and how it is (re)fitted between iterations.
    """

    def __init__(
        self,
        dataset,
        evaluator=None,
        iterations=10,
        initial_sample_size=10,
        acquisition_size=10,
        acquisition_method="greedy",
        retrain=True,
        seeds=None,
        **kwargs) -> None:
        """Configure the modeller and create an untrained ``MLPRegressor``.

        Args:
            dataset: Pool of points to sample from; forwarded to ``Modeller``.
            evaluator: Optional evaluator; when not ``None`` it is invoked
                after the initial fit and after every iteration of ``fit``.
            iterations: Number of acquisition rounds performed by ``fit``.
            initial_sample_size: Size passed to the initial sampler when no
                seeds are supplied.
            acquisition_size: Forwarded to ``Modeller``.
            acquisition_method: Forwarded to ``Modeller``.
            retrain: If true, a brand-new ``MLPRegressor`` is built before
                every refit; otherwise the existing instance is refitted.
            seeds: Optional list/ndarray of integer indices used as the
                starting points. ``None`` (the default) or an empty sequence
                means "sample the starting points instead".
            **kwargs: Passed verbatim to ``MLPRegressor``.
        """
        # Use a fresh list per instance: a literal ``[]`` default would be a
        # single object shared (and potentially mutated) across all instances.
        if seeds is None:
            seeds = []

        super().__init__(
            dataset,
            evaluator,
            iterations,
            initial_sample_size,
            acquisition_size,
            acquisition_method,
            retrain,
            seeds
            )

        # Kept so that ``fit`` can rebuild an identical regressor on retrain.
        self.kwargs = kwargs
        self.model = MLPRegressor(**self.kwargs)

    def fit(self):
        """Run the acquisition loop and return the trained regressor.

        Returns:
            The fitted ``MLPRegressor``, or ``None`` (after logging an error)
            when ``self.seeds`` is neither empty nor a list/ndarray of
            integers.
        """
        if self.seeds is None or len(self.seeds) == 0:
            initial_pts = self._initial_sampler(initial_sample_size=self.initial_sample_size)
        elif isinstance(self.seeds, (list, np.ndarray)) and all(
                isinstance(i, (int, np.integer)) for i in self.seeds):
            # Elements of an integer ndarray are NumPy scalars, which are not
            # instances of ``int``; accept them and normalise to plain ints.
            self.seeds = [int(i) for i in self.seeds]
            initial_pts = self.dataset.get_points(self.seeds, remove_points=True)
        else:
            logging.error("Invalid seeds. Must be a list or ndarray of integers, or None.")
            return

        print(f"y values of starting points {initial_pts.y}")
        self.model.fit(initial_pts.X, initial_pts.y)

        # First evaluation, using only the initial points
        if self.evaluator is not None:
            # -1 because `call_evaluator` starts at 1, and this iteration should be 0.
            self.call_evaluator(i=-1, model_dataset=initial_pts)

        # Training set grows by the acquired points on every round; starting
        # from the initial points removes the need for a first-iteration case.
        model_dataset = initial_pts
        for i in range(self.iterations):
            # Acquire new points
            acquired_pts = self._acquisition(self.model)

            # Merge old and new points
            model_dataset = self.dataset.merge_datasets([model_dataset, acquired_pts])

            if self.retrain:
                # Reset model before training
                self.model = MLPRegressor(**self.kwargs)
            # NOTE(review): when ``retrain`` is false the existing estimator is
            # refitted, but scikit-learn's ``fit`` discards the previous
            # solution unless ``warm_start=True`` was supplied through kwargs,
            # so the two modes may behave the same - confirm the intent.
            self.model.fit(model_dataset.X, model_dataset.y)

            if self.evaluator is not None:
                self.call_evaluator(i=i, model_dataset=model_dataset)

        return self.model

    def predict(self, dataset: Dataset):
        """Predict targets for ``dataset.X`` with the current model.

        Returns:
            The regressor's predictions, or ``None`` (after logging an error)
            when ``dataset`` is not a ``Dataset`` instance.
        """
        if isinstance(dataset, Dataset):
            return self.model.predict(dataset.X)

        logging.error("Wrong object type. Must be of type `Dataset`")
        return None

    def save(self, filename: str):
        """Pickle this modeller (including its model) to ``filename``.

        Raises:
            ValueError: If ``filename`` is not a string.
            Exception: Any error raised while opening the file or pickling is
                logged and then re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        try:
            with open(filename, "wb") as f:
                pickle.dump(self, f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.PicklingError as e:
            logging.error(f"Failed to pickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

    @staticmethod
    def load(filename: str):
        """Unpickle and return the object stored in ``filename``.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the
        file. Only load files that come from a trusted source.

        Raises:
            ValueError: If ``filename`` is not a string.
            FileNotFoundError: If ``filename`` is not an existing file.
            Exception: Any error raised while reading or unpickling is logged
                and then re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        # Check if file exists.
        if not os.path.isfile(filename):
            raise FileNotFoundError(f"No such file or directory: '{filename}'")

        try:
            with open(filename, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            # Still possible if the file vanishes between the check and open.
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.UnpicklingError as e:
            logging.error(f"Failed to unpickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

In [2]:
from MDRMF import MoleculeLoader, Featurizer, Evaluator
from MDRMF.models import RFModeller

# Load '10K.csv' through MoleculeLoader and keep its `.df` attribute.
# The two names passed here ('SMILES', 'r_i_docking_score') are the same keys
# used to index `data` when the Dataset is built further down.
data = MoleculeLoader('10K.csv', 'SMILES', 'r_i_docking_score').df

In [3]:
# Build the feature matrix from `data` using the "morgan" featurizer
# (radius=2, nBits=1024); the Dataset repr below reports 1024 columns.
feat = Featurizer(data)
features = feat.featurize("morgan", radius=2, nBits=1024)
# Alternative featurization, currently disabled:
#features = feat.featurize("rdkit2D")

Computing features...
Progress: [------------------------------------------------->] 100% (9898/9898)
Feature computation completed.


In [4]:
# Assemble the Dataset: features as X, the docking-score column as the target
# and the SMILES column as per-row identifiers.
X = features
y = data['r_i_docking_score']
ids = data['SMILES']

dataset = Dataset(X=X, y=y, ids=ids)
# Bare expression so the notebook displays the Dataset repr (shapes and ids).
dataset

<Dataset X.shape: (9898, 1024), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [5]:
# Evaluator configuration shared by both model runs below.
metrics = ['top-k', 'top-k-acquired']
# NOTE(review): k is given as a string, not an int - confirm Evaluator expects this.
k_values = ['100']
# NOTE(review): the name `eval` shadows Python's builtin `eval` for the rest of
# the notebook; later cells reference it, so renaming must be done everywhere at once.
eval = Evaluator(dataset, metrics, k_values)

In [18]:
# MLP-based run: 10 acquisition iterations; `learning_rate_init` is not a
# named MLPModeller parameter, so it is collected in **kwargs and forwarded
# to MLPRegressor.
# NOTE(review): neither `seeds` nor a `random_state` is supplied, so the
# recorded numbers are presumably not reproducible from run to run.
NN_model = MLPModeller(dataset=dataset, evaluator=eval, iterations=10, learning_rate_init=0.01)
NN_model.fit()

y values of starting points [-8.41291 -7.60275 -8.92054 -9.82129 -7.24061 -8.53379 -7.17287 -7.40087
 -6.13147 -6.82179]
Iteration 0, Results: {'top-100 model': 0.05, 'top-100 acquired': 0.0}
Iteration 1, Results: {'top-100 model': 0.08, 'top-100 acquired': 0.01}
Iteration 2, Results: {'top-100 model': 0.12, 'top-100 acquired': 0.04}
Iteration 3, Results: {'top-100 model': 0.12, 'top-100 acquired': 0.04}
Iteration 4, Results: {'top-100 model': 0.15, 'top-100 acquired': 0.05}
Iteration 5, Results: {'top-100 model': 0.16, 'top-100 acquired': 0.09}
Iteration 6, Results: {'top-100 model': 0.21, 'top-100 acquired': 0.12}
Iteration 7, Results: {'top-100 model': 0.19, 'top-100 acquired': 0.16}
Iteration 8, Results: {'top-100 model': 0.22, 'top-100 acquired': 0.17}
Iteration 9, Results: {'top-100 model': 0.22, 'top-100 acquired': 0.2}
Iteration 10, Results: {'top-100 model': 0.22, 'top-100 acquired': 0.21}


In [8]:
# Random-forest run with the same `dataset` object and evaluator as the MLP run.
# NOTE(review): the execution counts (8 here, 18 for the MLP cell) show the
# cells were not run in the displayed order, and both runs share one `dataset`
# instance; if a modeller removes points from it in place, the two runs do not
# start from the same pool - confirm before comparing the results.
RF_model = RFModeller(dataset=dataset, evaluator=eval, iterations=10)
RF_model.fit()

y values of starting points [ -9.75027  -5.71101  -6.00043  -7.78547  -6.83096  -7.56194  -9.23852
 -10.6733   -9.65127  -7.45861]
Iteration 0, Results: {'top-100 model': 0.08, 'top-100 acquired': 0.0}
Iteration 1, Results: {'top-100 model': 0.07, 'top-100 acquired': 0.0}
Iteration 2, Results: {'top-100 model': 0.12, 'top-100 acquired': 0.03}
Iteration 3, Results: {'top-100 model': 0.07, 'top-100 acquired': 0.05}
Iteration 4, Results: {'top-100 model': 0.13, 'top-100 acquired': 0.05}
Iteration 5, Results: {'top-100 model': 0.13, 'top-100 acquired': 0.06}
Iteration 6, Results: {'top-100 model': 0.17, 'top-100 acquired': 0.07}
Iteration 7, Results: {'top-100 model': 0.15, 'top-100 acquired': 0.07}
Iteration 8, Results: {'top-100 model': 0.13, 'top-100 acquired': 0.09}
Iteration 9, Results: {'top-100 model': 0.15, 'top-100 acquired': 0.09}
Iteration 10, Results: {'top-100 model': 0.12, 'top-100 acquired': 0.1}
