In [16]:
import logging
import pickle
import os
import numpy as np
from sklearn.svm import SVR
from MDRMF.models.modeller import Modeller
from MDRMF.dataset import Dataset

In [17]:
class SVRModeller(Modeller):
    """
    Modeller that drives an iterative acquire-and-refit loop around
    scikit-learn's ``SVR`` regressor.

    The sampling, acquisition and evaluation helpers (``_initial_sampler``,
    ``_acquisition``, ``call_evaluator``) are inherited from ``Modeller``.
    """

    def __init__(
        self,
        dataset,
        evaluator=None,
        iterations=10,
        initial_sample_size=10,
        acquisition_size=10,
        acquisition_method="greedy",
        retrain=True,
        seeds=None,
        kernel='rbf',
        C=1.0,
        epsilon=0.1,
        **kwargs) -> None:
        """
        Parameters:
            dataset: Dataset the modeller draws points from.
            evaluator: Optional evaluator invoked after every fit.
            iterations: Number of acquisition rounds run by ``fit``.
            initial_sample_size: Number of starting points when no seeds are given.
            acquisition_size: Forwarded to ``Modeller``.
            acquisition_method: Forwarded to ``Modeller``.
            retrain: If True, a fresh ``SVR`` is built before each refit.
            seeds: Optional list/ndarray of integer indices used as starting
                points. ``None`` (the default) or an empty sequence means
                "sample the starting points instead".
            kernel, C, epsilon: Passed to ``SVR``.
            **kwargs: Any further ``SVR`` keyword arguments.
        """
        # Build a new list per instance: a literal ``[]`` default would be a
        # single object shared by every SVRModeller created without seeds.
        if seeds is None:
            seeds = []

        super().__init__(
            dataset,
            evaluator,
            iterations,
            initial_sample_size,
            acquisition_size,
            acquisition_method,
            retrain,
            seeds
            )

        self.model = SVR(kernel=kernel, C=C, epsilon=epsilon, **kwargs)

    def fit(self):
        """
        Fit on the starting points, then repeatedly acquire new points,
        merge them into the training set and refit.

        Returns:
            The fitted ``SVR`` model, or ``None`` if ``self.seeds`` is
            invalid (an error is logged in that case).
        """
        if self.seeds is None or len(self.seeds) == 0:
            initial_pts = self._initial_sampler(initial_sample_size=self.initial_sample_size)
        elif isinstance(self.seeds, (list, np.ndarray)) and all(
                isinstance(seed, (int, np.integer)) for seed in self.seeds):
            # NumPy integer scalars are not instances of ``int``, so they are
            # accepted explicitly and normalised to plain Python ints.
            self.seeds = [int(seed) for seed in self.seeds]
            initial_pts = self.dataset.get_points(self.seeds, remove_points=True)
        else:
            logging.error("Invalid seeds. Must be a list or ndarray of integers, or None.")
            return None

        print(f"y values of starting points {initial_pts.y}")
        self.model.fit(initial_pts.X, initial_pts.y)

        # First evaluation, using only the initial points.
        # i=-1 so that this round is reported as iteration 0 (``call_evaluator``
        # is documented here as starting its count at 1).
        if self.evaluator is not None:
            self.call_evaluator(i=-1, model_dataset=initial_pts)

        # Training set grows by one acquisition batch per round.
        model_dataset = initial_pts
        for i in range(self.iterations):
            # Acquire new points using the current model
            acquired_pts = self._acquisition(self.model)

            # Merge old and new points
            model_dataset = self.dataset.merge_datasets([model_dataset, acquired_pts])

            if self.retrain:
                # Start from an unfitted estimator that keeps *all* configured
                # hyperparameters, including those supplied through **kwargs.
                self.model = SVR(**self.model.get_params())
            # NOTE(review): SVR has no warm-start option, so fitting the
            # existing estimator also refits from scratch - confirm whether
            # ``retrain=False`` is meant to behave differently.
            self.model.fit(model_dataset.X, model_dataset.y)

            if self.evaluator is not None:
                self.call_evaluator(i=i, model_dataset=model_dataset)

        return self.model

    def predict(self, dataset: Dataset):
        """
        Predict targets for the feature matrix ``dataset.X``.

        Returns the model's predictions, or ``None`` (after logging an error)
        when ``dataset`` is not a ``Dataset``.
        """
        if isinstance(dataset, Dataset):
            return self.model.predict(dataset.X)

        logging.error("Wrong object type. Must be of type `Dataset`")
        return None

    def save(self, filename: str):
        """
        Save the SVRModeller to a pickle file.

        Raises:
            ValueError: If ``filename`` is not a string.
            Any exception raised while opening or pickling is logged and
            re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        try:
            with open(filename, "wb") as f:
                pickle.dump(self, f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.PicklingError as e:
            logging.error(f"Failed to pickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

    @staticmethod
    def load(filename: str):
        """
        Load a previously pickled object from ``filename`` and return it.

        WARNING: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.

        Raises:
            ValueError: If ``filename`` is not a string.
            FileNotFoundError: If ``filename`` is not an existing file.
            Any exception raised while reading or unpickling is logged and
            re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        # Check if file exists.
        if not os.path.isfile(filename):
            raise FileNotFoundError(f"No such file or directory: '{filename}'")

        try:
            with open(filename, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.UnpicklingError as e:
            logging.error(f"Failed to unpickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

In [18]:
# Project imports used by the cells below.
# NOTE(review): these sit mid-notebook; consider moving them up to the first import cell.
from MDRMF import MoleculeLoader, Featurizer, Evaluator
from MDRMF.models import RFModeller

# Load '10K.csv' through MoleculeLoader and keep its `.df` attribute.
# Presumably 'SMILES' is the structure column and 'r_i_docking_score' the target - TODO confirm.
data = MoleculeLoader('10K.csv', 'SMILES', 'r_i_docking_score').df

In [19]:
# Featurize the loaded molecules.
# "morgan" with radius=2, nBits=1024 presumably selects 1024-bit Morgan fingerprints - TODO confirm against Featurizer.
feat = Featurizer(data)
features = feat.featurize("morgan", radius=2, nBits=1024)

Computing features...
Progress: [------------------------------------------------->] 100% (9898/9898)
Feature computation completed.


In [20]:
# Assemble the modelling inputs: features, docking scores as the target,
# and the SMILES strings as identifiers.
X = features
y = data['r_i_docking_score']
ids = data['SMILES']

# Bundle everything into a Dataset; the bare name on the last line displays its repr.
dataset = Dataset(X=X, y=y, ids=ids)
dataset

<Dataset X.shape: (9898, 1024), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [21]:
# Evaluation setup handed to Evaluator: the 'top-k' metric with k given as '100'.
# NOTE(review): k is a string here - confirm Evaluator expects strings rather than ints.
metrics = ['top-k']
k_values = ['100']
# NOTE(review): `eval` shadows the Python builtin of the same name for the rest of the
# session; renaming it requires updating the later cells that pass `evaluator=eval`.
eval = Evaluator(dataset, metrics, k_values)

In [22]:
# Run the SVR-backed modeller for 20 acquisition iterations, evaluating with `eval`.
# No seeds are passed, so the starting points come from the modeller's initial sampler.
# NOTE(review): the variable is named LR_model although it holds an SVRModeller.
LR_model = SVRModeller(dataset=dataset, evaluator=eval, iterations=20)
LR_model.fit()

y values of starting points [-9.68028 -5.89022 -9.37531 -7.89715 -8.81536 -8.99042 -9.64678 -9.16665
 -8.86113 -7.64715]
Iteration 0, Results: {'top-100 model': 0.05}
Iteration 1, Results: {'top-100 model': 0.06}
Iteration 2, Results: {'top-100 model': 0.07}
Iteration 3, Results: {'top-100 model': 0.06}
Iteration 4, Results: {'top-100 model': 0.05}
Iteration 5, Results: {'top-100 model': 0.06}
Iteration 6, Results: {'top-100 model': 0.07}
Iteration 7, Results: {'top-100 model': 0.07}
Iteration 8, Results: {'top-100 model': 0.08}
Iteration 9, Results: {'top-100 model': 0.1}
Iteration 10, Results: {'top-100 model': 0.09}
Iteration 11, Results: {'top-100 model': 0.11}
Iteration 12, Results: {'top-100 model': 0.14}
Iteration 13, Results: {'top-100 model': 0.17}
Iteration 14, Results: {'top-100 model': 0.18}
Iteration 15, Results: {'top-100 model': 0.22}
Iteration 16, Results: {'top-100 model': 0.2}
Iteration 17, Results: {'top-100 model': 0.2}
Iteration 18, Results: {'top-100 model': 0.23}

In [23]:
# Same dataset and evaluator, this time with RFModeller.
# NOTE(review): this run uses 30 iterations versus 20 for the SVR run, so the two
# result series are not directly comparable iteration-for-iteration.
# NOTE(review): both runs share the same `dataset` object; if fitting removes
# acquired points from it, this run starts from a smaller pool - verify.
RF_model = RFModeller(dataset=dataset, evaluator=eval, iterations=30)
RF_model.fit()

y values of starting points [ -8.91736  -7.80342  -5.92242  -8.83814 -10.892    -7.99502  -8.58359
  -8.32301  -8.19585  -7.3219 ]
Iteration 0, Results: {'top-100 model': 0.01}
Iteration 1, Results: {'top-100 model': 0.03}
Iteration 2, Results: {'top-100 model': 0.03}
Iteration 3, Results: {'top-100 model': 0.01}
Iteration 4, Results: {'top-100 model': 0.03}
Iteration 5, Results: {'top-100 model': 0.07}
Iteration 6, Results: {'top-100 model': 0.05}
Iteration 7, Results: {'top-100 model': 0.05}
Iteration 8, Results: {'top-100 model': 0.06}
Iteration 9, Results: {'top-100 model': 0.07}
Iteration 10, Results: {'top-100 model': 0.06}
Iteration 11, Results: {'top-100 model': 0.05}
Iteration 12, Results: {'top-100 model': 0.09}
Iteration 13, Results: {'top-100 model': 0.12}
Iteration 14, Results: {'top-100 model': 0.21}
Iteration 15, Results: {'top-100 model': 0.19}
Iteration 16, Results: {'top-100 model': 0.22}
Iteration 17, Results: {'top-100 model': 0.22}
Iteration 18, Results: {'top-100 