In [22]:
import numpy as np
from MDRMF.dataset import Dataset

class ModellerDev:
    """
    Base class to construct other models from.

    Holds the shared configuration and the sampling/acquisition helpers.
    Child classes provide the actual model by overriding fit(), predict(),
    save() and load().

    Parameters:
        dataset (Dataset): The dataset object containing the data. The modeller
            works on `dataset.copy()`; points are removed from that copy as they
            are sampled or acquired.
        evaluator (Evaluator): The evaluator object used to evaluate the model's performance.
        iterations (int): The number of iterations to perform.
        initial_sample_size (int): The number of initial samples to randomly select from the dataset.
        acquisition_size (int): The number of points to acquire in each iteration.
        acquisition_method (str): The acquisition method to use, either "greedy" or "random".
        retrain (bool): Flag indicating whether to retrain the model in each iteration.
        seeds (list | None): Optional predefined starting points. Only stored here;
            how they are used is up to the child class. None (the default) is
            stored as a fresh empty list.
    """
    def __init__(
            self,
            dataset,
            evaluator=None,
            iterations=10,
            initial_sample_size=10,
            acquisition_size=10,
            acquisition_method="greedy",
            retrain=True,
            seeds=None) -> None:
        """
        Initializes a Modeller object with the provided parameters.
        """
        self.dataset = dataset.copy()
        self.evaluator = evaluator
        self.iterations = iterations
        self.initial_sample_size = initial_sample_size
        self.acquisition_size = acquisition_size
        self.acquisition_method = acquisition_method
        self.retrain = retrain
        # A `seeds=[]` default would be one list object shared by every
        # instance; create a new list per instance instead.
        self.seeds = [] if seeds is None else seeds
        # Evaluation results keyed by 1-based iteration number (see call_evaluator).
        self.results = {}

    def _initial_sampler(self):
        """
        Randomly samples the initial points from the dataset.

        The sampled points are removed from `self.dataset`.

        Returns:
            The object returned by `self.dataset.get_samples(...)` holding
            `initial_sample_size` points.
        """
        random_points = self.dataset.get_samples(self.initial_sample_size, remove_points=True)

        return random_points

    def _acquisition(self, model):
        """
        Performs the acquisition step to select new points for the model.

        The selected points are removed from `self.dataset`.

        Parameters:
            model: The model object used for acquisition. Its `predict` is
                called with `self.dataset.X` (greedy method only).

        Returns:
            Dataset: The acquired dataset containing the selected points.

        Raises:
            ValueError: If `acquisition_method` is neither "greedy" nor "random".
        """
        if self.acquisition_method == "greedy":

            # Predict on the remaining dataset; only the greedy strategy needs this.
            preds = np.asarray(model.predict(self.dataset.X))
            n_available = len(preds)

            if self.acquisition_size >= n_available:
                # np.argpartition rejects kth >= len(preds); when the pool is
                # not larger than the request, simply take everything left.
                indices = np.arange(n_available)
            else:
                # Indices of the `acquisition_size` smallest predictions
                # (lower predicted value is treated as better).
                indices = np.argpartition(preds, self.acquisition_size)[:self.acquisition_size]

            # Get the selected points from the dataset
            acq_dataset = self.dataset.get_points(indices)

            # Remove these datapoints from the dataset
            self.dataset.remove_points(indices)

        elif self.acquisition_method == "random":

            # Get random points and delete from dataset
            acq_dataset = self.dataset.get_samples(self.acquisition_size, remove_points=True)

        else:
            # Without this branch the method failed with an UnboundLocalError
            # on `acq_dataset`, which hides the real cause.
            raise ValueError(
                f"Unknown acquisition_method {self.acquisition_method!r}; "
                "expected 'greedy' or 'random'."
            )

        return acq_dataset

    def fit(self):
        """
        Fits the model to the data.
        This method needs to be implemented in child classes.
        """
        pass

    def predict(self, *args, **kwargs):
        """
        Generates predictions using the fitted model.
        This method needs to be implemented in child classes.
        """
        pass

    def save(self, *args, **kwargs):
        """
        Save the model
        This method needs to be implemented in child classes.
        """
        pass

    @staticmethod
    def load(*args, **kwargs):
        """
        Load the model
        This method needs to be implemented in child classes.
        """
        pass

    def call_evaluator(self, i):
        """
        Calls the evaluator to evaluate the model's performance and stores the results.

        Parameters:
            i (int): The current (zero-based) iteration index. Results are
                printed and stored under the 1-based key `i + 1`.

        Notes: Should always be called when defining the fit() in a child model.
        """
        results = self.evaluator.evaluate(self, self.dataset)
        print(f"Iteration {i+1}, Results: {results}")

        # Store results
        self.results[i+1] = results

In [48]:
import logging
import pickle
import os
from sklearn.ensemble import RandomForestRegressor
from MDRMF.models.modeller import Modeller
from MDRMF.dataset import Dataset

class RFModellerDev(ModellerDev):
    """
    Modeller that uses a scikit-learn RandomForestRegressor.

    Accepts the same parameters as ModellerDev. Any additional keyword
    arguments are forwarded unchanged to `RandomForestRegressor(...)`.

    Parameters:
        seeds (list[int] | None): Optional dataset indices to use as the
            initial training points instead of a random initial sample.
            None or an empty list means "sample randomly".
        **kwargs: Passed to `RandomForestRegressor`.
    """

    def __init__(
        self,
        dataset,
        evaluator=None,
        iterations=10,
        initial_sample_size=10,
        acquisition_size=10,
        acquisition_method="greedy",
        retrain=True,
        seeds=None,
        **kwargs) -> None:

        # Normalise here (not only in the base class) so every instance gets
        # its own list rather than sharing one mutable default object.
        if seeds is None:
            seeds = []

        super().__init__(
            dataset,
            evaluator,
            iterations,
            initial_sample_size,
            acquisition_size,
            acquisition_method,
            retrain,
            seeds
            )

        self.kwargs = kwargs
        self.model = RandomForestRegressor(**self.kwargs)

    def fit(self):
        """
        Run the iterative train/acquire loop.

        Trains on the initial points (seeded or randomly sampled), then for
        each iteration acquires new points, merges them into the training
        set, refits the forest and, if an evaluator was given, records the
        evaluation result.

        Returns:
            RandomForestRegressor: The fitted model after the last iteration.

        Raises:
            ValueError: If `self.seeds` is non-empty but not a list of integers.
        """
        seeds = self.seeds

        if seeds is None or (isinstance(seeds, list) and not seeds):
            # No seeds given: get random points
            initial_pts = self._initial_sampler()

        elif isinstance(seeds, list) and all(isinstance(i, int) for i in seeds):
            # Get the seeded points and remember to remove them from the dataset
            initial_pts = self.dataset.get_points(seeds, remove_points=True)

        else:
            # Previously this only logged and then crashed further down with an
            # UnboundLocalError on `initial_pts`; fail here with the real reason.
            message = "Seeds failed. Seeds must be a list of integers like [5, 25, 600, 5000]"
            logging.error(message)
            raise ValueError(message)

        print(f"Using points {initial_pts.y} as starting point.")
        self.model.fit(initial_pts.X, initial_pts.y)

        # Training set grows by the acquired points on every iteration.
        model_dataset = initial_pts

        for i in range(self.iterations):
            # Acquire new points
            acquired_pts = self._acquisition(self.model)

            # Merge old and new points
            model_dataset = self.dataset.merge_datasets([model_dataset, acquired_pts])

            if self.retrain:
                # Reset model before training
                self.model = RandomForestRegressor(**self.kwargs)

            # NOTE(review): per the scikit-learn docs, fit() builds a whole new
            # forest unless warm_start=True was passed in kwargs, so
            # retrain=False alone does not give incremental training.
            self.model.fit(model_dataset.X, model_dataset.y)

            if self.evaluator is not None:
                self.call_evaluator(i=i)

        return self.model

    def predict(self, dataset: Dataset):
        """
        Predict target values for a dataset.

        Parameters:
            dataset (Dataset): Dataset whose `X` is passed to the forest.

        Returns:
            The array returned by `self.model.predict(dataset.X)`.

        Raises:
            TypeError: If `dataset` is not a `Dataset` instance.
        """
        if not isinstance(dataset, Dataset):
            message = "Wrong object type. Must be of type `Dataset`"
            logging.error(message)
            # Returning None here only postponed the failure to the caller.
            raise TypeError(message)

        return self.model.predict(dataset.X)

    def save(self, filename: str):
        """
        Save the RFModeller to a pickle file

        Parameters:
            filename (str): Path of the file to write.

        Raises:
            ValueError: If `filename` is not a string.
            Exception: Any error raised while opening or pickling is logged
                and re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        try:
            with open(filename, "wb") as f:
                pickle.dump(self, f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.PicklingError as e:
            logging.error(f"Failed to pickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

    @staticmethod
    def load(filename: str):
        """
        Load a pickled RFModeller from file.

        WARNING: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.

        Parameters:
            filename (str): Path of the pickle file to read.

        Returns:
            The object stored in the pickle file.

        Raises:
            ValueError: If `filename` is not a string.
            FileNotFoundError: If `filename` is not an existing file.
            Exception: Any error raised while reading or unpickling is logged
                and re-raised unchanged.
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")

        # Check if file exists.
        if not os.path.isfile(filename):
            raise FileNotFoundError(f"No such file or directory: '{filename}'")

        try:
            with open(filename, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.UnpicklingError as e:
            logging.error(f"Failed to unpickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise

In [49]:
from MDRMF.dataset import Dataset

In [50]:
# Load the previously saved Dataset from "dataset.pkl" (path is relative to the working directory).
dataset = Dataset.load("dataset.pkl")

In [51]:
# Show the dataset's repr (array shapes and a preview of the ids) as a quick sanity check.
dataset

<Dataset X.shape: (9898, 512), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [52]:
# Select six points by index to try out index-based selection (the mechanism the `seeds` option relies on).
# NOTE(review): remove_points is not passed here, so `dataset` is presumably left unchanged — confirm the default.
new_set = dataset.get_points([5, 8, 10, 100, 1000, 5000])

In [53]:
# Inspect the y values of the six selected points.
new_set.y

array([-12.3322 , -12.0921 , -11.9436 , -10.7807 ,  -9.53808,  -7.9548 ])

Jeg nåede at gøre ModellerDev og RFModellerDev klar til at acceptere en liste af prædefinerede punkter (`seeds`). Næste skridt er at oprette en RFModellerDev-instans og se, om jeg kan få den til at bruge en sådan liste af punkter.

In [54]:
from MDRMF.evaluator import Evaluator
from MDRMF import Model
# Evaluate with the 'top-k' metric; the training output reports it as 'top-100' for this k value.
metrics = ['top-k']
# NOTE(review): k is given as the string '100' rather than an int — confirm that Evaluator expects strings.
k_values = ['100']
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the session.
# A later cell passes it as `evaluator=eval`, so a rename has to be made in both places.
eval = Evaluator(dataset, metrics, k_values)

In [61]:
# Build the random-forest modeller with default settings and the evaluator defined above.
# With `seeds` left out, fit() draws a random initial sample; uncomment the line below
# to start from fixed dataset indices instead.
rf_model = RFModellerDev(
    dataset=dataset,
    evaluator=eval,
    #seeds=[5, 5000, 5550, 6014, 3]
    )

In [64]:
# Wrap the modeller in the package's Model helper and start training.
# presumably Model.train() delegates to rf_model.fit(); the per-iteration lines printed
# below match the format produced by call_evaluator — TODO confirm.
model = Model(model=rf_model)
model.train()

Using points [-5.14831 -8.50507 -8.7673  -8.13574 -6.44748 -5.6686  -7.95792 -5.963
 -3.51802 -4.85851] as starting point.
Iteration 1, Results: {'top-100': 0.02}
Iteration 2, Results: {'top-100': 0.05}
Iteration 3, Results: {'top-100': 0.03}
Iteration 4, Results: {'top-100': 0.02}
Iteration 5, Results: {'top-100': 0.03}
Iteration 6, Results: {'top-100': 0.04}
Iteration 7, Results: {'top-100': 0.03}
Iteration 8, Results: {'top-100': 0.04}
Iteration 9, Results: {'top-100': 0.04}
Iteration 10, Results: {'top-100': 0.03}
