In [13]:
import logging
import pickle
import os
import io
import sys
from typing import Dict
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from MDRMF.models.modeller import Modeller
from MDRMF.dataset import Dataset

class RFModeller(Modeller):

    def __init__(
        self, 
        dataset,
        evaluator=None, 
        iterations=10, 
        initial_sample_size=10, 
        acquisition_size=10, 
        acquisition_method="greedy", 
        retrain=True,
        seeds=[],
        feature_importance_opt=None,
        **kwargs) -> None:

        super().__init__(
            dataset, 
            evaluator,
            iterations, 
            initial_sample_size, 
            acquisition_size, 
            acquisition_method, 
            retrain,
            seeds
            )

        self.kwargs = kwargs
        self.model = RandomForestRegressor(**self.kwargs)
        self.feature_importance_opt = feature_importance_opt

        if self.feature_importance_opt is not None:
            self.optimize_for_feature_importance(self.feature_importance_opt)

    def fit(self, iterations_in=None):

        # Seed handling
        if self.seeds is None or len(self.seeds) == 0:
            initial_pts = self._initial_sampler(initial_sample_size=self.initial_sample_size)
        elif isinstance(self.seeds, (list, np.ndarray)) and all(isinstance(i, int) for i in self.seeds):
            self.seeds = list(self.seeds)  # Ensure seeds is a list
            initial_pts = self.dataset.get_points(self.seeds, remove_points=True)
        else:
            logging.error("Invalid seeds. Must be a list or ndarray of integers, or None.")
            return

        print(f"y values of starting points {initial_pts.y}")
        self.model.fit(initial_pts.X, initial_pts.y)
        
        # First evaluation, using only the initial points
        if self.evaluator is not None:
            self.call_evaluator(i=-1, model_dataset=initial_pts) # -1 because ´call_evaluator´ starts at 1, and this iteration should be 0.

        # implemented to allow the ´fit´ method to be used internally in the class to support ´feature_importance_opt´.
        if iterations_in is None:
            iterations = self.iterations
        else:
            iterations = iterations_in

        for i in range(iterations):
        # Acquire new points
            acquired_pts = self._acquisition(self.model)

            # Merge old and new points
            if i == 0:
                model_dataset = self.dataset.merge_datasets([initial_pts, acquired_pts])
            else:
                model_dataset = self.dataset.merge_datasets([model_dataset, acquired_pts])

            if self.retrain:
                # Reset model and train
                self.model = RandomForestRegressor(**self.kwargs)
                self.model.fit(model_dataset.X, model_dataset.y)
            else:
                # Train on existing model
                self.model.fit(model_dataset.X, model_dataset.y)

            if self.evaluator is not None:
                self.call_evaluator(i=i, model_dataset=model_dataset)
            


        return self.model
    

    def predict(self, dataset: Dataset):

        if isinstance(dataset, Dataset):
            return self.model.predict(dataset.X)
        else:
            logging.error("Wrong object type. Must be of type `Dataset`")


    def save(self, filename: str):
        """
        Save the RFModeller to a pickle file
        """
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")
        
        try:
            with open(filename, "wb") as f:
                pickle.dump(self, f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.PicklingError as e:
            logging.error(f"Failed to pickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise


    @staticmethod
    def load(filename: str):
        
        # Check if filename is a string.
        if not isinstance(filename, str):
            raise ValueError("filename must be a string")
        
        # Check if file exists.
        if not os.path.isfile(filename):
            raise FileNotFoundError(f"No such file or directory: '{filename}'")
        
        try:
            with open(filename, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.error(f"File not found: {filename}")
            raise
        except IOError as e:
            logging.error(f"IOError: {str(e)}")
            raise
        except pickle.UnpicklingError as e:
            logging.error(f"Failed to unpickle model: {str(e)}")
            raise
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            raise


    def optimize_for_feature_importance(self, opt_parameters: Dict):

        print('Computing feature importance...')        

        iterations = opt_parameters['opt_iterations']
        features_limit = opt_parameters['opt_features_limit']

        # Suppress print statements
        original_stdout = sys.stdout
        sys.stdout = io.StringIO()
              
        model = self.fit(iterations_in=iterations)
        
        # Restore original stdout
        sys.stdout = original_stdout

        feature_importances = model.feature_importances_
        feature_importances_sorted = np.argsort(feature_importances)[:-1]
        important_features = feature_importances_sorted[-features_limit:]

        self.dataset.X = self.dataset.X[:, important_features]
        print(len(self.dataset.X[0])) # debug
        self.eval_dataset.X = self.eval_dataset.X[:, important_features]
        print(len(self.eval_dataset.X[0])) # debug

        print('Feature compution complete!')

        # important_feature_values = feature_importances[important_features]

        print(f"Indices of most important features: {important_features}")
        # print(f"values of most important features: {important_feature_values}")

        return important_features


        # --- Comments
        # This function is not yet implemented.
        # There should be an argument to RFModeller called ´feature_importance_opt´.
        # This argument should take a dict.
        # dict = {
        #   'opt_iterations': 50
        #   'opt_features_limit': 30
        # }
        # This list contain the number of times to train the optimization model and how many of the most important features to keep.
        # For each run the ´feature_importances_´ is calculated. The index values are counted. So for instance
        # 1. run index 55 is the most important feature
        # 2. run index 55 is the 10th most important feature
        # 3. run index 55 is the 5th most important feature
        # Now we just average how much 55 was used. (1+10+5)/3 = 5.33
        # We then just calculate an average for each index in the vector and sort them from lowest(best) to highest(worst).
        # In the case of the above dict we only keep the 30 most important features in the vector.
        # -----
        # Now that I think about it we might not even need to do this averaging, as the numbers are kind of already
        # averaged by merely running the model many times.
        # -----
        # Once the desired features have been found we need to set he dataset.X to the indexes that was found most important.
        # I think we can do this by just manipulating self.dataset, but I am a little unsure if this will disturb other parts
        # of the code. I don't think so, as we never return the Dataset at any time.

    def _print_progress_bar(self, iteration, total, bar_length=50, prefix="Progress"):
        """
        Print the progress bar.

        Args:
            iteration (int): current iteration.
            total (int): total iterations.
            bar_length (int): length of the progress bar.
            prefix (str): Prefix to print before the progress bar. Default is "Progress".
        """
        progress = (iteration / total)
        arrow = '-' * int(round(progress * bar_length) - 1) + '>'
        spaces = ' ' * (bar_length - len(arrow))

        # Format the progress bar string to include molecule count and total count
        print(f"\r{prefix}: [{arrow + spaces}] {int(progress * 100)}% ({iteration}/{total})", end='')        

In [2]:
from MDRMF import MoleculeLoader

data = MoleculeLoader('10K.csv', 'SMILES', 'r_i_docking_score').df
data = data.iloc[:-1]

In [3]:
from MDRMF import Featurizer
feat = Featurizer(data)
#features = feat.featurize("morgan", radius=2, nBits=1024)
features = feat.featurize("rdkit2D")

Computing features...
Progress: [------------------------------------------------->] 100% (9897/9897)
Feature computation completed.


In [4]:
X = features
y = data['r_i_docking_score']
ids = data['SMILES']

dataset = Dataset(X=X, y=y, ids=ids)

In [5]:
dataset.X

array([[13.29054352, -0.1670179 , 13.29054352, ...,  0.        ,
         0.        ,  2.        ],
       [13.22061492, -0.56435377, 13.22061492, ...,  0.        ,
         0.        ,  0.        ],
       [13.34317359, -0.13321653, 13.34317359, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [12.31352718, -3.86021098, 12.31352718, ...,  0.        ,
         0.        ,  0.        ],
       [12.98995418, -0.28175873, 12.98995418, ...,  0.        ,
         0.        ,  0.        ],
       [12.7552825 , -0.96831749, 12.7552825 , ...,  0.        ,
         0.        ,  0.        ]])

In [6]:
# unimportant_features = np.array([1, 4, 7, 9])

# for mol in dataset.X:

#     mol_important_features = np.delete(mol, unimportant_features)

#     print(mol_important_features, end='\n\n')

In [7]:
from MDRMF import Evaluator
metrics = ['top-k', 'top-k-acquired']
k_values = ['100']
eval = Evaluator(dataset, metrics, k_values)

In [14]:
model = RFModeller(dataset, eval, 10, feature_importance_opt={'opt_iterations': 5, 'opt_features_limit': 120})
model.fit()
#model.optimize_for_feature_importance({'opt_iterations': 5, 'opt_features_limit': 2})

208
Computing feature importance...
120
120

Feature computation complete!
Indices of most important features: [ 68 118  47  53  70 119 111 110  49  62 132  77   8 107 143 112  82 104
 157  54 133  34  92  35  90  79  75   5  89  88 103  15 102  43  31  40
 105 128  37  10 106  44 120  46 138 159  29 167 197   6  99  20  32 100
 113  14  39  24 116  28  33  66  16 117  18  23  27  41  81  25  13 153
  55   7  83 124  17  45  95  65  64  74  30  85  52  48  11  19  59  22
  73  60   3 122  12  91 101  57   1  98  96   4  93 115  21 150  36  69
  87  26   0  38  51  42  86  56   2  58  97 121]
y values of starting points [-8.32604 -7.94157 -8.8757  -9.2728  -8.33401 -9.51196 -6.95706 -4.67301
 -7.57862 -7.41391]
In eval: 120
Iteration 0, Results: {'top-100 model': 0.05, 'top-100 acquired': 0.0}
In eval: 120
Iteration 1, Results: {'top-100 model': 0.15, 'top-100 acquired': 0.01}
In eval: 120
Iteration 2, Results: {'top-100 model': 0.18, 'top-100 acquired': 0.03}
In eval: 120
Iteration 3, 

In [9]:
model = RFModeller(dataset, eval, 10)
model.fit()
#model.optimize_for_feature_importance({'opt_iterations': 5, 'opt_features_limit': 2})

208
y values of starting points [-8.13897 -7.39788 -5.86218 -7.29854 -9.07206 -9.4141  -9.68966 -5.58823
 -9.13174 -5.45263]
In eval: 208
Iteration 0, Results: {'top-100 model': 0.08, 'top-100 acquired': 0.0}
In eval: 208
Iteration 1, Results: {'top-100 model': 0.21, 'top-100 acquired': 0.03}
In eval: 208
Iteration 2, Results: {'top-100 model': 0.18, 'top-100 acquired': 0.04}
In eval: 208
Iteration 3, Results: {'top-100 model': 0.15, 'top-100 acquired': 0.04}
In eval: 208
Iteration 4, Results: {'top-100 model': 0.2, 'top-100 acquired': 0.07}
In eval: 208
Iteration 5, Results: {'top-100 model': 0.23, 'top-100 acquired': 0.1}
In eval: 208
Iteration 6, Results: {'top-100 model': 0.25, 'top-100 acquired': 0.13}
In eval: 208
Iteration 7, Results: {'top-100 model': 0.24, 'top-100 acquired': 0.15}
In eval: 208
Iteration 8, Results: {'top-100 model': 0.24, 'top-100 acquired': 0.15}
In eval: 208
Iteration 9, Results: {'top-100 model': 0.24, 'top-100 acquired': 0.18}
In eval: 208
Iteration 10, R

In [10]:
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors, MolFromSmiles

# Create RDKit molecule object from SMILES string
mol = MolFromSmiles('O=C(C)Oc1ccccc1C(=O)O')

# Check if the molecule is valid
if mol:
    rdkit2D_descriptors = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors.descList])
    descriptor_names = rdkit2D_descriptors.GetDescriptorNames()
    descriptor_values = rdkit2D_descriptors.CalcDescriptors(mol)
    
    for name, value in zip(descriptor_names, descriptor_values):
        print(f"{name}: {value}")
else:
    print("Invalid molecule")

MaxEStateIndex: 10.611948223733938
MinEStateIndex: -1.1140277777777776
MaxAbsEStateIndex: 10.611948223733938
MinAbsEStateIndex: 0.01601851851851821
qed: 0.5501217966938848
MolWt: 180.15899999999996
HeavyAtomMolWt: 172.09499999999997
ExactMolWt: 180.042258736
NumValenceElectrons: 68
NumRadicalElectrons: 0
MaxPartialCharge: 0.33900378687731025
MinPartialCharge: -0.4775395271554559
MaxAbsPartialCharge: 0.4775395271554559
MinAbsPartialCharge: 0.33900378687731025
FpDensityMorgan1: 1.3076923076923077
FpDensityMorgan2: 1.9230769230769231
FpDensityMorgan3: 2.4615384615384617
BCUT2D_MWHI: 16.53611122125436
BCUT2D_MWLOW: 10.182282381035359
BCUT2D_CHGHI: 2.104306980957853
BCUT2D_CHGLO: -2.0311320919470175
BCUT2D_LOGPHI: 2.169834556812799
BCUT2D_LOGPLOW: -2.063000061964294
BCUT2D_MRHI: 5.913065796110137
BCUT2D_MRLOW: -0.13141434244030484
BalabanJ: 3.0435273546341013
BertzCT: 343.2228677267164
Chi0: 9.844934982691242
Chi0n: 6.9813595436500515
Chi0v: 6.9813595436500515
Chi1: 6.109060905280622
Chi1n: