In [50]:
import numpy as np
from sklearn.metrics import r2_score
import copy


class Evaluator:
    def __init__(self, original_dataset, metrics, k_values):
        self.dataset = copy.deepcopy(original_dataset)
        self.metrics = metrics
        self.k_values = [int(k) for k in k_values]

    def evaluate(self, model, eval_dataset, model_dataset):
        results = {}
        for metric in self.metrics:
            if metric == "R2_model":
                results[metric] = self.r2_model(model, eval_dataset)
            else:
                for k in self.k_values:
                    if metric == "top-k":
                        results[f"top-{k} model"] = self.top_n_correct(k, model)
                    elif metric == "R2_k":
                        results[f"R2_k-{k}"] = self.r2_n(k, model)
                    elif metric == "top-k-acquired":
                        results[f"top-{k} acquired"] = self.top_n_in_model_set(k, model_dataset)
        return results

    def top_n_correct(self, n, model):
        model_predictions = model.predict(self.dataset) # Predict on the full dataset
        preds_indices = np.argsort(model_predictions)[:n] # Sort all predictions from lowest to highest and gets the indices of n amount of mols
        top_n_real_indices = np.argsort(self.dataset.y)[:n] # Get the indices of the n "real" mols and sorts them from lowest to highest
        return np.mean(np.isin(preds_indices, top_n_real_indices)) # np.isin calculates how many from the correct_preds_indices that are in top_n_real_indices and np.mean makes this a fraction
    
    def top_n_in_model_set(self, n, model_dataset):
        #print("n", n)
        print("model_dataset", model_dataset)
        lowest_y_indices = np.argsort(self.dataset.y)[:n]  # Get indices of the 'n' lowest y values.
        #print("lowest_y_indices len", len(lowest_y_indices))
        #print("lowest_y_indices", lowest_y_indices)
        lowest_y_ids = set(self.dataset.ids[lowest_y_indices])  # Retrieve corresponding IDs from the dataset and ensure uniqueness.
        #print("lowest_y_ids", lowest_y_ids)
        #print("lowest_y_ids len", len(lowest_y_ids))

        ids_acquired = set(model_dataset.ids)  # Retrieve unique ids from the internal model dataset.
        # print("ids_acquired:", ids_acquired)
        intersection_count = len(lowest_y_ids.intersection(ids_acquired))  # Count of common ids between lowest_y_ids and ids_acquired.
        
        return intersection_count / n  # Return the proportion of top 'n' found in the model_dataset.
    
    # def top_n_in_model_set(self, n, model_dataset):
    #     lowest_y_indices = np.argsort(self.dataset.y)[:n]  # Get indices of the 'n' lowest y values.
    #     lowest_y_ids = self.dataset.ids[lowest_y_indices]  # Retrieve corresponding IDs from the dataset.
    #     ids_acquired = model_dataset.ids  # Retrieve ids from the internal model dataset.
    #     mols_of_top_n_found = np.mean(np.isin(ids_acquired, lowest_y_ids))
    #     return mols_of_top_n_found

    

    def r2_model(self, model, model_dataset):
        '''
        Returns the R2 value of the internal model
        '''

        # Find missing points in the model_dataset
        training_points = self.dataset.missing_points(self.dataset, model_dataset)

        y_true = training_points.y
        y_pred = model.predict(training_points)

        return r2_score(y_true, y_pred)
    

    def r2_n(self, n, model):
        # Similar to top_n_correct but here we calculate the r2 score for the top n points
        model_predictions = model.predict(self.dataset)
        top_n_pred_indices = np.argsort(model_predictions)[:n]

        # Get top n points as a Dataset
        top_n_dataset = self.dataset.get_points(top_n_pred_indices)

        y_pred = model.predict(top_n_dataset)
        
        return r2_score(top_n_dataset.y, y_pred)

In [28]:
import MDRMF as mf

data = mf.MoleculeLoader(datafile="10K.csv", smi_col="SMILES", scores_col="r_i_docking_score").df
feat = mf.Featurizer(data)
features = feat.featurize("morgan", radius=2, nBits=512)

X = features
y = data['r_i_docking_score']
ids = data['SMILES']

dataset = mf.Dataset(X=X, y=y, ids=ids)
dataset

<Dataset X.shape: (9898, 512), y.shape: (9898,), w.shape: (9898,), ids: ['C[C@@H](NC(=O)N1C[C@H](c2ccccc2)[C@H]2COCC[C@H]21)c1ccc(NC(=O)NC2CC2)cc1'
 'O=C(Nc1cccc(C(=O)N2CCC(c3c[nH]c4ncccc34)CC2)c1)[C@@H]1Cc2ccccc2O1'
 'Cc1nn(-c2ccccc2)c2nc(C(=O)N3CCC([C@H]4C(=O)Nc5ccccc54)CC3)ccc12' ...
 'Cn1cccc(C(=O)N2CCN(C3CC3)c3ccc(Cl)cc32)c1=O'
 'O=C([O-])[C@H]1CC[C@@H](C(=O)N2CCCc3ccccc32)CC1'
 'CCNS(=O)(=O)c1cc(NCCOC)ccn1']>

In [61]:
metrics = ['top-k', 'top-k-acquired']
k_values = ['100']
eval = Evaluator(dataset, metrics, k_values)

from MDRMF import models
from MDRMF.models.rfmodeller import RFModeller

rf_model = RFModeller(
    dataset=dataset,
    evaluator=eval,
    iterations=3,
    initial_sample_size=2,
    acquisition_size=2,
    acquisition_method="greedy",
    n_estimators=50
)

model = mf.Model(model=rf_model)
model.train()

y values of starting points [-8.70208 -9.17715]
model_dataset <Dataset X.shape: (4, 512), y.shape: (4,), w.shape: (4,), ids: ['CC(=O)N1Cc2ccc(NS(=O)(=O)c3cc(F)cc(Cl)c3)cc2C1'
 'CN(c1ccccc1)C1CCN(C(=O)[C@@H]2CCNC(=O)C2)CC1'
 'O=C1C[C@@H](C(=O)N2CC[C@@H](Cc3ccc(F)cc3)C2)CCN1'
 'O=C1C[C@@H](C(=O)N2CC[C@H](Cc3ccc(F)cc3)C2)CCN1']>
Iteration 1, Results: {'top-100 model': 0.01, 'top-100 acquired': 0.0}
model_dataset <Dataset X.shape: (6, 512), y.shape: (6,), w.shape: (6,), ids: ['CC(=O)N1Cc2ccc(NS(=O)(=O)c3cc(F)cc(Cl)c3)cc2C1'
 'CN(c1ccccc1)C1CCN(C(=O)[C@@H]2CCNC(=O)C2)CC1'
 'O=C1C[C@@H](C(=O)N2CC[C@@H](Cc3ccc(F)cc3)C2)CCN1'
 'O=C1C[C@@H](C(=O)N2CC[C@H](Cc3ccc(F)cc3)C2)CCN1'
 'NC(=O)c1cccnc1N1CCN(C(=O)[C@@H]2CCNC(=O)C2)CC1'
 'O=C1C[C@H](C(=O)N2CC[C@H](Cc3ccc(F)cc3)C2)CCN1']>
Iteration 2, Results: {'top-100 model': 0.0, 'top-100 acquired': 0.0}
model_dataset <Dataset X.shape: (8, 512), y.shape: (8,), w.shape: (8,), ids: ['CC(=O)N1Cc2ccc(NS(=O)(=O)c3cc(F)cc(Cl)c3)cc2C1'
 'CN(c1ccccc1)C1CCN(C(=O