In [1]:
import yaml
import os
import pandas as pd
import inspect
from typing import List
from MDRMF.evaluator import Evaluator
import MDRMF.models as mfm
from MDRMF import Dataset, MoleculeLoader, Featurizer, Model

class Experimenter:
    """Runs the experiments described in a YAML protocol file.

    The parsed protocol is expected to be a list of single-key mappings
    (e.g. ``{'Experiment': {...}}``, ``{'Dataset': {...}}``). Only
    ``'Experiment'`` entries are acted upon at the moment; the other entry
    kinds are recognised but not implemented yet.
    """

    def __init__(self, config_file: str):
        """Remember the protocol path and parse it immediately.

        Args:
            config_file: Path to the YAML protocol file.
        """
        self.config_file = config_file
        self.experiments = self._load_config()

    def _load_config(self) -> List[List[dict]]:
        """Parse the YAML protocol file.

        Returns:
            The parsed protocol wrapped in one outer list, i.e.
            ``[[entry, entry, ...]]``. A protocol consisting of a single
            mapping is first turned into a one-element list. An empty list
            is returned when the file is empty or cannot be parsed.
        """
        with open(self.config_file, 'r') as stream:
            try:
                config = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                # Best effort: report the parse error and continue with no
                # experiments instead of aborting.
                print(exc)
                return []

        # An empty (or comment-only) file parses to None; treat it as "no
        # experiments" rather than handing None to the iteration code.
        if config is None:
            return []

        # If there is only one experiment, make it into a list
        if isinstance(config, dict):
            config = [config]

        # The extra outer list is what conduct_all_experiments iterates over.
        return [config]

    def conduct_all_experiments(self):
        """Walk the loaded protocol and run every ``'Experiment'`` entry.

        Each entry is dispatched on its first key only. Entries that are
        empty or not mappings are skipped.
        """
        for config in self.experiments:
            for entry in config:
                if not isinstance(entry, dict) or not entry:
                    continue

                key, value = next(iter(entry.items()))
                if key == 'Experiment':
                    self.conduct_experiment(value)
                elif key == 'Dataset':
                    # Call self.make_dataset(value)
                    pass
                elif key == 'Parallelize_experiments':
                    # add code here to handle 'Parallelize_experiments' cases
                    pass

    def _prepare_dataset(self, exp_config: dict):
        """Load the dataset named by ``'dataset'`` or build one from ``'data'``.

        When a dataset is built from raw data it is also saved to
        ``dataset_<experiment name>.pkl`` in the current working directory.
        """
        # If there is a dataset use this
        if 'dataset' in exp_config:
            return Dataset.load(exp_config['dataset'])

        data_conf = exp_config['data']
        datafile = data_conf['datafile']
        smiles_col = data_conf['SMILES_col']
        scores_col = data_conf['scores_col']
        ids_col = data_conf['ids_col']

        data = MoleculeLoader(datafile, smiles_col, scores_col).df

        # Every featurizer setting except 'name' is forwarded as a keyword
        # argument to the featurizer.
        featurizer = Featurizer(data)
        feat_config = exp_config['featurizer']
        feat_params = {k: v for k, v in feat_config.items() if k != 'name'}
        features = featurizer.featurize(feat_config['name'], **feat_params)

        dataset_model = Dataset(X=features, y=data[scores_col], ids=data[ids_col])
        dataset_model.save("dataset_" + exp_config['name'] + ".pkl")
        return dataset_model

    @staticmethod
    def _resolve_model_class(model_name):
        """Return the class called ``model_name`` from ``MDRMF.models``.

        Raises:
            ValueError: If no class with that name is found in the module.
        """
        candidate = getattr(mfm, model_name, None) if isinstance(model_name, str) else None
        if not inspect.isclass(candidate):
            raise ValueError(f"Model {model_name} not found in MDRMF.models")
        return candidate

    def conduct_experiment(self, exp_config: dict):
        """Run all replicates of one experiment and write its results.

        Creates ``Experiment_Root/<name>/`` holding ``dataset.pkl``, a
        ``models`` directory and ``results.csv`` with one row per
        (replicate, rank) pair taken from ``model.results``.

        Args:
            exp_config: Settings of one experiment. The keys read here are
                ``'name'``, ``'replicate'``, ``'model'``, ``'metrics'`` and
                either ``'dataset'`` or ``'data'`` plus ``'featurizer'``.

        Raises:
            ValueError: If neither ``'dataset'`` nor ``'data'`` is given, or
                if the requested model class does not exist.
        """
        # Validate before touching the filesystem: without a data source the
        # method used to create directories and then fail on an unbound name.
        if 'dataset' not in exp_config and 'data' not in exp_config:
            raise ValueError(
                f"Experiment {exp_config.get('name', '<unnamed>')!r} must define "
                "either a 'dataset' file or a 'data' section"
            )

        # --- Data setup --- #
        dataset_model = self._prepare_dataset(exp_config)

        # --- Directory setup --- #
        experiment_directory = os.path.join("Experiment_Root", exp_config['name'])
        os.makedirs(experiment_directory, exist_ok=True)

        # Keep a copy of the dataset next to the results
        dataset_model.save(os.path.join(experiment_directory, "dataset.pkl"))

        # Model saving is not implemented yet, but the directory is still
        # created so the layout is stable.
        models_directory = os.path.join(experiment_directory, "models")
        os.makedirs(models_directory, exist_ok=True)

        # --- Model setup --- #
        model_config = exp_config['model']
        model_name = model_config['name']
        # Every model setting except 'name' is forwarded to the constructor.
        model_params = {k: v for k, v in model_config.items() if k != 'name'}
        model_class = self._resolve_model_class(model_name)

        # Setup evaluator
        model_metrics = exp_config['metrics']
        evaluator = Evaluator(dataset_model, model_metrics['names'], model_metrics['k'])

        results_list = []

        # --- Conduct replicate experiments and save results --- #
        for replicate in range(1, exp_config['replicate'] + 1):
            print(f"Running Experiment {exp_config['name']} replicate {replicate}")

            # A fresh model per replicate so runs do not share state
            model_input = model_class(dataset_model, evaluator=evaluator, **model_params)
            model = Model(model=model_input)
            model.train()

            # TODO: save the trained model to
            # os.path.join(models_directory, f"run{replicate}.pkl")

            # Flatten {rank: {metric: score}} into one row per rank
            for rank, score_dict in model.results.items():
                row = {'replicate': replicate, 'rank': rank}
                row.update(score_dict)
                results_list.append(row)

        # Convert results to a DataFrame
        results_df = pd.DataFrame(results_list)
        results_df.to_csv(os.path.join(experiment_directory, "results.csv"), index=False)


In [2]:
# Build the experimenter; the constructor parses test.yaml right away.
exp = Experimenter("test.yaml")

In [3]:
# Re-parse the protocol to inspect its structure (the constructor already
# stored the same parse result in exp.experiments).
testme = exp._load_config()

In [8]:
# First entry of the inner list; for test.yaml this is the protocol-name mapping.
testme[0][0]

{'Protocol_name': 'test protocol'}

In [5]:
# Show the whole parsed protocol (note the extra outer list added by _load_config).
testme

[[{'Protocol_name': 'test protocol'},
  {'Experiment': {'name': 'Exp01',
    'replicate': 5,
    'parallelize': False,
    'data': {'datafile': '10K.csv',
     'SMILES_col': 'SMILES',
     'scores_col': 'r_i_docking_score',
     'ids_col': 'SMILES'},
    'featurizer': {'name': 'morgan', 'radius': 2, 'nBits': 512},
    'model': {'name': 'RFModeller',
     'iterations': 3,
     'initial_sample_size': 30,
     'acquisition_size': 30,
     'acquisition_method': 'greedy'},
    'metrics': {'names': ['top-k'], 'k': [100]}}},
  {'Experiment': {'name': 'Exp02',
    'replicate': 5,
    'parallelize': False,
    'dataset': 'dataset.pkl',
    'model': {'name': 'RFModeller',
     'iterations': 3,
     'initial_sample_size': 20,
     'acquisition_size': 20,
     'acquisition_method': 'random'},
    'metrics': {'names': ['top-k', 'R2_k'], 'k': [100, 50]}}},
  {'Dataset': {'name': 'dataset01',
    'featurizer': {'name': 'morgan', 'nBits': 256, 'radius': 2}}}]]

In [6]:
# NOTE(review): the call is commented out, so the output shown below appears to
# come from an earlier execution — re-enable it to reproduce the results.
#exp.conduct_all_experiments()

Running Experiment Exp01 replicate 1
Iteration 1, Results: {'top-100': 0.01}
Iteration 2, Results: {'top-100': 0.02}
Iteration 3, Results: {'top-100': 0.02}
Running Experiment Exp01 replicate 2
Iteration 1, Results: {'top-100': 0.04}
Iteration 2, Results: {'top-100': 0.05}
Iteration 3, Results: {'top-100': 0.1}
Running Experiment Exp01 replicate 3
Iteration 1, Results: {'top-100': 0.01}
Iteration 2, Results: {'top-100': 0.01}
Iteration 3, Results: {'top-100': 0.04}
Running Experiment Exp01 replicate 4
Iteration 1, Results: {'top-100': 0.08}
Iteration 2, Results: {'top-100': 0.07}
Iteration 3, Results: {'top-100': 0.13}
Running Experiment Exp01 replicate 5
Iteration 1, Results: {'top-100': 0.08}
Iteration 2, Results: {'top-100': 0.08}
Iteration 3, Results: {'top-100': 0.12}
Running Experiment Exp02 replicate 1
Iteration 1, Results: {'top-100': 0.04, 'top-50': 0.04, 'R2_k-100': 0.07112166208167714, 'R2_k-50': 0.022324590671346622}
Iteration 2, Results: {'top-100': 0.04, 'top-50': 0.0, 'R