In [1]:
import os

import numpy as np
import pandas as pd

from pathlib import Path
from rdkit import Chem
from tqdm import tqdm

In [2]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Monkey-patch the module-level function so every later call to
# ``warnings.warn`` (from this notebook or from libraries) is silently dropped.
warnings.warn = warn

## Machine learning methods

Machine learning methods are chosen from different families for diversity.

In [3]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

In [4]:
# Seed shared by the stochastic estimators (random forest, MLP) so that a
# fresh "Restart Kernel -> Run All" reproduces the same predictions.
RANDOM_STATE = 42

# (estimator, name) pairs; the name becomes part of the prediction column
# header written by the benchmark loop.
ml_list = [(Ridge(), "RidgeRegression"),
           (RandomForestRegressor(random_state=RANDOM_STATE), "RandomForestRegressor"),
           (SVR(), "SVR"),
           (MLPRegressor(max_iter=500, random_state=RANDOM_STATE), "MLPRegressor"),
           (KNeighborsRegressor(), "KNeighborsRegressor")
           ]

## Chemical descriptors

Descriptors are chosen from different families (topological, physico-chemical, transformer-based, etc.) for diversity.

More descriptors: https://molfeat.datamol.io/featurizers

In [5]:
from molfeat.calc import (FPCalculator,
                          RDKitDescriptors2D, 
                          Pharmacophore2D, 
                          MordredDescriptors, 
                          CATS, 
                          ScaffoldKeyCalculator)

In [6]:
# (calculator, label) pairs used by the benchmark; the label becomes part of
# the prediction column header. Each fingerprint calculator is built from its
# molfeat method name.
descr_list = [
    (FPCalculator(fp_method), fp_label)
    for fp_method, fp_label in (
        # fingerprints
        ("atompair", "AtomPairBinary"),
        ("atompair-count", "AtomPairCount"),
        ("avalon", "AvalonBinary"),
        ("ecfp", "ECFPBinary"),
        ("ecfp-count", "ECFPCount"),
        ("erg", "ERG"),
        ("estate", "Estate"),
        ("fcfp", "FCFPBinary"),
        ("fcfp-count", "FCFPCount"),
        ("layered", "Layered"),
        ("maccs", "MACCS"),
        ("pattern", "Pattern"),
        ("rdkit", "RDKitBinary"),
        ("rdkit-count", "RDKitCount"),
        ("secfp", "SECFP"),
        ("topological", "TopologicalBinary"),
        ("topological-count", "TopologicalCount"),
    )
]

# long (disabled; presumably slow to compute -- append to descr_list to enable)
# (RDKitDescriptors2D(replace_nan=True), "RDKitDescriptors2D"),
# (Pharmacophore2D(replace_nan=True), "Pharmacophore2D"),
# (MordredDescriptors(replace_nan=True), "MordredDescriptors"),
# (ScaffoldKeyCalculator(), "ScaffoldKey"),

## Run benchmark

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [8]:
def parse_data(data_path):
    """Load a header-less CSV and pair every molecule with its property.

    Column 0 is read as a SMILES string and converted with
    ``Chem.MolFromSmiles``; column 1 is taken as the property value.

    Parameters
    ----------
    data_path : path-like
        Location of the CSV file (no header row).

    Returns
    -------
    list of tuple
        ``(mol, prop)`` pairs in file order. ``mol`` is whatever
        ``Chem.MolFromSmiles`` returned for that row, so callers must be
        prepared for a falsy value when the SMILES could not be parsed.
    """
    table = pd.read_csv(data_path, header=None)
    return [
        (Chem.MolFromSmiles(smiles), value)
        for smiles, value in zip(table[0], table[1])
    ]

### Input/output data path

In [11]:
# Benchmark datasets are read from this folder (resolved to an absolute path).
benchmark_collection = Path("tmp_collection").resolve()

# Predictions are written below this folder; create it up front if missing.
prediction_collection = Path("prediction_collection").resolve()
prediction_collection.mkdir(parents=True, exist_ok=True)

### Model building

In [18]:
# Benchmark driver.
# For every dataset of every benchmark folder, each (descriptor, model) pair is
# trained on train.csv and its predictions for test.csv are stored as one
# column of <prediction_collection>/<benchmark>/<dataset>.csv, next to Y_TRUE.
# Folders are visited in sorted order so repeated runs process them identically.
for bench_name in sorted(os.listdir(benchmark_collection)):

    # benchmark dataset folder; stray files (e.g. .DS_Store) are skipped
    bench_path = os.path.join(benchmark_collection, bench_name)
    if not os.path.isdir(bench_path):
        continue

    # benchmark prediction save folder
    res_folder = os.path.join(prediction_collection, bench_name)
    os.makedirs(res_folder, exist_ok=True)

    # run benchmark
    for dataset in sorted(os.listdir(bench_path)):
        dataset_path = os.path.join(bench_path, dataset)
        if not os.path.isdir(dataset_path):
            continue

        mols_train = parse_data(os.path.join(dataset_path, 'train.csv'))
        mols_test = parse_data(os.path.join(dataset_path, 'test.csv'))

        res = pd.DataFrame()
        res['Y_TRUE'] = [prop for _, prop in mols_test]

        # Mask of test rows whose molecule is usable; the other rows receive
        # the training-set mean as their prediction (see below).
        test_is_valid = np.array([bool(mol) for mol, _ in mols_test], dtype=bool)

        # calc 2D descriptors
        for descr_func, descr_name in descr_list:

            # calculate training data descriptors (unusable molecules are dropped)
            x_train, y_train = [], []
            for mol, prop in mols_train:
                if mol:
                    x_train.append(descr_func(mol))
                    y_train.append(prop)
            x_train = np.array(x_train)

            # scale training data descriptors
            scaler = MinMaxScaler()
            x_train_scaled = scaler.fit_transform(x_train)

            # Test descriptors depend only on the descriptor, not on the model:
            # compute and scale them once here instead of once per model.
            x_test = [descr_func(mol) for mol, _ in mols_test if mol]
            x_test_scaled = scaler.transform(np.array(x_test)) if x_test else None

            # prediction used for test molecules that could not be featurized
            fallback = np.mean(y_train)

            # train machine learning model
            for model, method_name in ml_list:
                model.fit(x_train_scaled, y_train)

                # start from the fallback everywhere, then fill in the valid rows
                # with a single batched predict call
                y_pred = np.full(len(mols_test), fallback, dtype=float)
                if x_test_scaled is not None:
                    y_pred[test_is_valid] = np.ravel(model.predict(x_test_scaled))

                res[f'{descr_name}|{method_name}'] = y_pred
                # the file is rewritten after every new column, so the results
                # obtained so far are always on disk
                res.to_csv(os.path.join(res_folder, f'{dataset}.csv'), index=False)