In [None]:
import collections
import json
import os
import pickle

import joblib
import numpy as np
import optuna
import pandas as pd
from joblib import Parallel, delayed
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from qptuna.three_step_opt_build_merge import (
    build_best,
    buildconfig_best,
)

## Algorithms, sets, endpoints, datadir

In [None]:
# Algorithm names; each one is the last component of a study name.
algs = [
    "RF",
    "SVR",
    "xgboost",
    "PLS",
]
# Data-set identifiers; they appear as `set{setid}` in file and study names.
sets = [
    "1",
    "2",
    "3",
    "4",
]
# Endpoints (properties) for which models are scored.
props = [
    "Clearance",
    "logD",
    "Solubility",
    "Permeability",
]

In [None]:
# Name of the directory (one level above the notebook) that holds the
# `{prop}_set{setid}_train.csv` / `_test.csv` files; it is also embedded
# in the study names.
datadir = "data"

## Helper functions

In [None]:
def get_train_stats(study_name):
    """Return the mean cross-validation train scores of a study's best trial.

    The study is read from
    ``../optuna-storage/optuna_storage_{study_name}.sqlite``.

    Parameters
    ----------
    study_name : str
        Name of the Optuna study; also used to build the storage file name.

    Returns
    -------
    dict-like
        Keys ``"r2"``, ``"rmse"`` and ``"mae"``. If the storage file is
        missing, the study cannot be loaded, it has no best trial, or the
        trial lacks the expected train scores, a ``defaultdict`` is returned
        that yields NaN for every key, so callers can still build a row.
    """

    def all_nan():
        # Any key looked up by the caller (r2, rmse, mae, ...) resolves to NaN.
        return collections.defaultdict(lambda: np.nan)

    file = f"../optuna-storage/optuna_storage_{study_name}.sqlite"
    storage = f"sqlite:///{os.path.abspath(file)}"

    # NOTE(review): opening a non-existent SQLite path may create an empty
    # database file as a side effect - confirm. Checking first avoids that.
    if not os.path.isfile(file):
        print(f"Could not open study {study_name} from storage {storage}: file not found")
        return all_nan()

    try:
        loaded_study = optuna.load_study(
            study_name=study_name,
            storage=storage,
        )
    except Exception as e:
        print(f"Could not open study {study_name} from storage {storage}: {e}")
        return all_nan()  # All scores NaN.

    try:
        # best_trial raises ValueError when no trial has completed; the
        # lookups raise KeyError when the scores were not stored on the trial.
        train_scores = loaded_study.best_trial.user_attrs["train_scores"]
        r2 = np.mean(train_scores["train_r2"])
        # The stored errors are negated ("neg_*"), so flip the sign back.
        rmse = np.sqrt(-1 * np.mean(train_scores["train_neg_mean_squared_error"]))
        mae = -1 * np.mean(train_scores["train_neg_mean_absolute_error"])
    except (KeyError, ValueError) as e:
        print(f"Could not read train scores of study {study_name}: {e!r}")
        return all_nan()

    return {"r2": r2, "rmse": rmse, "mae": mae}

In [None]:
def get_model(study_name):
    """Load the pickled best model of a study.

    The model is read from ``../best-models/best-{study_name}.pkl``.
    Returns the unpickled object, or ``None`` (after printing a notice)
    when that file does not exist.
    """

    pickle_file = f"../best-models/best-{study_name}.pkl"

    if os.path.isfile(pickle_file):
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files that come from a trusted source.
        with open(pickle_file, "rb") as handle:
            return pickle.load(handle)

    print("Not there: ", pickle_file, "\n")
    return None

In [None]:
def get_test_stats(study_name, test_file):
    """Score the best model of a study on a test set and save its predictions.

    Parameters
    ----------
    study_name : str
        Study whose pickled model is loaded via ``get_model``.
    test_file : str
        Path to a CSV file with ``SMILES`` and ``VALUE`` columns.

    Returns
    -------
    dict-like
        Keys ``"r2"``, ``"rmse"``, ``"mae"`` and ``"number_of_mols"``.
        When no model file exists, a ``defaultdict`` yielding NaN for every
        key is returned and nothing is written.

    Side effects
    ------------
    Writes ``../pred_values/{study_name}_testset_values.csv`` with columns
    ``SMILES``, ``y_true`` and ``y_pred``. The directory is created if
    it does not exist yet.
    """

    model = get_model(study_name)

    if model is None:
        return collections.defaultdict(lambda: np.nan)

    df = pd.read_csv(test_file)
    expected = df["VALUE"].to_numpy().reshape(-1, 1)  # One-column matrix.
    predicted = model.predict_from_smiles(df["SMILES"])

    # R2.
    r2 = r2_score(y_true=expected, y_pred=predicted)
    # RMSE. sklearn 0.24 added squared=False to get RMSE, here we use np.sqrt().
    rmse = np.sqrt(mean_squared_error(y_true=expected, y_pred=predicted))
    # MAE.
    mae = mean_absolute_error(y_true=expected, y_pred=predicted)

    # Write to file y_true and y_pred for scatter plot.
    pred_values_path = f"../pred_values/{study_name}_testset_values.csv"
    # Make sure the output directory exists; exist_ok keeps this safe when
    # several jobs reach this point at the same time.
    os.makedirs(os.path.dirname(pred_values_path), exist_ok=True)
    df_values = pd.DataFrame(
        {
            "SMILES": list(df["SMILES"]),
            "y_true": list(df["VALUE"]),
            "y_pred": list(predicted),
        }
    )
    df_values.to_csv(pred_values_path, index=False)

    return {"r2": r2, "rmse": rmse, "mae": mae, "number_of_mols": len(df)}

In [None]:
def get_stats_df(datadir, prop, setid, alg):
    """Build the one-row statistics table for one endpoint/set/algorithm.

    The study name is ``MMP_{datadir}_{prop}_set{setid}_{alg}``; train and
    test data are read from ``../{datadir}/{prop}_set{setid}_train.csv`` and
    ``..._test.csv``. The returned DataFrame has a single row (index 0) with
    the endpoint, set, molecule counts, model name, and train/test
    R2, RMSE and MAE. ``n_all`` is left as NaN.
    """

    study_name = f"MMP_{datadir}_{prop}_set{setid}_{alg}"
    train_file = f"../{datadir}/{prop}_set{setid}_train.csv"
    test_file = f"../{datadir}/{prop}_set{setid}_test.csv"

    # Size of the training set is taken straight from the CSV.
    n_train = len(pd.read_csv(train_file))

    train_stats = get_train_stats(study_name)
    test_stats = get_test_stats(study_name, test_file)

    row = {
        "endpoint": prop,
        "set": setid,
        "n_all": np.nan,
        "n_train": n_train,
        "n_test": test_stats["number_of_mols"],
        "model": alg,
    }
    # Append the metric columns: all train_* first, then all test_*.
    metrics = ("r2", "rmse", "mae")
    for metric in metrics:
        row[f"train_{metric}"] = train_stats[metric]
    for metric in metrics:
        row[f"test_{metric}"] = test_stats[metric]

    return pd.DataFrame(row, index=[0])

## Main loop to calculate all scores

The loop comes in two versions, sequential and parallel. Run only one of the two cells below; each of them builds the same `df_all` table.

In [None]:
# Sequential version.
# Collect the one-row frames in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
dfs = []
for prop in props:
    for setid in sets:
        for alg in algs:
            print(f"Prop: {prop}, set: {setid}, alg: {alg}")
            dfs.append(get_stats_df(datadir, prop, setid, alg))

# ignore_index gives the combined table a clean 0..n-1 index (every
# one-row frame arrives with index 0).
df_all = pd.concat(dfs, ignore_index=True)
df_all

In [None]:
# Parallel version.
# Parallel and delayed are imported in the import cell at the top of the
# notebook. One get_stats_df call is scheduled per (prop, set, alg)
# combination, with n_jobs=7.
dfs = Parallel(n_jobs=7)(
    delayed(get_stats_df)(datadir, prop, setid, alg)
    for prop in props
    for setid in sets
    for alg in algs
)
# ignore_index gives the combined table a clean 0..n-1 index (every
# one-row frame arrives with index 0).
df_all = pd.concat(dfs, ignore_index=True)
df_all

In [None]:
# Save the combined statistics table; the row index is not written.
df_all.to_csv("../mmp_filled.csv", index=False)

## Additional computation of test scores for downsampled 10% data

These results will be used to "fill gaps" in the main table.

In [None]:
# Directory name of the downsampled (10%) data; used in place of `datadir`
# when building file paths and study names.
datadir_ds = "downsampled-10-percent"

In [None]:
# Same computation as the main loop, but for the downsampled data directory.
# Parallel and delayed are imported in the import cell at the top of the
# notebook, so this cell does not depend on the parallel main-loop cell.
dfs_ds = Parallel(n_jobs=7)(
    delayed(get_stats_df)(datadir_ds, prop, setid, alg)
    for prop in props
    for setid in sets
    for alg in algs
)
# ignore_index replaces the former `df_ds.reset_index()` call, whose result
# was discarded and therefore had no effect.
df_ds = pd.concat(dfs_ds, ignore_index=True)
df_ds.to_csv("../mmp_filled_ds.csv", index=False)