# Import Libraries

In [1]:
# Import libraries
%matplotlib inline
import json
import os
import sys
import yaml
from typing import List, Tuple

import datamol as dm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
from rdkit import Chem
from rdkit.Chem import Draw, rdFMCS
from rdkit.Chem.Draw import IPythonConsole

repo_path = os.path.dirname(os.path.abspath(""))
CHECKOUT_PATH = repo_path
DATASET_PATH = os.path.join(repo_path, "datasets")

os.chdir(CHECKOUT_PATH)
sys.path.insert(0, CHECKOUT_PATH)

# Import alinemol 
from alinemol.preprocessing import drop_duplicates, standardize_smiles
from alinemol.splitters.splits import MolecularLogPSplit
from alinemol.utils.split_utils import (EmpiricalKernelMapTransformer,
                                      convert_to_default_feats_if_smiles,
                                      get_scaffold)



Draw.SetComicMode(IPythonConsole.drawOptions)

In [2]:
# Load the configuration file (wich contains datasets, models, and splitting)
CFG = yaml.safe_load(open(os.path.join(DATASET_PATH, "config.yml"), "r"))

ML_MODELS: List = CFG["models"]["ML"]
SCRATCH_GNN_MODELS: List = CFG["models"]["GNN"]["scratch"]
PRETRAINED_GNN_MODELS: List = CFG["models"]["GNN"]["pretrained"]
GNN_MODELS: List = SCRATCH_GNN_MODELS + PRETRAINED_GNN_MODELS
ALL_MODELS: List[List] = [ML_MODELS, SCRATCH_GNN_MODELS, PRETRAINED_GNN_MODELS]
DATASET_NAMES: List = CFG["datasets"]["TDC"]
SPLIT_TYPES: List = CFG["splitting"]   

# read the results that are saved in the results folder. This is used for the visualization
results = pd.read_csv(os.path.join("classification_results", "TDC", "results.csv")) 
results["model_type"] = results['model'].apply(lambda x: "Classical_ML" if x in ML_MODELS else "GNN")
metric_mapping = {'accuracy': 'Accuracy', 'roc_auc': 'ROC-AUC', 'pr_auc': 'PR-AUC'}


# Performance GAP between ID and OOD (ALL Models)

In [11]:
models = [item for sublist in ALL_MODELS for item in sublist]  # get the unique models
metric = "roc_auc"
vmin, vnmax = 0.0, 0.2

f_mean_id ={}
f_std_id = {}
f_mean_ood = {}
f_std_ood = {}
f_diff_mean = {}
f_diff_std = {}
for i, dataset in enumerate(DATASET_NAMES):
    result_subset = results[results["dataset"]==dataset]
    mean_df_id = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    std_df_id = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    mean_df_ood = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    std_df_ood = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    diff_mean = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    diff_std = pd.DataFrame(index=models, columns=SPLIT_TYPES)

    for model in models:
        for split in SPLIT_TYPES:
            mean_id = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"].mean()
            mean_ood = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"].mean()
            std_id = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"].std()
            std_ood = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"].std()
            diff = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"] - result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"]
            mean_df_id.loc[model, split] = mean_id
            std_df_id.loc[model, split] = std_id
            mean_df_ood.loc[model, split] = mean_ood
            std_df_ood.loc[model, split] = std_ood
            diff_mean.loc[model, split] = diff.mean()
            diff_std.loc[model, split] = diff.std()
    
    # just for the models index, devide them into ML, GNN and PRETRAINED_GNN and calculate average performance for each

    mean_df_id = mean_df_id.mean()
    mean_df_id = mean_df_id.astype(float)
    f_mean_id[dataset] = mean_df_id.round(2)

    std_df_id = std_df_id.mean()
    std_df_id = std_df_id.astype(float)
    f_std_id[dataset] = std_df_id.round(2)

    mean_df_ood = mean_df_ood.mean()
    mean_df_ood = mean_df_ood.astype(float)
    f_mean_ood[dataset] = mean_df_ood.round(2)

    std_df_ood = std_df_ood.mean()
    std_df_ood = std_df_ood.astype(float)
    f_std_ood[dataset] = std_df_ood.round(2)

    diff_mean = diff_mean.mean()
    diff_mean = diff_mean.astype(float)
    f_diff_mean[dataset] = diff_mean.round(2)

    diff_std = diff_std.mean()
    diff_std = diff_std.astype(float)
    f_diff_std[dataset] = diff_std.round(2)

In [13]:
# for f, put split on the rows and dataset on the columns and create new dataframe
df_mean_id = pd.DataFrame()
df_std_id = pd.DataFrame()
df_mean_ood = pd.DataFrame()
df_std_ood = pd.DataFrame()
df_diff_mean = pd.DataFrame()
df_diff_std = pd.DataFrame()


for dataset in DATASET_NAMES:
    df_mean_id[dataset] = f_mean_id[dataset]
    df_std_id[dataset] = f_std_id[dataset]
    df_mean_ood[dataset] = f_mean_ood[dataset]
    df_std_ood[dataset] = f_std_ood[dataset]
    df_diff_mean[dataset] = f_diff_mean[dataset]
    df_diff_std[dataset] = f_diff_std[dataset]


formatted_id = df_mean_id.astype(str) + " (" + df_std_id.astype(str) + ")"
formatted_ood = df_mean_ood.astype(str) + " (" + df_std_ood.astype(str) + ")"
formatted_diff = df_diff_mean.astype(str) + " (" + df_diff_std.astype(str) + ")"

In [17]:
combined_df = pd.concat([formatted_id, formatted_ood, formatted_diff], keys=['Test (ID)', 'Test (OOD)', 'Gap']).swaplevel(0, 1).sort_index()

# Define custom orders for each level
split_order = SPLIT_TYPES  # custom order for splits  
performance_order = ["Test (ID)", "Test (OOD)", "Gap"]  # custom order

idx = pd.MultiIndex.from_product([split_order, performance_order], names=['Domain', 'Performance'])

combined_df = combined_df.reindex(idx)
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,CYP1A2,CYP2C9,CYP2C19,CYP2D6,CYP3A4,HIV,AMES,HERG
Domain,Performance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
random,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.8 (0.02),0.67 (0.02),0.88 (0.01)
random,Test (OOD),0.77 (0.01),0.79 (0.01),0.86 (0.01),0.86 (0.01),0.76 (0.01),0.8 (0.02),0.67 (0.01),0.88 (0.01)
random,Gap,-0.0 (0.02),-0.0 (0.01),-0.0 (0.01),-0.0 (0.01),0.0 (0.01),-0.0 (0.03),0.0 (0.02),0.0 (0.01)
scaffold,Test (ID),0.78 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.79 (0.02),0.67 (0.01),0.88 (0.01)
scaffold,Test (OOD),0.76 (0.02),0.77 (0.01),0.85 (0.01),0.85 (0.01),0.75 (0.02),0.77 (0.02),0.64 (0.02),0.84 (0.01)
scaffold,Gap,0.02 (0.02),0.01 (0.01),0.01 (0.02),0.01 (0.02),0.01 (0.03),0.02 (0.04),0.03 (0.02),0.04 (0.01)
scaffold_generic,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.79 (0.02),0.67 (0.02),0.89 (0.01)
scaffold_generic,Test (OOD),0.76 (0.02),0.77 (0.01),0.84 (0.01),0.85 (0.02),0.75 (0.02),0.77 (0.03),0.64 (0.02),0.82 (0.02)
scaffold_generic,Gap,0.01 (0.02),0.01 (0.01),0.01 (0.01),0.01 (0.02),0.01 (0.02),0.03 (0.04),0.03 (0.02),0.07 (0.02)
molecular_weight,Test (ID),0.78 (0.0),0.76 (0.0),0.86 (0.0),0.86 (0.0),0.76 (0.0),0.73 (0.02),0.66 (0.01),0.88 (0.0)


In [18]:
latex_table = combined_df.to_latex(escape=False, index=True, float_format="{:.2f}".format, buf="assets/Model_comparison.tex")

# Performance GAP between ID and OOD (ML and GNN Models separately)

In [3]:
models = [item for sublist in ALL_MODELS for item in sublist]  # get the unique models
metric = "roc_auc"
vmin, vnmax = 0.0, 0.2

f_mean_id ={}
f_std_id = {}
f_mean_ood = {}
f_std_ood = {}
f_diff_mean = {}
f_diff_std = {}
for i, dataset in enumerate(DATASET_NAMES):
    result_subset = results[results["dataset"]==dataset]
    mean_df_id = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    std_df_id = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    mean_df_ood = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    std_df_ood = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    diff_mean = pd.DataFrame(index=models, columns=SPLIT_TYPES)
    diff_std = pd.DataFrame(index=models, columns=SPLIT_TYPES)

    for model in models:
        for split in SPLIT_TYPES:
            mean_id = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"].mean()
            mean_ood = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"].mean()
            std_id = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"].std()
            std_ood = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"].std()
            diff = result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"ID_test_{metric}"] - result_subset[(result_subset["model"] == model) & (result_subset["split"] == split)][f"OOD_test_{metric}"]
            mean_df_id.loc[model, split] = mean_id
            std_df_id.loc[model, split] = std_id
            mean_df_ood.loc[model, split] = mean_ood
            std_df_ood.loc[model, split] = std_ood
            diff_mean.loc[model, split] = diff.mean()
            diff_std.loc[model, split] = diff.std()
    
    # just for the models index, devide them into ML, GNN and PRETRAINED_GNN and calculate average performance for each
    mean_df_id.index = np.where(mean_df_id.index.isin(ML_MODELS), "ML", "GNN")
    std_df_id.index = np.where(std_df_id.index.isin(ML_MODELS), "ML", "GNN")
    mean_df_ood.index = np.where(mean_df_ood.index.isin(ML_MODELS), "ML", "GNN")
    std_df_ood.index = np.where(std_df_ood.index.isin(ML_MODELS), "ML", "GNN")
    diff_mean.index = np.where(diff_mean.index.isin(ML_MODELS), "ML", "GNN")
    diff_std.index = np.where(diff_std.index.isin(ML_MODELS), "ML", "GNN")

    mean_df_id = mean_df_id.groupby(mean_df_id.index).mean()
    mean_df_id = mean_df_id.astype(float)
    f_mean_id[dataset] = mean_df_id.round(2)

    std_df_id = std_df_id.groupby(std_df_id.index).mean()
    std_df_id = std_df_id.astype(float)
    f_std_id[dataset] = std_df_id.round(2)

    mean_df_ood = mean_df_ood.groupby(mean_df_ood.index).mean()
    mean_df_ood = mean_df_ood.astype(float)
    f_mean_ood[dataset] = mean_df_ood.round(2)

    std_df_ood = std_df_ood.groupby(std_df_ood.index).mean()
    std_df_ood = std_df_ood.astype(float)
    f_std_ood[dataset] = std_df_ood.round(2)

    diff_mean = diff_mean.groupby(diff_mean.index).mean()
    diff_mean = diff_mean.astype(float)
    f_diff_mean[dataset] = diff_mean.round(2)

    diff_std = diff_std.groupby(diff_std.index).mean()
    diff_std = diff_std.astype(float)
    f_diff_std[dataset] = diff_std.round(2)

In [4]:
# for f, put split on the rows and dataset on the columns and create new dataframe
df_mean_id_GNN = pd.DataFrame()
df_std_id_GNN = pd.DataFrame()
df_mean_ood_GNN = pd.DataFrame()
df_std_ood_GNN = pd.DataFrame()

df_mean_id_ML = pd.DataFrame()
df_std_id_ML = pd.DataFrame()
df_mean_ood_ML = pd.DataFrame()
df_std_ood_ML = pd.DataFrame()

df_diff_mean_ML = pd.DataFrame()
df_diff_std_ML = pd.DataFrame()
df_diff_mean_GNN = pd.DataFrame()
df_diff_std_GNN = pd.DataFrame()

for dataset in DATASET_NAMES:
    df_mean_id_GNN[dataset] = f_mean_id[dataset].loc["GNN"]
    df_std_id_GNN[dataset] = f_std_id[dataset].loc["GNN"]
    df_mean_ood_GNN[dataset] = f_mean_ood[dataset].loc["GNN"]
    df_std_ood_GNN[dataset] = f_std_ood[dataset].loc["GNN"]

    df_mean_id_ML[dataset] = f_mean_id[dataset].loc["ML"]
    df_std_id_ML[dataset] = f_std_id[dataset].loc["ML"]
    df_mean_ood_ML[dataset] = f_mean_ood[dataset].loc["ML"]
    df_std_ood_ML[dataset] = f_std_ood[dataset].loc["ML"]

    df_diff_mean_ML[dataset] = f_diff_mean[dataset].loc["ML"]
    df_diff_std_ML[dataset] = f_diff_std[dataset].loc["ML"]
    df_diff_mean_GNN[dataset] = f_diff_mean[dataset].loc["GNN"]
    df_diff_std_GNN[dataset] = f_diff_std[dataset].loc["GNN"]

formatted_id_GNN = df_mean_id_GNN.astype(str) + " (" + df_std_id_GNN.astype(str) + ")"
formatted_ood_GNN = df_mean_ood_GNN.astype(str) + " (" + df_std_ood_GNN.astype(str) + ")"
formatted_id_ML = df_mean_id_ML.astype(str) + " (" + df_std_id_ML.astype(str) + ")"
formatted_ood_ML = df_mean_ood_ML.astype(str) + " (" + df_std_ood_ML.astype(str) + ")"
formatted_diff_ML = df_diff_mean_ML.astype(str) + " (" + df_diff_std_ML.astype(str) + ")"
formatted_diff_GNN = df_diff_mean_GNN.astype(str) + " (" + df_diff_std_GNN.astype(str) + ")"

In [6]:
combined_df_ml = pd.concat([formatted_id_ML, formatted_ood_ML, formatted_diff_ML], keys=['Test (ID)', 'Test (OOD)', 'Gap']).swaplevel(0, 1).sort_index()
combined_df_gnn = pd.concat([formatted_id_GNN, formatted_ood_GNN, formatted_diff_GNN], keys=['Test (ID)', 'Test (OOD)', 'Gap']).swaplevel(0, 1).sort_index()
#combined_df = pd.concat([formatted_id_GNN, formatted_ood_GNN], sort=True)
#sorted_df = combined_df.sort_index()

combined_df = pd.concat([combined_df_ml, combined_df_gnn], keys=['Classical_ML', 'GNN']).swaplevel(0, 1).sort_index()
combined_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,CYP1A2,CYP2C9,CYP2C19,CYP2D6,CYP3A4,HIV,AMES,HERG
kmeans,Classical_ML,Gap,0.02 (0.15),0.1 (0.04),0.1 (0.06),0.07 (0.02),0.1 (0.05),0.05 (0.08),0.02 (0.04),0.16 (0.05)
kmeans,Classical_ML,Test (ID),0.77 (0.03),0.8 (0.01),0.85 (0.01),0.86 (0.01),0.78 (0.02),0.81 (0.02),0.65 (0.01),0.91 (0.01)
kmeans,Classical_ML,Test (OOD),0.75 (0.12),0.7 (0.04),0.75 (0.07),0.8 (0.02),0.67 (0.04),0.76 (0.07),0.63 (0.03),0.74 (0.04)
kmeans,GNN,Gap,0.01 (0.13),0.05 (0.04),0.05 (0.04),0.05 (0.02),0.11 (0.03),0.06 (0.11),0.02 (0.03),0.12 (0.03)
kmeans,GNN,Test (ID),0.77 (0.03),0.79 (0.01),0.85 (0.01),0.86 (0.01),0.77 (0.02),0.79 (0.04),0.66 (0.01),0.88 (0.01)
kmeans,GNN,Test (OOD),0.76 (0.11),0.73 (0.04),0.8 (0.04),0.82 (0.02),0.66 (0.02),0.73 (0.08),0.64 (0.02),0.76 (0.03)
max_dissimilarity,Classical_ML,Gap,-0.08 (0.06),0.14 (0.05),0.12 (0.06),0.09 (0.04),0.05 (0.07),0.03 (0.05),0.03 (0.02),0.14 (0.06)
max_dissimilarity,Classical_ML,Test (ID),0.73 (0.01),0.76 (0.01),0.83 (0.01),0.85 (0.01),0.74 (0.01),0.81 (0.02),0.67 (0.02),0.9 (0.0)
max_dissimilarity,Classical_ML,Test (OOD),0.81 (0.06),0.63 (0.05),0.71 (0.06),0.76 (0.04),0.69 (0.06),0.78 (0.05),0.64 (0.01),0.76 (0.06)
max_dissimilarity,GNN,Gap,-0.04 (0.09),0.12 (0.05),0.09 (0.06),0.04 (0.03),0.1 (0.09),0.08 (0.07),0.04 (0.02),0.11 (0.04)


In [7]:
# Define custom orders for each level
split_order = SPLIT_TYPES  # custom order for splits  
model_order = ["Classical_ML", "GNN"]  # custom order for models
performance_order = ["Test (ID)", "Test (OOD)", "Gap"]  # custom order

idx = pd.MultiIndex.from_product([split_order, model_order, performance_order], names=['Domain', 'Model Type', 'Performance'])

combined_df = combined_df.reindex(idx)
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CYP1A2,CYP2C9,CYP2C19,CYP2D6,CYP3A4,HIV,AMES,HERG
Domain,Model Type,Performance,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
random,Classical_ML,Test (ID),0.77 (0.01),0.79 (0.01),0.85 (0.01),0.86 (0.01),0.77 (0.01),0.81 (0.02),0.66 (0.01),0.9 (0.01)
random,Classical_ML,Test (OOD),0.77 (0.01),0.79 (0.01),0.85 (0.01),0.86 (0.01),0.77 (0.01),0.81 (0.02),0.66 (0.01),0.9 (0.01)
random,Classical_ML,Gap,-0.0 (0.02),-0.0 (0.02),-0.01 (0.01),0.0 (0.01),0.0 (0.01),0.0 (0.03),0.0 (0.02),0.0 (0.01)
random,GNN,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.79 (0.02),0.67 (0.02),0.87 (0.01)
random,GNN,Test (OOD),0.78 (0.01),0.78 (0.01),0.86 (0.01),0.86 (0.01),0.76 (0.01),0.79 (0.02),0.67 (0.01),0.87 (0.01)
random,GNN,Gap,-0.0 (0.01),-0.0 (0.01),-0.0 (0.01),-0.0 (0.01),0.0 (0.01),-0.0 (0.03),-0.0 (0.02),0.0 (0.01)
scaffold,Classical_ML,Test (ID),0.77 (0.01),0.79 (0.01),0.85 (0.0),0.85 (0.01),0.77 (0.01),0.8 (0.02),0.66 (0.01),0.9 (0.01)
scaffold,Classical_ML,Test (OOD),0.75 (0.02),0.77 (0.02),0.84 (0.01),0.84 (0.02),0.76 (0.02),0.78 (0.02),0.63 (0.02),0.86 (0.01)
scaffold,Classical_ML,Gap,0.02 (0.03),0.03 (0.02),0.01 (0.02),0.02 (0.02),0.01 (0.02),0.02 (0.04),0.03 (0.03),0.04 (0.01)
scaffold,GNN,Test (ID),0.78 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.78 (0.03),0.67 (0.01),0.88 (0.01)


In [8]:
latex_table = combined_df.to_latex(escape=False, index=True, float_format="{:.2f}".format, buf="assets/ML_GNN_comparison.tex")