# Import Libraries

In [2]:
# Import libraries
%matplotlib inline
import json
import os
import sys
import yaml
from typing import List, Tuple

import datamol as dm
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
from rdkit import Chem
from rdkit.Chem import Draw, rdFMCS
from rdkit.Chem.Draw import IPythonConsole

# Remember the directory the kernel started in exactly once. This cell changes
# the working directory below, so deriving the repository root from the current
# directory on every run would climb one level higher each time the cell is
# re-executed.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.path.abspath("")

# Repository root = parent of the notebook directory.
repo_path = os.path.dirname(NOTEBOOK_DIR)
CHECKOUT_PATH = repo_path
DATASET_PATH = os.path.join(repo_path, "datasets")

# Work from the repository root so relative paths used later resolve there.
os.chdir(CHECKOUT_PATH)

# Keep the checkout first on the import path for the imports that follow;
# skip the insert when it is already in front so re-runs add no duplicates.
if sys.path[:1] != [CHECKOUT_PATH]:
    sys.path.insert(0, CHECKOUT_PATH)

# Import alinemol 
from alinemol.preprocessing import drop_duplicates, standardize_smiles
from alinemol.splitters.splits import MolecularLogPSplit
from alinemol.utils.split_utils import (EmpiricalKernelMapTransformer,
                                      convert_to_default_feats_if_smiles,
                                      get_scaffold)
from alinemol.utils import compare_rankings



# Apply RDKit's "comic mode" to the draw options used by the IPython console
# integration (presumably a hand-drawn look for molecules rendered in this
# notebook — confirm against the RDKit docs).
Draw.SetComicMode(IPythonConsole.drawOptions)

In [3]:
# Load the configuration file (which contains datasets, models, and splitting).
# A context manager is used so the file handle is closed as soon as parsing is done.
with open(os.path.join(DATASET_PATH, "config.yml"), "r") as config_file:
    CFG = yaml.safe_load(config_file)

# Model groups read from the "models" section of the configuration.
ML_MODELS: List = CFG["models"]["ML"]
SCRATCH_GNN_MODELS: List = CFG["models"]["GNN"]["scratch"]
PRETRAINED_GNN_MODELS: List = CFG["models"]["GNN"]["pretrained"]
GNN_MODELS: List = SCRATCH_GNN_MODELS + PRETRAINED_GNN_MODELS
ALL_MODELS: List[List] = [ML_MODELS, SCRATCH_GNN_MODELS, PRETRAINED_GNN_MODELS]

# Dataset names (TDC group) and split types read from the configuration.
DATASET_NAMES: List = CFG["datasets"]["TDC"]
SPLIT_TYPES: List = CFG["splitting"]

# Read the results that are saved in the results folder. This is used for the visualization.
# The path is relative, so it resolves against the current working directory.
results = pd.read_csv(os.path.join("classification_results", "TDC", "results.csv"))

# Tag every row: models listed in ML_MODELS are "Classical_ML", everything else is "GNN".
results["model_type"] = results['model'].apply(lambda x: "Classical_ML" if x in ML_MODELS else "GNN")

# Display names for the metric keys.
metric_mapping = {'accuracy': 'Accuracy', 'roc_auc': 'ROC-AUC', 'pr_auc': 'PR-AUC'}


# Performance GAP between ID and OOD (ALL Models)

In [92]:
# Metric whose ID / OOD test columns are summarised below.
metric = "roc_auc"


def _empty_summary_table():
    """Return an empty table with one row per split type and one column per dataset."""
    return pd.DataFrame(index=SPLIT_TYPES, columns=DATASET_NAMES)


mean_df_id, std_df_id = _empty_summary_table(), _empty_summary_table()
mean_df_ood, std_df_ood = _empty_summary_table(), _empty_summary_table()
diff_mean, diff_std = _empty_summary_table(), _empty_summary_table()

# Fill each (split, dataset) cell with the mean / std over the matching result rows.
for dataset in DATASET_NAMES:
    dataset_rows = results["dataset"] == dataset
    for splits in SPLIT_TYPES:
        df = results[dataset_rows & (results["split"] == splits)]
        id_scores = df[f"ID_test_{metric}"]
        ood_scores = df[f"OOD_test_{metric}"]
        gap_scores = id_scores - ood_scores  # row-wise ID minus OOD

        mean_df_id.loc[splits, dataset] = id_scores.mean()
        std_df_id.loc[splits, dataset] = id_scores.std()
        mean_df_ood.loc[splits, dataset] = ood_scores.mean()
        std_df_ood.loc[splits, dataset] = ood_scores.std()
        diff_mean.loc[splits, dataset] = gap_scores.mean()
        diff_std.loc[splits, dataset] = gap_scores.std()
        

In [93]:
def _two_decimals(value):
    """Render int/float values with two decimals; pass anything else through unchanged."""
    if isinstance(value, (int, float)):
        return f"{value:.2f}"
    return value


# Build the "mean (std)" cell text for the ID, OOD and gap summaries.
mean_df_id, std_df_id = mean_df_id.map(_two_decimals), std_df_id.map(_two_decimals)
df_id = mean_df_id + " (" + std_df_id + ")"

mean_df_ood, std_df_ood = mean_df_ood.map(_two_decimals), std_df_ood.map(_two_decimals)
df_ood = mean_df_ood + " (" + std_df_ood + ")"

diff_mean, diff_std = diff_mean.map(_two_decimals), diff_std.map(_two_decimals)
df_diff = diff_mean + " (" + diff_std + ")"

# Row orderings used for the final table.
split_order = SPLIT_TYPES
performance_order = ["Test (ID)", "Test (OOD)", "Gap"]

# Stack the three summaries, then put the split level first.
combined_df = (
    pd.concat([df_id, df_ood, df_diff], keys=['Test (ID)', 'Test (OOD)', 'Gap'])
    .swaplevel(0, 1)
    .sort_index()
)

# Reindex so rows follow the custom split / performance order instead of the sorted one.
idx = pd.MultiIndex.from_product(
    [split_order, performance_order],
    names=['Domain', 'Performance'],
)
combined_df = combined_df.reindex(idx)
combined_df

Unnamed: 0_level_0,Unnamed: 1_level_0,CYP1A2,CYP2C9,CYP2C19,CYP2D6,CYP3A4,HIV,AMES,HERG
Domain,Performance,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
random,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.80 (0.02),0.67 (0.02),0.88 (0.02)
random,Test (OOD),0.77 (0.01),0.79 (0.01),0.86 (0.01),0.86 (0.01),0.76 (0.02),0.80 (0.02),0.67 (0.02),0.88 (0.02)
random,Gap,-0.00 (0.02),-0.00 (0.01),-0.00 (0.01),-0.00 (0.01),0.00 (0.01),-0.00 (0.03),0.00 (0.02),0.00 (0.01)
scaffold,Test (ID),0.78 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.02),0.76 (0.02),0.79 (0.03),0.67 (0.02),0.88 (0.02)
scaffold,Test (OOD),0.76 (0.02),0.77 (0.02),0.85 (0.01),0.85 (0.02),0.75 (0.02),0.77 (0.03),0.64 (0.02),0.84 (0.02)
scaffold,Gap,0.02 (0.02),0.01 (0.02),0.01 (0.02),0.01 (0.02),0.01 (0.03),0.02 (0.04),0.03 (0.02),0.04 (0.01)
scaffold_generic,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.79 (0.03),0.67 (0.02),0.89 (0.02)
scaffold_generic,Test (OOD),0.76 (0.02),0.77 (0.01),0.84 (0.01),0.85 (0.02),0.75 (0.02),0.77 (0.04),0.64 (0.02),0.82 (0.02)
scaffold_generic,Gap,0.01 (0.02),0.01 (0.01),0.01 (0.01),0.01 (0.02),0.01 (0.02),0.03 (0.04),0.03 (0.02),0.07 (0.02)
molecular_weight,Test (ID),0.78 (0.01),0.76 (0.01),0.86 (0.01),0.86 (0.01),0.76 (0.01),0.73 (0.03),0.66 (0.01),0.88 (0.02)


In [94]:
# Render the table to LaTeX and keep the text in `latex_table`. When `buf` is
# passed, `to_latex` writes to the file and returns None, so the string is
# built first and written out explicitly afterwards.
latex_table = combined_df.to_latex(escape=False, index=True, float_format="{:.2f}".format)

# Create the output folder if it does not exist yet, so the write cannot fail
# on a checkout without it.
os.makedirs("assets", exist_ok=True)
with open("assets/Model_comparison.tex", "w", encoding="utf-8", newline="") as tex_file:
    tex_file.write(latex_table)

# Performance GAP between ID and OOD (ML and GNN Models separately)

In [98]:
# Metric whose ID / OOD test columns are summarised below.
metric = "roc_auc"


def _empty_model_table():
    """Return an empty table with one row per split type and one column per dataset."""
    return pd.DataFrame(index=SPLIT_TYPES, columns=DATASET_NAMES)


# One set of six tables (ID mean/std, OOD mean/std, gap mean/std) per model type.
mean_df_id_ML, std_df_id_ML = _empty_model_table(), _empty_model_table()
mean_df_id_GNN, std_df_id_GNN = _empty_model_table(), _empty_model_table()
mean_df_ood_ML, std_df_ood_ML = _empty_model_table(), _empty_model_table()
mean_df_ood_GNN, std_df_ood_GNN = _empty_model_table(), _empty_model_table()
diff_mean_ML, diff_std_ML = _empty_model_table(), _empty_model_table()
diff_mean_GNN, diff_std_GNN = _empty_model_table(), _empty_model_table()

# Lookup from model type to the tables it fills, in the order they are unpacked in the loop.
_tables_by_model_type = {
    "Classical_ML": (
        mean_df_id_ML, std_df_id_ML,
        mean_df_ood_ML, std_df_ood_ML,
        diff_mean_ML, diff_std_ML,
    ),
    "GNN": (
        mean_df_id_GNN, std_df_id_GNN,
        mean_df_ood_GNN, std_df_ood_GNN,
        diff_mean_GNN, diff_std_GNN,
    ),
}

for dataset in DATASET_NAMES:
    for splits in SPLIT_TYPES:
        for model_type in ["Classical_ML", "GNN"]:
            df = results[(results["dataset"] == dataset) & (results["split"] == splits) & (results["model_type"] == model_type)]
            id_scores = df[f"ID_test_{metric}"]
            ood_scores = df[f"OOD_test_{metric}"]
            gap_scores = id_scores - ood_scores  # row-wise ID minus OOD

            (id_mean_tbl, id_std_tbl,
             ood_mean_tbl, ood_std_tbl,
             gap_mean_tbl, gap_std_tbl) = _tables_by_model_type[model_type]

            id_mean_tbl.loc[splits, dataset] = id_scores.mean()
            id_std_tbl.loc[splits, dataset] = id_scores.std()
            ood_mean_tbl.loc[splits, dataset] = ood_scores.mean()
            ood_std_tbl.loc[splits, dataset] = ood_scores.std()
            gap_mean_tbl.loc[splits, dataset] = gap_scores.mean()
            gap_std_tbl.loc[splits, dataset] = gap_scores.std()

In [99]:
def _as_two_decimals(value):
    """Render int/float values with two decimals; pass anything else through unchanged."""
    if isinstance(value, (int, float)):
        return f"{value:.2f}"
    return value


# Classical ML: "mean (std)" cell text for ID, OOD and gap.
mean_df_id_ML, std_df_id_ML = mean_df_id_ML.map(_as_two_decimals), std_df_id_ML.map(_as_two_decimals)
df_id_ML = mean_df_id_ML + " (" + std_df_id_ML + ")"
mean_df_ood_ML, std_df_ood_ML = mean_df_ood_ML.map(_as_two_decimals), std_df_ood_ML.map(_as_two_decimals)
df_ood_ML = mean_df_ood_ML + " (" + std_df_ood_ML + ")"
diff_mean_ML, diff_std_ML = diff_mean_ML.map(_as_two_decimals), diff_std_ML.map(_as_two_decimals)
df_diff_ML = diff_mean_ML + " (" + diff_std_ML + ")"

# GNN: "mean (std)" cell text for ID, OOD and gap.
mean_df_id_GNN, std_df_id_GNN = mean_df_id_GNN.map(_as_two_decimals), std_df_id_GNN.map(_as_two_decimals)
df_id_GNN = mean_df_id_GNN + " (" + std_df_id_GNN + ")"
mean_df_ood_GNN, std_df_ood_GNN = mean_df_ood_GNN.map(_as_two_decimals), std_df_ood_GNN.map(_as_two_decimals)
df_ood_GNN = mean_df_ood_GNN + " (" + std_df_ood_GNN + ")"
diff_mean_GNN, diff_std_GNN = diff_mean_GNN.map(_as_two_decimals), diff_std_GNN.map(_as_two_decimals)
df_diff_GNN = diff_mean_GNN + " (" + diff_std_GNN + ")"

# Per model type: stack the three summaries and put the split level first.
combined_df_ml = (
    pd.concat([df_id_ML, df_ood_ML, df_diff_ML], keys=['Test (ID)', 'Test (OOD)', 'Gap'])
    .swaplevel(0, 1)
    .sort_index()
)
combined_df_gnn = (
    pd.concat([df_id_GNN, df_ood_GNN, df_diff_GNN], keys=['Test (ID)', 'Test (OOD)', 'Gap'])
    .swaplevel(0, 1)
    .sort_index()
)

# Stack both model types; after the swap the levels are (split, model type, performance).
combined_df = (
    pd.concat([combined_df_ml, combined_df_gnn], keys=['Classical_ML', 'GNN'])
    .swaplevel(0, 1)
    .sort_index()
)

# Row orderings used for the final table.
split_order = SPLIT_TYPES
model_order = ["Classical_ML", "GNN"]
performance_order = ["Test (ID)", "Test (OOD)", "Gap"]

# Reindex so rows follow the custom order instead of the sorted one.
idx = pd.MultiIndex.from_product(
    [split_order, model_order, performance_order],
    names=['Domain', 'Model Type', 'Performance'],
)

combined_df = combined_df.reindex(idx)
combined_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CYP1A2,CYP2C9,CYP2C19,CYP2D6,CYP3A4,HIV,AMES,HERG
Domain,Model Type,Performance,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
random,Classical_ML,Test (ID),0.77 (0.01),0.79 (0.02),0.85 (0.01),0.86 (0.01),0.77 (0.01),0.81 (0.02),0.66 (0.02),0.90 (0.01)
random,Classical_ML,Test (OOD),0.77 (0.02),0.79 (0.01),0.85 (0.01),0.86 (0.01),0.77 (0.01),0.81 (0.02),0.66 (0.02),0.90 (0.01)
random,Classical_ML,Gap,-0.00 (0.02),-0.00 (0.01),-0.01 (0.01),0.00 (0.01),0.00 (0.01),0.00 (0.02),0.00 (0.02),0.00 (0.01)
random,GNN,Test (ID),0.77 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.01),0.76 (0.01),0.79 (0.02),0.67 (0.02),0.87 (0.02)
random,GNN,Test (OOD),0.78 (0.01),0.78 (0.01),0.86 (0.01),0.86 (0.01),0.76 (0.02),0.79 (0.02),0.67 (0.01),0.87 (0.02)
random,GNN,Gap,-0.00 (0.01),-0.00 (0.01),-0.00 (0.01),-0.00 (0.01),0.00 (0.01),-0.00 (0.03),-0.00 (0.02),0.00 (0.01)
scaffold,Classical_ML,Test (ID),0.77 (0.01),0.79 (0.01),0.85 (0.00),0.85 (0.01),0.77 (0.01),0.80 (0.02),0.66 (0.02),0.90 (0.01)
scaffold,Classical_ML,Test (OOD),0.75 (0.02),0.77 (0.02),0.84 (0.01),0.84 (0.01),0.76 (0.02),0.78 (0.02),0.63 (0.03),0.86 (0.01)
scaffold,Classical_ML,Gap,0.02 (0.02),0.03 (0.02),0.01 (0.01),0.02 (0.02),0.01 (0.02),0.02 (0.04),0.03 (0.02),0.04 (0.01)
scaffold,GNN,Test (ID),0.78 (0.01),0.78 (0.01),0.85 (0.01),0.85 (0.02),0.76 (0.02),0.78 (0.03),0.67 (0.01),0.88 (0.02)


In [100]:
# Render the table to LaTeX and keep the text in `latex_table`. When `buf` is
# passed, `to_latex` writes to the file and returns None, so the string is
# built first and written out explicitly afterwards.
latex_table = combined_df.to_latex(escape=False, index=True, float_format="{:.2f}".format)

# Create the output folder if it does not exist yet, so the write cannot fail
# on a checkout without it.
os.makedirs("assets", exist_ok=True)
with open("assets/ML_GNN_comparison.tex", "w", encoding="utf-8", newline="") as tex_file:
    tex_file.write(latex_table)

# Correlation between Splitters (Ranking Splitters)

In [34]:
# Per-split medians of the "tanimoto" and "tmd" columns of the nearest-distance table.
dist_df = pd.read_csv(os.path.join(DATASET_PATH, "TDC", "nearest_distances.csv"))
jaccard_df = dist_df.groupby(["split"])["tanimoto"].median().reset_index()
tmd_df = dist_df.groupby(["split"])["tmd"].median().reset_index()

metric = 'roc_auc'
metric_mapping = {'accuracy': 'Accuracy', 'roc_auc': 'ROC-AUC', 'pr_auc': 'PR-AUC'}

# Row-wise ID minus OOD performance, stored on `results` as the "diff" column.
diff = results[f"ID_test_{metric}"] - results[f"OOD_test_{metric}"]
results["diff"] = diff

# groupby based on split and model_type
grouped = results.groupby(["split", "model_type"])["diff"].median().reset_index()
grouped_ml = grouped[grouped["model_type"] == "Classical_ML"]
grouped_gnn = grouped[grouped["model_type"] == "GNN"]

# The split names, in the order every condition list must follow.
categories = jaccard_df["split"].tolist()


def _values_in_split_order(frame, column, split_names):
    """Return `frame[column]` as a list ordered like `split_names`.

    Values are looked up by the frame's "split" column instead of by row
    position, so frames that come from different tables (and may hold a
    different set or order of splits) cannot be paired up incorrectly.

    Raises:
        ValueError: if a name in `split_names` has no row in `frame`.
    """
    by_split = frame.set_index("split")[column]
    missing = [name for name in split_names if name not in by_split.index]
    if missing:
        raise ValueError(f"No '{column}' value for split(s): {missing}")
    return by_split.reindex(split_names).tolist()


condition1 = _values_in_split_order(jaccard_df, "tanimoto", categories)
condition2 = _values_in_split_order(tmd_df, "tmd", categories)
condition3 = _values_in_split_order(grouped_ml, "diff", categories)
condition4 = _values_in_split_order(grouped_gnn, "diff", categories)

# for all pairwise comparisons of conditions, calculate the spearman correlation and kendall tau
all_conditions = [condition1, condition2, condition3, condition4]
all_pairs = [(i, j) for i in range(len(all_conditions)) for j in range(i+1, len(all_conditions))]
all_pairs_conditions = [(all_conditions[i], all_conditions[j]) for i, j in all_pairs]

# Results keyed by the (i, j) index pair into `all_conditions`.
corr = {}
for i, j in all_pairs:
    c1, c2 = all_conditions[i], all_conditions[j]
    r = compare_rankings(c1, c2, categories)
    spearman, kendall = r["spearman_correlation"], r["kendall_tau"]
    corr[(i, j)] = {"spearman_correlation": spearman, "kendall_tau": kendall}
    print(f"Pairwise comparison between conditions {i} and {j}:")
    print(f"Pairwise comparison between conditions {c1} and {c2}:")
    print(f"Spearman correlation: {spearman:.3f}")
    print(f"Kendall tau: {kendall:.3f}")
    print("\n")


Pairwise comparison between conditions 0 and 1:
Pairwise comparison between conditions [0.6714285714285714, 0.6794871794871795, 0.631578947368421, 0.6307692307692307, 0.6825396825396826, 0.5774647887323944, 0.6103896103896104, 0.6142857142857143] and [163.63, 176.13, 175.37, 237.65, 152.27, 120.56, 133.94, 138.21]:
Spearman correlation: 0.595
Kendall tau: 0.429


Pairwise comparison between conditions 0 and 2:
Pairwise comparison between conditions [0.6714285714285714, 0.6794871794871795, 0.631578947368421, 0.6307692307692307, 0.6825396825396826, 0.5774647887323944, 0.6103896103896104, 0.6142857142857143] and [0.07947796702675464, 0.06905058792194707, 0.05643878315723588, 0.03969601668821565, 0.08337859035966705, -0.0006075039333802157, 0.022156833333412784, 0.018824789490587412]:
Spearman correlation: 0.952
Kendall tau: 0.857


Pairwise comparison between conditions 0 and 3:
Pairwise comparison between conditions [0.6714285714285714, 0.6794871794871795, 0.631578947368421, 0.6307692307

In [27]:
# Number of condition pairs held in `all_pairs`.
# NOTE(review): this cell's recorded output and execution count appear to come
# from an earlier kernel state; with four conditions and i < j pairs the value
# should be 6 — re-run after the cell that builds `all_pairs` to confirm.
len(all_pairs)

16