# ADMET Group classification tasks
This notebook contains the benchmark of Zaira-Chem for the ADMET datasets from Therapeutics Data Commons with classification tasks

In [1]:
import os
import pandas as pd
import numpy as np

# Relative path to the directory holding the stored prediction outputs.
# NOTE(review): the loading cells below hard-code "../predictions/..." rather
# than using this constant -- confirm whether they should share it.
PREDPATH = "../predictions"

In [2]:
# Identifiers of the H3D datasets evaluated in this notebook. Each identifier is
# used both as a sub-directory name under the ZairaChem predictions folder and
# as part of the Olinda model file name ("h3d_<identifier>.onnx").
h3d_datasets = [
    # clint_*
    "clint_Human",
    "clint_Mouse",
    "clint_Rat",
    # cytotoxicity_*
    "cytotoxicity_CHO",
    "cytotoxicity_HepG2",
    # remaining single-assay datasets
    "enzymology_PvPI4KB",
    "mtb_H37Rv",
    "plasmodium_NF54",
    "solubility_65",
]

## Model Evaluation
As an example, we provide the results of 8 folds of ZairaChem models for each dataset. The automated reports generated by ZairaChem, as well as the raw data outputs, are shared.

An example model is also provided in the /model folder.

*Some molecules cannot be inferred using the ZairaChem pipeline due to the molecular descriptors used. The prediction for each of these molecules is set to the mean of the valid predictions for the same dataset, and the molecules are saved for manual inspection.*

### ZairaChem

In [3]:
# Load the stored ZairaChem predictions for every H3D dataset.
#
# Results, all keyed by dataset identifier:
#   nan_mols          -> SMILES of molecules whose prediction is NaN
#   zaira_predictions -> list of scores with NaNs replaced by the dataset mean
#   truth             -> the "true-value" column of the prediction table
zairachem_predictions_list = []  # retained from the original cell; not populated here
nan_mols = {}

zaira_predictions = {}
truth = {}
for h in h3d_datasets:
    path = os.path.join("../predictions/h3d/zaira-chem", h)
    pred_file = pd.read_csv(os.path.join(path, "output_table.csv"))

    # Work on an explicit copy so the imputation never writes into pred_file.
    scores = pred_file["pred-value"].to_numpy(dtype=float, copy=True)
    missing = np.isnan(scores)

    # Select the affected rows with a boolean mask; this does not depend on the
    # frame having a 0..n-1 index, unlike positional lookups through .loc[n].
    nan_mols[h] = pred_file.loc[missing, "input-smiles"].tolist()

    # Replace NaN predictions by the mean of the valid predictions.
    if missing.any():
        scores[missing] = np.nanmean(scores)

    zaira_predictions[h] = scores.tolist()
    truth[h] = pred_file["true-value"]

In [4]:
from sklearn.metrics import roc_auc_score

# AUROC of the ZairaChem scores against the true labels, one entry per dataset.
zaira_roc_scores = {
    dataset: roc_auc_score(truth[dataset], zaira_predictions[dataset])
    for dataset in h3d_datasets
}
zaira_roc_scores

{'clint_Human': 0.662489070241912,
 'clint_Mouse': 0.6230661040787624,
 'clint_Rat': 0.6629588431590656,
 'cytotoxicity_CHO': 0.6416040100250626,
 'cytotoxicity_HepG2': 0.7215496368038741,
 'enzymology_PvPI4KB': 0.6829004329004329,
 'mtb_H37Rv': 0.7560606060606061,
 'plasmodium_NF54': 0.83335935467083,
 'solubility_65': 0.9111399494570043}

### Olinda

In [5]:
import onnx_runner

# NOTE: nan_mols is intentionally NOT reset in this cell. It holds the molecules
# collected while loading the ZairaChem predictions (kept for manual inspection),
# and this cell never adds anything to it, so re-initialising it here only
# discarded that information.

# Directory containing the Olinda ONNX models; it is the same for every dataset,
# so it is computed once instead of on each loop iteration.
path = os.path.join("../models/h3d_models/")

# Olinda predictions keyed by dataset identifier.
olinda_predictions = {}
for h in h3d_datasets:
    onnx_model = onnx_runner.onnx_runner(os.path.join(path, "h3d_" + h + ".onnx"))

    # Predict on the SMILES listed in the ZairaChem output table of the same
    # dataset, so the rows line up with the labels stored in `truth`.
    zaira_path = os.path.join("../predictions/h3d/zaira-chem", h)
    pred_file = pd.read_csv(os.path.join(zaira_path, "output_table.csv"))
    test_smiles_list = pred_file["input-smiles"].tolist()

    y_pred_test = onnx_model.predict(test_smiles_list)

    olinda_predictions[h] = y_pred_test

In [6]:
from sklearn.metrics import roc_auc_score

# AUROC of the Olinda scores against the same true labels, one entry per dataset.
olinda_roc_scores = {
    assay: roc_auc_score(truth[assay], olinda_predictions[assay])
    for assay in h3d_datasets
}
olinda_roc_scores

{'clint_Human': 0.6303555814631303,
 'clint_Mouse': 0.5812236286919831,
 'clint_Rat': 0.657285873192436,
 'cytotoxicity_CHO': 0.6872273275782048,
 'cytotoxicity_HepG2': 0.7070217917675545,
 'enzymology_PvPI4KB': 0.7283549783549783,
 'mtb_H37Rv': 0.8068181818181818,
 'plasmodium_NF54': 0.8313296903460838,
 'solubility_65': 0.9106618400382488}

In [23]:
# Build the ZairaChem vs. Olinda AUROC comparison table (one row per assay).

# Look the scores up by assay name so every row is guaranteed to carry the
# scores of the assay it is labelled with, independent of dict ordering.
zaira_rocs = [zaira_roc_scores[assay] for assay in h3d_datasets]
olinda_rocs = [olinda_roc_scores[assay] for assay in h3d_datasets]

diffs = []
fraction = []
for zaira_auroc, olinda_auroc in zip(zaira_rocs, olinda_rocs):
    # Signed difference written from Olinda's point of view: "-" when ZairaChem
    # is ahead after rounding to 3 decimals, "+" otherwise (a rounded tie is
    # therefore rendered as "+0.0").
    gap = round(zaira_auroc - olinda_auroc, 3)
    sign = "-" if gap > 0 else "+"
    diffs.append(sign + str(abs(gap)))

    # Olinda AUROC as a fraction of the ZairaChem AUROC (1.0 means parity).
    fraction.append(round(olinda_auroc / zaira_auroc, 2))

df = pd.DataFrame(
    columns=["assay", "zaira-chem auroc", "olinda auroc", "delta", "perc_performance"],
    data=list(zip(h3d_datasets, zaira_rocs, olinda_rocs, diffs, fraction)),
)

In [24]:
# Display the per-assay ZairaChem vs. Olinda AUROC comparison table.
df

Unnamed: 0,assay,zaira-chem auroc,olinda auroc,delta,perc_performance
0,clint_Human,0.662489,0.630356,-0.032,0.95
1,clint_Mouse,0.623066,0.581224,-0.042,0.93
2,clint_Rat,0.662959,0.657286,-0.006,0.99
3,cytotoxicity_CHO,0.641604,0.687227,0.046,1.07
4,cytotoxicity_HepG2,0.72155,0.707022,-0.015,0.98
5,enzymology_PvPI4KB,0.6829,0.728355,0.045,1.07
6,mtb_H37Rv,0.756061,0.806818,0.051,1.07
7,plasmodium_NF54,0.833359,0.83133,-0.002,1.0
8,solubility_65,0.91114,0.910662,0.0,1.0
