# Install packages (Colab)

In [1]:
import torch
# Show the installed torch build (version plus CUDA tag); the same string is
# interpolated into the PyG wheel index URL in the install cell below.
print(torch.__version__)

2.8.0+cu126


In [None]:
pyg_url = f"https://data.pyg.org/whl/torch-{torch.__version__}.html"
!pip install pyg-lib torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f $pyg_url
!pip install rdkit

# Process competition test data

In [1]:
# Fetch the pADME repository and make it the working directory, so that the
# relative paths used below (data/..., models/...) and the project imports
# (data.*, src.*, evaluation.*) resolve.
# NOTE(review): this cell is not idempotent - re-running it while already inside
# pADME would clone a nested copy and cd into it; run it once per session.
!git clone https://github.com/LongHung-Pham/pADME.git
%cd pADME

/content/pADME


In [None]:
# Download the ADMET table from Zenodo and save it as data/ADMET.csv, the path
# that the next cell reads with pandas.
!wget https://zenodo.org/records/15582067/files/ADMET.csv -O data/ADMET.csv     # competition data

In [2]:
import pandas as pd
import numpy as np

# Read the downloaded competition table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/ADMET.csv')
df.head(n=5)

Unnamed: 0,CXSMILES,HLM,KSOL,LogD,MDR1-MDCKII,MLM,Molecule Name,Set
0,COC1=CC=CC(Cl)=C1NC(=O)N1CCC[C@H](C(N)=O)C1 |a...,,,0.3,2.0,,ASAP-0032437,Train
1,O=C(NCC(F)F)[C@H](NC1=CC2=C(C=C1Br)CNC2)C1=CC(...,,333.0,2.9,0.2,,ASAP-0031915,Train
2,O=C(NCC(F)F)[C@H](NC1=CC=C2CNCC2=C1)C1=CC(Br)=...,,,0.4,0.5,,ASAP-0031884,Train
3,NC(=O)[C@H]1CCCN(C(=O)CC2=CC=CC3=C2C=CO3)C1 |&...,,376.0,1.0,8.5,,ASAP-0031848,Train
4,CC1=CC(CC(=O)N2CCC[C@H](C(N)=O)C2)=CC=N1 |&1:11|,,375.0,-0.3,0.9,,ASAP-0031813,Train


In [3]:
# Keep only the rows whose 'Set' column marks them as the Test split.
test = df.loc[df['Set'].eq('Test')]

# Export just the CXSMILES column (header row, no index) for the graph loader.
test[['CXSMILES']].to_csv('data/TEST_smiles.csv', index=False)

In [4]:
from data.datasets import PredictionDataset
from data.data_processing import load_drug_smile
from torch_geometric.loader import DataLoader

# Parse the SMILES strings from column 0 of the exported test CSV.
xd, smile_graph = load_drug_smile(
    'data/TEST_smiles.csv',
    smile_col_index=0,
)

# Wrap the parsed molecules in the project's prediction dataset, stored under pytorch_data/.
test_dataset = PredictionDataset(
    root='pytorch_data',
    dataset='TEST_prediction',
    xd=xd,
    smile_graph=smile_graph,
)

# shuffle=False keeps the batches in dataset order.
# NOTE(review): presumably this must match the row order of `test`, since the
# predictions are later scored against columns of `test` - confirm.
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

Pre-processed data found: pytorch_data/processed/TEST_prediction.pt, loading ...


# Run prediction (Leaderboard results)

In [None]:
from src.model import GNN_net
import torch

target_cols = ['KSOL', 'LogD', 'HLM', 'MLM', 'MDR1-MDCKII']

# Maps each fine-tuning dataset name ('KSOL', 'HLM', 'MLM', ...) to the existing
# prediction head ('y_sol', 'y_clint', ...) whose output is read for that target.
name_map = {'KSOL': 'y_sol', 'HLM': 'y_clint', 'MLM': 'y_clint', 'LogD': 'y_logd', 'MDR1-MDCKII': 'y_clint'}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Targets whose raw outputs are passed through 10 ** x before scoring; LogD is
# used exactly as predicted.
# NOTE(review): presumably these heads were trained on log10-transformed values - confirm.
log10_targets = {'KSOL', 'HLM', 'MLM', 'MDR1-MDCKII'}

# Filled with one NumPy array of test-set predictions per target.
y_pred = {}

for tgt in target_cols:
    print(f'Making predictions for: {tgt}')

    model = GNN_net(num_gnn_layers = 3, graph_pooling = 'attention', JK = 'last',
                    h_dim = 256, ffn_dim = 64)
    # map_location=device handles both the CUDA and the CPU-only case, so no
    # separate branch on torch.cuda.is_available() is needed.
    state_dict = torch.load(f'models/leaderboard_models/best_finetuned_model_{tgt}.pt',
                            map_location = device)
    model.load_state_dict(state_dict)

    model = model.to(device)
    model.eval()

    # Collect per-batch outputs and concatenate once; calling torch.cat inside
    # the loop would re-copy the growing tensor on every batch.
    batch_preds = []
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            predictions = model(data)
            batch_preds.append(predictions[name_map[tgt]].cpu().flatten())
    # An empty loader yields an empty tensor (torch.cat rejects an empty list).
    total_preds = torch.cat(batch_preds, 0) if batch_preds else torch.Tensor()

    if tgt in log10_targets:
        y_pred[tgt] = 10 ** total_preds.numpy()
    else:
        y_pred[tgt] = total_preds.numpy()

In [8]:
from evaluation.admet import evaluate_admet_predictions

# Measured values for the Test split, one float32 array per endpoint.
# Key order is MLM, MDR1-MDCKII, HLM, KSOL, LogD.
y_true = {
    endpoint: test[endpoint].to_numpy(dtype=np.float32)
    for endpoint in ('MLM', 'MDR1-MDCKII', 'HLM', 'KSOL', 'LogD')
}

scores = evaluate_admet_predictions(y_true, y_pred, method_label = 'GINnet_leaderboard')

# For each endpoint (and the aggregate), print the mean and standard deviation
# of the 'Score' values among the mean-absolute-error rows.
is_mae = scores['Metric'] == 'mean_absolute_error'
for label in ["KSOL", "LogD", "MLM", "HLM", "MDR1-MDCKII", "aggregated"]:
    mae_scores = scores.loc[(scores['Target Label'] == label) & is_mae, 'Score']
    mean = mae_scores.mean()
    std = mae_scores.std()
    print(f'{label}: {mean} +- {std}')

KSOL: 0.11239795560389756 +- 0.018238932983130594
LogD: 0.33878970277309417 +- 0.024159105950433454
MLM: 0.36574061146378517 +- 0.02738949683661979
HLM: 0.35272448924183847 +- 0.022665707132711436
MDR1-MDCKII: 0.2069622610360384 +- 0.014007537692399053
aggregated: 0.2753230040237308 +- 0.010123638255563833


# Run prediction (Optimized model)

In [None]:
from src.model import GNN_net
import torch

target_cols = ['KSOL', 'LogD', 'HLM', 'MLM', 'MDR1-MDCKII']

# Maps each fine-tuning dataset name ('KSOL', 'HLM', 'MLM', ...) to the
# prediction head ('y_sol', 'y_hlm', ...) whose output is read for that target.
name_map = {'KSOL': 'y_sol', 'HLM': 'y_hlm', 'MLM': 'y_mlm', 'LogD': 'y_logd', 'MDR1-MDCKII': 'y_mdck'}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Targets whose raw outputs are passed through 10 ** x before scoring; LogD is
# used exactly as predicted.
# NOTE(review): presumably these heads were trained on log10-transformed values - confirm.
log10_targets = {'KSOL', 'HLM', 'MLM', 'MDR1-MDCKII'}

# Filled with one NumPy array of test-set predictions per target.
y_pred = {}

for tgt in target_cols:
    print(f'Making predictions for: {tgt}')

    model = GNN_net(num_gnn_layers = 4, graph_pooling = 'attention', JK = 'last',
                    h_dim = 512, ffn_dim = 64,
                    task_heads = {'y_sol': 1, 'y_logd': 1, 'y_hlm': 1, 'y_mlm': 1, 'y_mdck': 1})
    # map_location=device handles both the CUDA and the CPU-only case, so no
    # separate branch on torch.cuda.is_available() is needed.
    state_dict = torch.load(f'models/optimized_models/best_finetuned_model_{tgt}_Novartis.pt',
                            map_location = device)
    model.load_state_dict(state_dict)

    model = model.to(device)
    model.eval()

    # Collect per-batch outputs and concatenate once; calling torch.cat inside
    # the loop would re-copy the growing tensor on every batch.
    batch_preds = []
    with torch.no_grad():
        for data in test_loader:
            data = data.to(device)
            predictions = model(data)
            batch_preds.append(predictions[name_map[tgt]].cpu().flatten())
    # An empty loader yields an empty tensor (torch.cat rejects an empty list).
    total_preds = torch.cat(batch_preds, 0) if batch_preds else torch.Tensor()

    if tgt in log10_targets:
        y_pred[tgt] = 10 ** total_preds.numpy()
    else:
        y_pred[tgt] = total_preds.numpy()

In [None]:
from evaluation.admet import evaluate_admet_predictions

# Measured values for the Test split, one float32 array per endpoint.
# Key order is MLM, MDR1-MDCKII, HLM, KSOL, LogD.
y_true = {
    endpoint: test[endpoint].to_numpy(dtype=np.float32)
    for endpoint in ('MLM', 'MDR1-MDCKII', 'HLM', 'KSOL', 'LogD')
}

# These predictions come from the optimized models, so the run gets its own
# method label instead of reusing the leaderboard one.
scores = evaluate_admet_predictions(y_true, y_pred, method_label = 'GINnet_optimized')

# For each endpoint (and the aggregate), print the mean and standard deviation
# of the 'Score' values among the mean-absolute-error rows.
is_mae = scores['Metric'] == 'mean_absolute_error'
for label in ["KSOL", "LogD", "MLM", "HLM", "MDR1-MDCKII", "aggregated"]:
    mae_scores = scores.loc[(scores['Target Label'] == label) & is_mae, 'Score']
    mean = mae_scores.mean()
    std = mae_scores.std()
    print(f'{label}: {mean} +- {std}')

KSOL: 0.09794307876005769 +- 0.01868861894914095
LogD: 0.29956936033070086 +- 0.024133632122082544
MLM: 0.33181830966472625 +- 0.025531979512905877
HLM: 0.36429905048012734 +- 0.030295449988248714
MDR1-MDCKII: 0.16070003859698773 +- 0.0105406579993727
aggregated: 0.25086596756651997 +- 0.010306512818795716
