# Analyse the TDC Dataset

In [1]:
from tdc.benchmark_group import admet_group

In [5]:
# Open the ADMET benchmark group, using 'data/' as its path, and pick the
# Caco2_Wang benchmark out of it.
group = admet_group(path='data/')
benchmark = group.get('Caco2_Wang')

Downloading Benchmark Group...
100%|██████████| 1.47M/1.47M [00:01<00:00, 944kiB/s] 
Extracting zip file...
Done!


In [10]:
# Display the 'train_val' table of the Caco2_Wang benchmark loaded above.
benchmark['train_val']

Unnamed: 0,Drug_ID,Drug,Y
0,H 95/71,CC(C)NCC(O)COc1ccc(NC=O)cc1,-5.427984
1,H 244/45,CCC(=O)Nc1ccc(OCC(O)CNC(C)C)cc1,-5.219842
2,D-Phe-D-Ala-D-Ser-OH,C[C@H](NC(=O)[C@H](N)Cc1ccccc1)C(=O)N[C@H](CO)...,-6.281999
3,Dexloxiglumide,CCCCCN(CCCOC)C(=O)[C@@H](CCC(=O)O)NC(=O)c1ccc(...,-5.140131
4,Ac-D-phe-NH2,CC(=O)N[C@@H](Cc1ccccc1)C(N)=O,-5.100090
...,...,...,...
723,11,CCCCCCC(N)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N1CCC...,-5.790000
724,Gancyclovir,Nc1nc2c(ncn2COC(CO)CO)c(=O)[nH]1,-6.101228
725,Val-ACV,CC(C)C(N)C(=O)OCCOCn1cnc2c(=O)[nH]c(N)nc21,-5.669776
726,1033-Dextromethorphan (DEM),COc1ccc2c(c1)[C@@]13CCCC[C@@H]1[C@@H](C2)N(C)CC3,-4.628932


In [7]:
from tdc.single_pred import ADME
# Load the Caco2_Wang dataset through the single-prediction ADME loader.
# get_split() is called without arguments, so the library's default split
# settings are used.
data = ADME(name='Caco2_Wang')
split = data.get_split()

Downloading...
100%|██████████| 82.5k/82.5k [00:00<00:00, 128kiB/s] 
Loading...
Done!


In [11]:
# Display the 'valid' partition returned by data.get_split().
split['valid']

Unnamed: 0,Drug_ID,Drug,Y
0,Raloxifene HCl,O=C(c1ccc(OCCN2CCCCC2)cc1)c1c(-c2ccc(O)cc2)sc2...,-5.722754
1,13,CCOC(=O)c1ccc2c(C(C(=O)NS(=O)(=O)c3ccc(C)cc3OC...,-4.699485
2,5,N#Cc1ccc(NCC(F)(F)c2ccccc2)c(F)c1CC(=O)NCCONC(...,-5.647924
3,-,O=C(O)c1ccncc1,-5.190000
4,4b,Cc1cc(C(=O)Nc2ccc(-c3ccccc3S(N)(=O)=O)cc2F)n(-...,-6.000000
...,...,...,...
86,atropine,CN1[C@H]2CC[C@@H]1CC(OC(=O)C(CO)c1ccccc1)C2,-4.700000
87,Guanabenz,NC(N)=NN=Cc1c(Cl)cccc1Cl,-4.330000
88,4,CN(C(=O)[C@H](Cc1ccc(CN)cc1)NS(=O)(=O)c1ccc2cc...,-4.958607
89,20(S)-camptothecin (CPT),CC[C@]1(O)C(=O)OCc2c1cc1n(c2=O)-c2cc3ccccc3nc2C1,-4.331849


In [13]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from rdkit import Chem
from rdkit.Chem import AllChem
from torch_geometric.data import Data, DataLoader

In [14]:
def smiles_to_graph(smiles):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Every atom becomes a node whose only feature is its atomic number. Every
    bond contributes two directed edges (i -> j and j -> i), which makes the
    graph undirected.

    Args:
        smiles: SMILES string describing a single molecule.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_atoms, 1)`` (float) and
        ``edge_index`` of shape ``(2, 2 * num_bonds)`` (long), or ``None``
        when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Node features: one column holding the atomic number of each atom.
    atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    if atom_features:
        x = torch.tensor(atom_features, dtype=torch.float)
    else:
        # torch.tensor([]) would be 1-D; keep the (num_atoms, 1) layout.
        x = torch.empty((0, 1), dtype=torch.float)

    # Edge indices: both directions of every bond.
    edge_pairs = []
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtomIdx()
        end = bond.GetEndAtomIdx()
        edge_pairs.append([begin, end])
        edge_pairs.append([end, begin])

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Bond-free molecules (single atoms, ions): transposing an empty 1-D
        # tensor would leave shape (0,), so build the (2, 0) layout directly.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=x, edge_index=edge_index)

In [46]:
def prepare_data(path='data/'):
    """Build one combined list of molecular graphs from every ADMET benchmark.

    For each dataset name exposed by the ADMET benchmark group, the
    ``'train_val'`` table is read, every ``'Drug'`` SMILES is converted with
    ``smiles_to_graph`` and the resulting graph is annotated with ``task_id``
    (position of the dataset in ``group.dataset_names``), ``task_type`` and
    ``y`` (the ``'Y'`` label as a one-element tensor). SMILES for which
    ``smiles_to_graph`` returns ``None`` are skipped.

    The task type is inferred once per dataset from the dtype of its ``'Y'``
    column: a floating-point column is treated as regression, anything else
    as classification.

    Args:
        path: Directory handed to ``admet_group`` as its ``path`` argument.

    Returns:
        A tuple ``(train_data, task_types)`` where ``train_data`` is the list
        of annotated graphs and ``task_types[i]`` is the task type string of
        the dataset whose graphs carry ``task_id == i``.
    """
    group = admet_group(path=path)
    datasets = group.dataset_names

    train_data = []
    # Derived from the data instead of hard-coded counts, so that entry i
    # always describes datasets[i] and agrees with each graph's task_type.
    task_types = []
    for task_id, dataset_name in enumerate(datasets):
        train_df = group.get(dataset_name)['train_val']
        labels = train_df['Y']

        # Decide from the column dtype rather than from type(row['Y']): the
        # per-row check only works when iterrows() boxes values into Python
        # objects, which depends on the other columns' dtypes.
        is_regression = labels.dtype.kind == 'f'
        task_type = 'regression' if is_regression else 'classification'
        label_dtype = torch.float if is_regression else torch.int
        task_types.append(task_type)

        # Iterating the two columns avoids building a Series per row.
        for smiles, label in zip(train_df['Drug'], labels):
            graph = smiles_to_graph(smiles)
            if graph is None:
                continue
            graph.task_id = task_id
            graph.task_type = task_type
            graph.y = torch.tensor([label], dtype=label_dtype)
            train_data.append(graph)

    return train_data, task_types

In [47]:
# Build the combined list of training graphs and the task-type list returned by prepare_data().
train_data, task_types = prepare_data()

Found local copy...


In [48]:
import polars as pl

# Flatten every graph in train_data into a plain-Python record so the whole
# collection can be inspected as a table.
data_list = [
    {
        'x': data_obj.x.tolist(),  # node features as nested lists
        'edge_index': data_obj.edge_index.tolist(),  # connectivity as nested lists
        'task_id': data_obj.task_id,
        'task_type': data_obj.task_type,
        'y': data_obj.y.item(),  # the single label as a Python scalar
    }
    for data_obj in train_data
]

# One row per graph, one column per record key.
df = pl.DataFrame(data_list)

# Show the table's text summary.
print(df)

shape: (65_430, 5)
┌─────────────────────────┬──────────────────────────────┬─────────┬────────────┬───────────┐
│ x                       ┆ edge_index                   ┆ task_id ┆ task_type  ┆ y         │
│ ---                     ┆ ---                          ┆ ---     ┆ ---        ┆ ---       │
│ list[list[f64]]         ┆ list[list[i64]]              ┆ i64     ┆ str        ┆ f64       │
╞═════════════════════════╪══════════════════════════════╪═════════╪════════════╪═══════════╡
│ [[6.0], [6.0], … [6.0]] ┆ [[0, 1, … 9], [1, 0, … 17]]  ┆ 0       ┆ regression ┆ -5.427984 │
│ [[6.0], [6.0], … [6.0]] ┆ [[0, 1, … 5], [1, 0, … 19]]  ┆ 0       ┆ regression ┆ -5.219841 │
│ [[6.0], [6.0], … [8.0]] ┆ [[0, 1, … 8], [1, 0, … 13]]  ┆ 0       ┆ regression ┆ -6.281999 │
│ [[6.0], [6.0], … [6.0]] ┆ [[0, 1, … 22], [1, 0, … 29]] ┆ 0       ┆ regression ┆ -5.140131 │
│ [[6.0], [6.0], … [8.0]] ┆ [[0, 1, … 6], [1, 0, … 11]]  ┆ 0       ┆ regression ┆ -5.10009  │
│ …                       ┆ …            