In [3]:
# Show the kernel's working directory; relative paths used later (e.g. "./data") resolve against it.
!pwd

/raid/home/fdivaler/CW


In [1]:
from hydra import initialize, compose
from hydra.core.global_hydra import GlobalHydra

# Hydra keeps one global instance per process, and initialize() fails if it is
# already set. Clearing it first makes this cell safe to re-run in a live kernel.
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()
initialize(config_path="config", job_name="nb")

# Compose the top-level config file ("config") from the config folder.
cfg = compose(config_name="config")

# cfg is an OmegaConf DictConfig; print it to check what was loaded.
print(cfg)




{'models': {'gnn_saving_dir': '', 'gnn_name': 'gat', 'n_heads': 3, 'param': {'sider': {'learning_rate': 0.001, 'weight_decay': 0.0005, 'milestones': 'None', 'gamma': 'None', 'batch_size': 64, 'num_epochs': 200, 'num_early_stop': 20, 'gnn_latent_dim': [128, 128, 128], 'gnn_dropout': 0.0, 'add_self_loop': True, 'gcn_adj_normalization': True, 'gnn_emb_normalization': False, 'graph_classification': True, 'node_classification': False, 'gnn_nonlinear': 'relu', 'readout': 'max', 'fc_latent_dim': [], 'fc_dropout': 0.0, 'fc_nonlinear': 'relu'}, 'hiv': {'learning_rate': 0.001, 'weight_decay': 0.0005, 'milestones': 'None', 'gamma': 'None', 'batch_size': 64, 'num_epochs': 200, 'num_early_stop': 20, 'gnn_latent_dim': [128, 128, 128], 'gnn_dropout': 0.0, 'add_self_loop': True, 'gcn_adj_normalization': True, 'gnn_emb_normalization': False, 'graph_classification': True, 'node_classification': False, 'gnn_nonlinear': 'relu', 'readout': 'sum', 'fc_latent_dim': [128], 'fc_dropout': 0.0, 'fc_nonlinear': '

In [3]:
from dig.xgraph.dataset import MoleculeDataset, SynGraphDataset, SentiGraphDataset, BA_LRP
from dataset import get_dataset
# Load the dataset selected by the Hydra config (cfg.datasets.dataset_root / dataset_name).
# NOTE(review): get_dataset comes from the local `dataset` module here, but a function
# with the same name is defined in a later cell of this notebook -- confirm which is intended.
d = get_dataset(dataset_root=cfg.datasets.dataset_root,
                          dataset_name=cfg.datasets.dataset_name)

In [4]:
# Inspect the first graph of the loaded dataset.
d[0]

Data(x=[20, 9], edge_index=[2, 40], edge_attr=[40, 3], y=[1, 1], smiles='[Cl].CC(C)NCC(O)COc1cccc2ccccc12')

In [5]:
# dataset_custom.py
import os
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
# RDKit
from rdkit import Chem

# Atomic numbers that get a dedicated one-hot slot in atom_features; every other
# element shares a single extra "other" slot.
ATOM_LIST = [1, 6, 7, 8, 9, 15, 16, 17, 35, 53]  # H, C, N, O, F, P, S, Cl, Br, I
# Width of one bond feature vector; used for the empty edge_attr tensor of bond-less
# molecules, so it must stay equal to len(bond_features(b)).
EDGE_FEAT_DIM = 9  # SINGLE, DOUBLE, TRIPLE, AROMATIC, conj, ring, stereo-none, stereo-Z, stereo-E


def one_hot(x, xs):
    """Encode ``x`` as a list of floats with one slot per candidate in ``xs``.

    A slot is 1.0 when its candidate equals ``x`` and 0.0 otherwise, so the
    result is all zeros when ``x`` matches no candidate.
    """
    encoded = []
    for candidate in xs:
        if x == candidate:
            encoded.append(1.0)
        else:
            encoded.append(0.0)
    return encoded

# def atom_features(atom):
#     Z = atom.GetAtomicNum()
#     atom_onehot = [1.0 if Z == z else 0.0 for z in ATOM_LIST] + [0.0 if Z in ATOM_LIST else 1.0]
#     degree      = one_hot(atom.GetTotalDegree(), [0, 1, 2, 3, 4, 5])
#     num_hs      = one_hot(atom.GetTotalNumHs(), [0, 1, 2, 3, 4])
#     aromatic    = [1.0 if atom.GetIsAromatic() else 0.0]
#     charge      = [float(atom.GetFormalCharge())]
#     return atom_onehot + degree + num_hs + aromatic + charge  # -> float features

from rdkit import Chem

# ---- atoms ----
# Hybridization states that get a one-hot slot in atom_features; any state not
# listed here encodes as all zeros.
HYB_LIST = [
    Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
    Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.SP3D,
    Chem.rdchem.HybridizationType.SP3D2
]
# Chirality tags that get a one-hot slot in atom_features; any tag not listed
# here encodes as all zeros.
CHIR_LIST = [
    Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
    Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW
]

def clip_one_hot(val, bins):
    """One-hot encode ``val`` over ``bins``, folding unlisted values into the top bin.

    Any value that is not a member of ``bins`` is encoded as if it were
    ``max(bins)``; the result is a list of floats, one per bin.
    """
    if val not in bins:
        # Out-of-vocabulary values share the largest bin.
        val = max(bins)
    encoded = []
    for bin_value in bins:
        encoded.append(1.0 if val == bin_value else 0.0)
    return encoded

def atom_features(a: Chem.Atom):
    """Build the feature vector of one atom as a flat list of floats.

    Layout, in order: element one-hot over ATOM_LIST plus one trailing "other"
    slot, total-degree one-hot (0-5), total-hydrogen-count one-hot (0-4),
    aromatic flag, formal charge, in-ring flag, hybridization one-hot over
    HYB_LIST, chiral-tag one-hot over CHIR_LIST, and atomic mass divided by 100.
    """
    atomic_num = a.GetAtomicNum()
    features = [1.0 if atomic_num == z else 0.0 for z in ATOM_LIST]
    # Trailing slot marks elements that are not in ATOM_LIST.
    features.append(0.0 if atomic_num in ATOM_LIST else 1.0)

    features.extend(clip_one_hot(a.GetTotalDegree(), [0, 1, 2, 3, 4, 5]))
    features.extend(clip_one_hot(a.GetTotalNumHs(), [0, 1, 2, 3, 4]))
    features.append(1.0 if a.GetIsAromatic() else 0.0)
    features.append(float(a.GetFormalCharge()))
    features.append(1.0 if a.IsInRing() else 0.0)

    hybridization = a.GetHybridization()
    features.extend(1.0 if hybridization == h else 0.0 for h in HYB_LIST)

    chiral_tag = a.GetChiralTag()
    features.extend(1.0 if chiral_tag == c else 0.0 for c in CHIR_LIST)

    # Mild rescaling keeps the mass on a scale comparable to the 0/1 flags.
    features.append(a.GetMass() / 100.0)
    return features

def _largest_fragment(mol: Chem.Mol) -> Chem.Mol:
    """Return the fragment of ``mol`` with the most atoms (salts / multi-fragment SMILES).

    When ``mol`` has at most one fragment it is returned unchanged.
    """
    fragments = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
    if len(fragments) <= 1:
        return mol
    return max(fragments, key=lambda frag: frag.GetNumAtoms())

# ---- bonds (edge_attr) ----
# Bond types that get a one-hot slot in bond_features; any type not listed here
# encodes as all zeros.
BOND_LIST = [
    Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
    Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC
]

def bond_features(b: Chem.Bond):
    """Build the feature vector of one bond as a flat list of nine floats.

    Layout, in order: bond-type one-hot over BOND_LIST, conjugation flag,
    in-ring flag, then three stereo flags (none, Z, E).
    """
    bond_type = b.GetBondType()
    features = [1.0 if bond_type == t else 0.0 for t in BOND_LIST]
    features.append(1.0 if b.GetIsConjugated() else 0.0)
    features.append(1.0 if b.IsInRing() else 0.0)

    stereo = b.GetStereo()
    stereo_tags = (
        Chem.rdchem.BondStereo.STEREONONE,
        Chem.rdchem.BondStereo.STEREOZ,
        Chem.rdchem.BondStereo.STEREOE,
    )
    for tag in stereo_tags:
        features.append(1.0 if stereo == tag else 0.0)
    return features


class ChEMBLActivityDataset(InMemoryDataset):
    """
    Molecular graphs with a binary activity label, built from a CSV of SMILES strings.

    The CSV is read from the dataset's raw directory, i.e. ``{root}/raw/{csv_filename}``
    under the usual InMemoryDataset layout (this notebook's ``get_dataset`` passes
    ``root=<dataset_root>/chembl_ic50``). It needs at least the columns named by
    ``smiles_col`` and ``activity_col``.

    Labels are binary: y = 1 if pAct >= threshold else 0.

    Args:
        root: dataset root directory.
        csv_filename: name of the raw CSV file.
        id_col: name of the identifier column; stored on the instance but not read
            by ``process``.
        smiles_col: name of the column holding the SMILES strings.
        activity_col: name of the column holding the pAct values.
        threshold: activity cutoff; anything ``float()`` accepts (e.g. a 0-dim tensor).
        transform, pre_transform, pre_filter: forwarded to InMemoryDataset.
    """
    def __init__(
        self,
        root,
        csv_filename='data.csv',
        id_col = 'ID',
        smiles_col='ISOMERIC SMILES',
        activity_col='pAct',
        threshold=6.0,                   # pAct >= 6 is a common activity cutoff
        transform=None, pre_transform=None, pre_filter=None
    ):
        # These attributes must exist before super().__init__(), which reads
        # raw_file_names / processed_file_names and may call process().
        self.csv_filename = csv_filename
        self.id_col = id_col
        self.smiles_col   = smiles_col
        self.activity_col = activity_col
        self.threshold    = float(threshold)
        super().__init__(root, transform, pre_transform, pre_filter)
        # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
        # which may reject pickled graph objects -- confirm against the installed version.
        self.data, self.slices = torch.load(self.processed_paths[0])

    # ----- properties expected by your code -----
    @property
    def name(self):
        return 'chembl_ic50'

    @property
    def num_classes(self):
        return 2

    # ----- required InMemoryDataset API -----
    @property
    def raw_file_names(self):
        return [self.csv_filename]

    @property
    def processed_file_names(self):
        # The labels depend on the threshold, so the cache file is keyed by it.
        # With one fixed name, a dataset processed with one threshold would be
        # silently reloaded (stale labels) when a different threshold is requested.
        return [f'data_thr_{self.threshold!r}.pt']

    def process(self):
        """Convert every usable CSV row into a graph and save the collated result.

        Rows are skipped when the SMILES is missing or cannot be parsed, when the
        activity value is missing or not numeric, or when the molecule has no atoms.

        Raises:
            ValueError: if no row yields a usable graph.
        """
        df = pd.read_csv(self.raw_paths[0])

        data_list = []
        for smi_val, pact_val in zip(df[self.smiles_col], df[self.activity_col]):
            # label: binary from pAct threshold
            try:
                pact = float(pact_val)
            except (TypeError, ValueError):
                continue
            if pd.isna(pact) or pd.isna(smi_val):
                # A NaN activity would otherwise compare False and be labelled 0.
                continue
            y = torch.tensor([1 if pact >= self.threshold else 0], dtype=torch.long)

            # robust SMILES -> RDKit mol
            smi = str(smi_val)
            mol = Chem.MolFromSmiles(smi, sanitize=True)
            if mol is None:
                continue
            if '.' in smi:
                mol = _largest_fragment(mol)
            if mol.GetNumAtoms() == 0:
                # No atoms -> x would be a 1-D empty tensor instead of [0, num_features].
                continue

            # node features
            x = torch.tensor([atom_features(a) for a in mol.GetAtoms()], dtype=torch.float)

            # edges + edge_attr: every bond is stored in both directions
            ei, ea = [], []
            for b in mol.GetBonds():
                i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
                f = bond_features(b)
                ei += [[i, j], [j, i]]
                ea += [f, f]

            if ei:
                edge_index = torch.tensor(ei, dtype=torch.long).t().contiguous()
                edge_attr  = torch.tensor(ea, dtype=torch.float)
            else:
                # Bond-less molecule: keep edge tensors present with consistent widths.
                edge_index = torch.empty((2, 0), dtype=torch.long)
                edge_attr  = torch.empty((0, EDGE_FEAT_DIM), dtype=torch.float)

            data = Data(x=x, edge_index=edge_index, y=y, edge_attr=edge_attr)

            # optional filtering / transforms
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)

            data_list.append(data)

        if not data_list:
            raise ValueError(
                f"No usable molecules found in {self.raw_paths[0]} "
                f"(columns '{self.smiles_col}' / '{self.activity_col}')."
            )

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


In [6]:
# --- prerequisites ---
# pip install rdkit-pypi torch-geometric pandas scikit-learn

import os
import pandas as pd
import os
import torch
import numpy as np
from torch.utils.data import random_split, Subset
from torch_geometric.loader import DataLoader
from dig.xgraph.dataset import MoleculeDataset, SynGraphDataset, SentiGraphDataset, BA_LRP
from torch import default_generator
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from sklearn.model_selection import train_test_split

def get_dataset(dataset_root, dataset_name, pre_filter=None, threshold=6.0):
    """
    Return the dataset object registered under ``dataset_name`` (case-insensitive).

    Args:
        dataset_root: directory under which the datasets are stored.
        dataset_name: name of the dataset to load.
        pre_filter: optional filter callable; forwarded to the molecule datasets
            (MoleculeDataset and ChEMBLActivityDataset).
        threshold: pAct cutoff, used only by the 'chembl_ic50' dataset.
    Returns:
        the dataset instance.
    Raises:
        ValueError: if ``dataset_name`` is not a known dataset.
    """
    key = dataset_name.lower()
    if key in MoleculeDataset.names.keys():
        return MoleculeDataset(root=dataset_root, name=dataset_name, pre_filter=pre_filter)
    if key in ('graph_sst2', 'graph_sst5', 'twitter'):
        return SentiGraphDataset(root=dataset_root, name=dataset_name)
    if key in SynGraphDataset.names.keys():
        return SynGraphDataset(root=dataset_root, name=dataset_name)
    if key == 'ba_lrp':
        return BA_LRP(root=dataset_root)
    if key == 'chembl_ic50':
        # pre_filter used to be dropped silently for this dataset; forward it too.
        return ChEMBLActivityDataset(root=os.path.join(dataset_root, 'chembl_ic50'),
                                     threshold=threshold,
                                     pre_filter=pre_filter)
    raise ValueError(f"{dataset_name} is not defined.")



def get_dataloader(dataset, batch_size, stratified, random_split_flag=True, data_split_ratio=None, seed=2):
    """
    Split ``dataset`` into train / eval / test parts and wrap each part in a DataLoader.

    Args:
        dataset: the dataset to split.
        batch_size: int, batch size of the train and eval loaders (the test loader uses 1)
        stratified: bool, if True split with scikit-learn's stratified
            ``train_test_split`` using ``argmax(sample.y)`` as class label
            (meaningful for one-hot labels only; a single-element ``y`` always gives 0)
        random_split_flag: bool, in the non-stratified mode, if False and the dataset
            has a ``supplement`` attribute, use its predefined ``split_indices``
        data_split_ratio: list, training, validation and testing ratio
        seed: random seed to split the dataset randomly
    Returns:
        a dictionary of training, validation, and testing dataLoader. In the
        stratified mode the 'eval' entry exists only when the validation and
        testing ratios are equal.
    Raises:
        ValueError: if a ratio-based split is needed but ``data_split_ratio`` is None.
    """
    dataloader = dict()

    use_predefined_split = (not stratified
                            and not random_split_flag
                            and hasattr(dataset, 'supplement'))
    if not use_predefined_split and data_split_ratio is None:
        raise ValueError("data_split_ratio is required, e.g. [0.8, 0.1, 0.1].")

    if not stratified:
        if use_predefined_split:
            assert 'split_indices' in dataset.supplement.keys(), "split idx"
            split_indices = dataset.supplement['split_indices']
            train_indices = torch.where(split_indices == 0)[0].numpy().tolist()
            dev_indices = torch.where(split_indices == 1)[0].numpy().tolist()
            test_indices = torch.where(split_indices == 2)[0].numpy().tolist()

            train_set = Subset(dataset, train_indices)
            eval_set = Subset(dataset, dev_indices)
            test_set = Subset(dataset, test_indices)
        else:
            num_train = int(data_split_ratio[0] * len(dataset))
            num_eval = int(data_split_ratio[1] * len(dataset))
            num_test = len(dataset) - num_train - num_eval

            # A dedicated, seeded generator makes the split reproducible without
            # touching the global RNG state (previously `seed` was ignored here).
            split_generator = torch.Generator().manual_seed(seed)
            train_set, eval_set, test_set = random_split(dataset,
                                                         lengths=[num_train, num_eval, num_test],
                                                         generator=split_generator)

        dataloader['train'] = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)
        dataloader['eval'] = DataLoader(eval_set, batch_size=batch_size, shuffle=False, drop_last=True)
        dataloader['test'] = DataLoader(test_set, batch_size=1, shuffle=False, drop_last=True)

    else:
        # Class label per sample, used as the stratification key.
        targets = [torch.argmax(sample.y).item() for sample in dataset]

        train_idx, holdout_idx = train_test_split(range(len(targets)),
                                                  test_size=(1 - data_split_ratio[0]),
                                                  random_state=seed,
                                                  shuffle=True,
                                                  stratify=targets)

        has_eval_split = data_split_ratio[1] == data_split_ratio[2]
        if has_eval_split:
            # Split the held-out *dataset indices* themselves. Splitting
            # range(len(holdout)) would give positions 0..k-1, which point at
            # arbitrary samples and can overlap with the training set.
            holdout_targets = [targets[idx] for idx in holdout_idx]
            valid_idx, test_idx = train_test_split(holdout_idx,
                                                   test_size=0.5,
                                                   random_state=seed,
                                                   shuffle=True,
                                                   stratify=holdout_targets)
        else:
            valid_idx, test_idx = None, holdout_idx

        train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
        test_sampler = torch.utils.data.SubsetRandomSampler(test_idx)

        dataloader['train'] = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, drop_last=True)
        dataloader['test'] = DataLoader(dataset, batch_size=1, sampler=test_sampler, drop_last=True)

        if has_eval_split:
            valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)
            dataloader['eval'] = DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler, drop_last=True)

    return dataloader
# (A custom pAct threshold can be passed through get_dataset(..., threshold=...), which forwards it to ChEMBLActivityDataset.)

def build_chembl_dataset_from_df(df: pd.DataFrame, dataset_root: str, threshold=6.0) -> object:
    """
    Saves `df` to the path expected by ChEMBLActivityDataset and returns the dataset.

    df must contain at least: 'ISOMERIC SMILES' and 'pAct'. The columns 'ID' and
    'MW' are written to the CSV too when present. Rows with a missing value in
    any written column, or with a non-numeric 'pAct', are dropped.

    Any previously processed cache under <dataset_root>/chembl_ic50/processed is
    removed, so the returned dataset is always rebuilt from `df` and `threshold`.

    Args:
        df: source table of molecules and activities.
        dataset_root: directory under which the 'chembl_ic50' dataset folder lives.
        threshold: pAct cutoff forwarded to the dataset.
    Returns:
        the dataset produced by get_dataset(..., dataset_name='chembl_ic50').
    Raises:
        ValueError: if a required column is missing from `df`.
    """
    import shutil  # local import: only needed for the cache invalidation below

    required_cols = ['ISOMERIC SMILES', 'pAct']
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Keep the original column order; 'ID' and 'MW' are optional extras.
    keep_cols = [c for c in ['ID', 'ISOMERIC SMILES', 'MW', 'pAct'] if c in df.columns]

    # basic cleaning
    cleaned = df[keep_cols].dropna(subset=keep_cols).copy()
    cleaned['pAct'] = pd.to_numeric(cleaned['pAct'], errors='coerce')
    cleaned = cleaned.dropna(subset=['pAct'])

    # write CSV to the expected raw location: <root>/chembl_ic50/raw/data.csv
    chembl_root = os.path.join(dataset_root, 'chembl_ic50')
    raw_dir = os.path.join(chembl_root, 'raw')
    os.makedirs(raw_dir, exist_ok=True)
    cleaned.to_csv(os.path.join(raw_dir, 'data.csv'), index=False)

    # The raw CSV was just rewritten, so an existing processed cache no longer
    # matches it (nor, possibly, the requested threshold). Drop it to force a rebuild.
    shutil.rmtree(os.path.join(chembl_root, 'processed'), ignore_errors=True)

    # build the dataset via the get_dataset() router
    return get_dataset(dataset_root=dataset_root, dataset_name='chembl_ic50', threshold=threshold)


In [7]:
import pickle

# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# The context manager closes the file handle (it was previously left open).
with open('./transferCW/Selezione_dati_attivita.pickle', 'rb') as fh:
    imported = pickle.load(fh)

In [8]:
# Choose the activity cutoff as mean + one standard deviation of the pAct values.
t = torch.tensor(imported['pAct'])
print(t.mean(),"+-",t.std())
# `threshold` is a 0-dim tensor here; ChEMBLActivityDataset converts it with float().
threshold = t.mean()+t.std()
print(threshold)

tensor(6.5079, dtype=torch.float64) +- tensor(1.2877, dtype=torch.float64)
tensor(7.7956, dtype=torch.float64)


In [15]:
# ---------- usage ----------
# `imported` is the table loaded from the pickle in an earlier cell; it must provide
# the columns that build_chembl_dataset_from_df requires.
dataset_root = "./data"   # choose where to store processed files
dataset = build_chembl_dataset_from_df(imported, dataset_root, threshold=threshold)

# Dataloaders (graph classification): 80 / 10 / 10 random split
loaders = get_dataloader(
    dataset=dataset,
    batch_size=32,
    stratified=False,                 # IMPORTANT: labels are scalar 0/1; don't use the one-hot stratified branch
    random_split_flag=True,
    data_split_ratio=[0.8, 0.1, 0.1],
    seed=42
)

# Peek at one training batch to check the tensor shapes
batch = next(iter(loaders['train']))
print(batch)
print("x:", batch.x.shape, "edge_index:", batch.edge_index.shape, "y:", batch.y.shape)

# `loaders` and `dataset` can now be passed to the existing training code.


DataBatch(x=[936, 34], edge_index=[2, 1998], edge_attr=[1998, 9], y=[32], batch=[936], ptr=[33])
x: torch.Size([936, 34]) edge_index: torch.Size([2, 1998]) y: torch.Size([32])


In [16]:
# Show the binary labels of the sampled training batch.
batch.y

tensor([1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 0])

In [17]:
import torch
from torch_geometric.utils import is_undirected

# Sanity-check one training batch: batch size, node-feature width, edge symmetry
# and the presence / shape of edge_attr.
b = next(iter(loaders['train']))
print("batch size:", b.y.numel())
print("feat_dim:", b.x.size(1))
# process() stores every bond in both directions, so the edge set should be undirected.
print("edges:", b.edge_index.size(1), "undirected?", is_undirected(b.edge_index))
# getattr with a default keeps this cell working for batches without edge_attr.
ea = getattr(b, "edge_attr", None)
print("edge_attr tensor?", isinstance(ea, torch.Tensor))
if isinstance(ea, torch.Tensor):
    print("edge_attr shape:", ea.shape)  # should be [num_directed_edges, 9]


batch size: 32
feat_dim: 34
edges: 2030 undirected? True
edge_attr tensor? True
edge_attr shape: torch.Size([2030, 9])
