# Creation of test dataset for Multi-center tetrahedral prediction

## Import section

In [1]:
import os

import pandas as pd
from tqdm import tqdm
from multiprocess.pool import Pool

## Load initial dataset

Which dataset to choose: Tox21, as it contains the most such elements. (Datasets could also be fused, but a single one should be enough for a miniature dataset, and merging datasets creates other problems.)

In [2]:
# read the gzip-compressed Tox21 CSV; the path is relative to the current working directory
df = pd.read_csv("hyperoptimization/src/tox21/raw/tox21.csv.gz")

In [3]:
# preview the first rows to see which columns (assay labels, mol_id, smiles) are available
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


## Filter out molecules that do not have exactly 2 stereo centers

In [4]:
def filter_mc_tetra_mol(
        df_entry,
        smiles_name: str = "smiles"
):
    """
    Checks whether a molecule has exactly two marked chiral centers.

    :param df_entry: ``(idx, entry)`` pair as yielded by ``DataFrame.iterrows()``
    :param smiles_name: name of the column under which the stereochemical SMILES can be queried from the
        entry
    :return: tuple ``(idx, entry, flag)``; ``flag`` is True only if the molecule could be built and exactly
        two chiral centers were found in it, otherwise False
    """
    # imports are kept inside the function
    # NOTE(review): presumably so that each pool worker resolves them on its own - confirm
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit.Chem import AllChem

    # unpack passed object
    idx, entry = df_entry

    # extract smiles string
    smiles = entry[smiles_name]

    # get molecule
    molecule = smiles_to_3d_mol(
        smiles=smiles,
        max_number_of_atoms=100,
        max_number_of_attempts=100
    )

    # if molecule cannot be rendered (i.e. is None) report it as not matching; the row itself (not the
    # whole (idx, entry) pair) is returned so that both branches share the same tuple layout
    if molecule is None:
        return idx, entry, False

    # exactly two centers are required: the later enumeration and labelling steps work on center pairs
    return idx, entry, len(AllChem.FindMolChiralCenters(molecule)) == 2

In [5]:
# run the stereo-center filter on every row, using as many worker processes as os.cpu_count() reports
with Pool(processes=os.cpu_count()) as pool:
    rows_with_progress = tqdm(df.iterrows(), total=len(df))
    df_collection = list(pool.map(filter_mc_tetra_mol, rows_with_progress))

100%|██████████| 7831/7831 [00:26<00:00, 293.06it/s] 


In [6]:
# keep only the rows whose filter flag (third tuple element) is set and rebuild a frame from them;
# drop=True (a real boolean, as documented by pandas) discards the old row labels for a fresh 0..n-1 index
df_collection = pd.DataFrame([
    entry
    for _, entry, has_two_centers in df_collection
    if has_two_centers
]).reset_index(drop=True)

## Produce all possible combinations of @ or @@

In [7]:
def worker_create_all_combinations(
        df_entry,
        smiles_name: str = 'smiles'
):
    """
    Enumerates all four @/@@ assignments for a SMILES string that contains two stereo markers.

    :param df_entry: ``(idx, entry)`` pair as yielded by ``DataFrame.iterrows()``; the index is not used
    :param smiles_name: name of the column under which the SMILES can be queried from the entry
    :return: DataFrame with a single column "smiles" holding the four variants in the order
        (@, @), (@, @@), (@@, @), (@@, @@); an empty DataFrame if the SMILES does not split into exactly
        three parts around its stereo markers (a message is printed in that case)
    """
    # only the row content is needed
    _, row = df_entry
    smiles = row[smiles_name]

    # collapse every "@@" into "@" so that each stereo marker becomes exactly one split point
    fragments = smiles.replace("@@", "@").split("@")

    # two markers <=> three fragments; any other count is reported and skipped
    if len(fragments) != 3:
        print(f"SMILES {smiles} dropped as too many @ in smiles")
        return pd.DataFrame()

    head, middle, tail = fragments
    markers = ("@", "@@")

    # outer loop varies the first marker, inner loop the second one
    variants = [
        head + first_marker + middle + second_marker + tail
        for first_marker in markers
        for second_marker in markers
    ]
    return pd.DataFrame({"smiles": variants})

In [8]:
# inspect the rows that passed the two-stereo-center filter
df_collection

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,0.0,0.0,0.0,,0.0,,,,TOX22631,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,TOX25538,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX25530,CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(S(N)(=O)=O)cc21
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX28633,CCCCCCCCCCCCCCCC(=O)O[C@@H]1CC(C)=C(/C=C/C(C)=...
4,0.0,0.0,1.0,,1.0,0.0,0.0,0.0,1.0,0.0,,0.0,TOX27264,Oc1ccc2c(c1)OC[C@@H](N1CCC(O)(c3ccc(F)cc3)CC1)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,1.0,0.0,TOX28548,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...
223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX25342,CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](N)[C@H]2SC1
224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX4721,CC1(C)[C@@H]2CC[C@@]1(C)C(=O)C2
225,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,,,,TOX27852,OC[C@H](Cc1cccc(O)c1)[C@H](CO)Cc1cccc(O)c1


In [9]:
# enumerate the @/@@ variants of every remaining molecule and stack the per-molecule frames;
# drop=True (a real boolean, as documented by pandas) replaces the repeated per-frame labels
# with one continuous index
df_collection = pd.concat(
    [
        worker_create_all_combinations(df_entry)
        for df_entry in tqdm(df_collection.iterrows(), total=len(df_collection))
    ]
).reset_index(drop=True)

100%|██████████| 227/227 [00:00<00:00, 16215.18it/s]

SMILES CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nnnn3C)CS[C@H]12)c1csc(N)n1.CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nnnn3C)CS[C@H]12)c1csc(N)n1 dropped as too many @ in smiles
SMILES COc1ccc(C[C@@H](C)NC[C@H](O)c2ccc(O)c(NC=O)c2)cc1.COc1ccc(C[C@@H](C)NC[C@H](O)c2ccc(O)c(NC=O)c2)cc1 dropped as too many @ in smiles
SMILES O=C1O[C@H]([C@@H](O)CO)C([O-])=C1O.O=C1O[C@H]([C@@H](O)CO)C([O-])=C1O dropped as too many @ in smiles
SMILES C[C@@H]1CN([C@H]2CC[C@](C#N)(c3ccc(F)cc3)CC2)CC[C@]1(C(=O)O)c1ccccc1 dropped as too many @ in smiles
SMILES CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F)cc2)n1CC[C@@H](O)C[C@@H](O)CC(=O)[O-].CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F)cc2)n1CC[C@@H](O)C[C@@H](O)CC(=O)[O-] dropped as too many @ in smiles
SMILES COc1ccc(C[C@H](C)NC[C@@H](O)c2ccc(O)c(NC=O)c2)cc1.COc1ccc(C[C@H](C)NC[C@@H](O)c2ccc(O)c(NC=O)c2)cc1 dropped as too many @ in smiles





In [10]:
# inspect the enumerated stereo variants (only the "smiles" column remains at this point)
df_collection

Unnamed: 0,smiles
0,CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
2,CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
3,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
4,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...
...,...
879,OC[C@@H](Cc1cccc(O)c1)[C@@H](CO)Cc1cccc(O)c1
880,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c(...
881,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)c...
882,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c...


In [11]:
# the enumeration can produce the same SMILES string more than once; keep one row per string.
# Re-assigning (instead of inplace=True) avoids mutating a frame that an earlier cell already displayed;
# the surviving rows keep their index labels, exactly as before.
df_collection = df_collection.drop_duplicates(subset=['smiles'])

In [12]:
# inspect the frame after duplicate SMILES have been removed
df_collection

Unnamed: 0,smiles
0,CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
2,CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
3,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
4,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...
...,...
879,OC[C@@H](Cc1cccc(O)c1)[C@@H](CO)Cc1cccc(O)c1
880,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c(...
881,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)c...
882,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c...


In [13]:
def assign_lu_label(df_entry):
    """
    Attaches a like/unlike label to a molecule with two chiral centers.

    :param df_entry: ``(idx, entry)`` pair as yielded by ``DataFrame.iterrows()``; the entry must provide
        the SMILES under the key 'smiles'
    :return: copy of the entry extended by 'MC_label' ("L" if both centers carry the same chirality label,
        otherwise "U") and 'MC_label_binary' (1 for "L", 0 for "U"); None if the molecule cannot be
        rendered or if the number of chiral centers found is not exactly two
    """
    # imports are kept inside the function
    # NOTE(review): presumably so that each pool worker resolves them on its own - confirm
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit import Chem

    # unpack passed object, the index is not needed
    _, entry = df_entry

    # extract smiles string
    smiles = entry['smiles']

    # render molecule
    molecule = smiles_to_3d_mol(smiles, max_number_of_atoms=100, max_number_of_attempts=100)

    if molecule is None:
        return None

    # fetch the stereo centers; a count other than two cannot be labelled as a pair and is reported
    # as None (same convention as for non-renderable molecules) instead of failing on the unpacking
    chiral_centers = Chem.FindMolChiralCenters(molecule)
    if len(chiral_centers) != 2:
        return None

    center_one, center_two = chiral_centers

    # second tuple element is the chirality label of the center; equal labels -> "like"
    binary_label = center_one[1] == center_two[1]

    # work on a copy so that the caller's row object is left untouched
    entry = entry.copy()
    entry['MC_label'] = "L" if binary_label else "U"
    entry['MC_label_binary'] = 1 if binary_label else 0

    return entry

In [14]:
# label every enumerated SMILES, using as many worker processes as os.cpu_count() reports
with Pool(processes=os.cpu_count()) as pool:
    rows_with_progress = tqdm(df_collection.iterrows(), total=len(df_collection))
    df_collection = list(pool.map(assign_lu_label, rows_with_progress))

100%|██████████| 852/852 [00:09<00:00, 92.68it/s]  


In [15]:
# at this point df_collection is the plain list returned by the pool (labelled rows or None), not a DataFrame
df_collection

[smiles             CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
 MC_label                                                   U
 MC_label_binary                                        False
 Name: 0, dtype: object,
 smiles             CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
 MC_label                                                    L
 MC_label_binary                                          True
 Name: 1, dtype: object,
 smiles             CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
 MC_label                                                    L
 MC_label_binary                                          True
 Name: 2, dtype: object,
 smiles             CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
 MC_label                                                     U
 MC_label_binary                                          False
 Name: 3, dtype: object,
 smiles             CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...
 MC_label                                                           U
 MC_

In [16]:
# entries for which assign_lu_label returned None are discarded before the frame is rebuilt
labelled_rows = [row for row in df_collection if row is not None]
df_collection = pd.DataFrame(labelled_rows)
df_collection

Unnamed: 0,smiles,MC_label,MC_label_binary
0,CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21,U,False
1,CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21,L,True
2,CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21,L,True
3,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21,U,False
4,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...,U,False
...,...,...,...
879,OC[C@@H](Cc1cccc(O)c1)[C@@H](CO)Cc1cccc(O)c1,L,True
880,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c(...,L,True
881,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)c...,U,False
882,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c...,U,False


In [1]:
from ptgnn.dataset.mc_dataset import MCDataset

In [2]:
# The dataset root is given relative to the working directory, following the same
# "hyperoptimization/src/..." layout the Tox21 CSV is read from, instead of a machine-specific
# absolute path.
# NOTE(review): assumes the notebook is executed from its own directory - confirm.
train_mc = MCDataset(
    root="hyperoptimization/src/mc",
    type='mc',
    graph_mode='edge',
    transformation_mode='permutation_tree',
    transformation_parameters={
        'tetrahedral_chiral': False,
        'multi_stereo_center_dia': True,
        'k': 3,
    },
    mask_chiral_tags=True,
)
train_mc

Processing...
100%|██████████| 5481/5481 [00:30<00:00, 180.26it/s]
100%|██████████| 158/158 [00:00<00:00, 2693.41it/s]


SMILES COc1ccc(C[C@@H](C)NC[C@H](O)c2ccc(O)c(NC=O)c2)cc1.COc1ccc(C[C@@H](C)NC[C@H](O)c2ccc(O)c(NC=O)c2)cc1 dropped as too many @ in smiles
SMILES O=C1O[C@H]([C@@H](O)CO)C([O-])=C1O.O=C1O[C@H]([C@@H](O)CO)C([O-])=C1O dropped as too many @ in smiles
SMILES CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F)cc2)n1CC[C@@H](O)C[C@@H](O)CC(=O)[O-].CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F)cc2)n1CC[C@@H](O)C[C@@H](O)CC(=O)[O-] dropped as too many @ in smiles
SMILES COc1ccc(C[C@H](C)NC[C@@H](O)c2ccc(O)c(NC=O)c2)cc1.COc1ccc(C[C@H](C)NC[C@@H](O)c2ccc(O)c(NC=O)c2)cc1 dropped as too many @ in smiles


100%|██████████| 596/596 [00:05<00:00, 114.32it/s]
100%|██████████| 578/578 [02:45<00:00,  3.48it/s] 
Split: train: 100%|██████████| 578/578 [00:04<00:00, 143.34it/s] 
Postprocessing matrices: 100%|██████████| 578/578 [00:00<00:00, 289020.95it/s]
100%|██████████| 783/783 [00:08<00:00, 88.79it/s] 
100%|██████████| 22/22 [00:00<00:00, 6281.89it/s]
100%|██████████| 88/88 [00:00<00:00, 254.62it/s]
100%|██████████| 88/88 [00:14<00:00,  6.20it/s]
Split: val: 100%|██████████| 88/88 [00:00<00:00, 142.05it/s]
Postprocessing matrices: 100%|██████████| 88/88 [00:00<00:00, 161460.52it/s]
100%|██████████| 1567/1567 [00:11<00:00, 133.86it/s]
100%|██████████| 47/47 [00:00<00:00, 6714.77it/s]


SMILES CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nnnn3C)CS[C@H]12)c1csc(N)n1.CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)O)=C(CSc3nnnn3C)CS[C@H]12)c1csc(N)n1 dropped as too many @ in smiles
SMILES C[C@@H]1CN([C@H]2CC[C@](C#N)(c3ccc(F)cc3)CC2)CC[C@]1(C(=O)O)c1ccccc1 dropped as too many @ in smiles


100%|██████████| 172/172 [00:00<00:00, 237.24it/s]
 66%|██████▋   | 111/167 [00:12<00:04, 11.40it/s][13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
 68%|██████▊   | 113/167 [00:12<00:04, 12.32it/s][13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for atom: 3
[13:39:27] UFFTYPER: Unrecognized atom type: Pt+2 (3)
[13:39:27] UFFTYPER: Unrecognized hybridization for a

MCDataset(578)

In [7]:
# show only the first sample of the dataset together with its `y` attribute, then stop iterating
for elem in train_mc:
    display(elem, elem.y)
    break

Data(x=[72, 118], edge_index=[2, 198], edge_attr=[198, 80], pos=[72, 6], parallel_node_index=[72], circle_index=[72], y=[1])

tensor([0])