# Creation of test dataset for Multi-center tetrahedral prediction

## Import section

In [1]:
import os

import pandas as pd
from tqdm import tqdm
from multiprocess.pool import Pool

## Load initial dataset

Dataset choice: Tox21, because it contains the most molecules of the kind needed here. (Several datasets could also be fused, but a single one should be enough for a miniature dataset, and merging datasets creates other problems.)

In [2]:
# load the raw Tox21 table (gzip-compressed CSV); the path is relative to the current working directory
df = pd.read_csv("hyperoptimization/src/tox21/raw/tox21.csv.gz")

In [3]:
# preview the first rows of the loaded table
df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


## Keep only molecules with exactly 2 stereo centers

In [4]:
def filter_mc_tetra_mol(
        df_entry,
        smiles_name: str = "smiles"
):
    """
    Checks whether a molecule has exactly two marked chiral centers.

    :param df_entry: ``(index, row)`` pair, e.g. one item yielded by ``DataFrame.iterrows()``
    :param smiles_name: name of the column under which the stereochemical SMILES can be queried from the
        row
    :return: tuple ``(idx, entry, flag)``; ``idx`` and ``entry`` are the unpacked input and ``flag`` is True
        only if a molecule could be built and exactly two chiral centers were found on it
    """
    # kept function-local (presumably so that each pool worker resolves the imports itself)
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit.Chem import AllChem

    # unpack passed object
    idx, entry = df_entry

    # build the molecule from the row's smiles string
    molecule = smiles_to_3d_mol(
        smiles=entry[smiles_name],
        max_number_of_atoms=100,
        max_number_of_attempts=100
    )

    # molecule could not be built: report a negative result in the same (idx, row, flag) shape as the
    # success case (the row itself, not the whole (idx, row) tuple)
    if molecule is None:
        return idx, entry, False

    # does the molecule have exactly two marked chiral centers
    n_centers = len(AllChem.FindMolChiralCenters(molecule))
    return idx, entry, n_centers == 2

In [5]:
# evaluate the chiral-center filter for every row of df in parallel (one worker process per CPU core);
# each result is an (idx, row, flag) tuple
with Pool(processes=os.cpu_count()) as p:
    df_collection = list(
        p.map(filter_mc_tetra_mol, tqdm(df.iterrows(), total=len(df)))
    )

100%|██████████| 7831/7831 [00:36<00:00, 216.50it/s] 


In [6]:
# keep only the rows whose filter flag is set and rebuild a frame with a fresh 0..n-1 index
# (drop=True discards the old row labels instead of adding them as a column)
df_collection = pd.DataFrame([
    entry
    for _, entry, is_match in df_collection
    if is_match
]).reset_index(drop=True)

## Produce all possible combinations of @ or @@

In [7]:
def worker_create_all_combinations(
        df_entry,
        smiles_name: str = 'smiles'
):
    """
    Enumerates every assignment of ``@`` / ``@@`` to the chirality markers of a SMILES string.

    All ``@@`` are first collapsed to ``@`` so that every marker is a single character, the string is split
    at these markers, and the pieces are re-joined with each possible marker combination. A SMILES with
    ``n`` markers yields ``2 ** n`` strings (four for the two-center molecules this notebook targets, in
    the order ``@/@``, ``@/@@``, ``@@/@``, ``@@/@@``). A SMILES without any marker is returned unchanged
    as a single row.

    Note: every ``@`` character is treated as a marker, so nothing after the second marker is dropped, but
    other notations that contain ``@`` are split as well.

    :param df_entry: ``(index, row)`` pair, e.g. one item yielded by ``DataFrame.iterrows()``
    :param smiles_name: name of the column under which the SMILES can be queried from the row
    :return: DataFrame with a single column ``"smiles"`` holding all marker combinations
    """
    from itertools import product

    # unpack passed object (the index is not needed here)
    _, entry = df_entry

    # normalize every marker to a single "@" and cut the string at the markers
    fragments = entry[smiles_name].replace("@@", "@").split("@")
    head, tails = fragments[0], fragments[1:]

    # one marker choice per cut position; product() varies the last position fastest
    variants = [
        head + "".join(marker + tail for marker, tail in zip(markers, tails))
        for markers in product(("@", "@@"), repeat=len(tails))
    ]

    return pd.DataFrame({"smiles": variants})

In [8]:
# inspect the rows that passed the chiral-center filter
df_collection

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,0.0,0.0,0.0,,0.0,,,,TOX22631,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,TOX25538,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=...
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX25530,CCN[C@H]1C[C@H](C)S(=O)(=O)c2sc(S(N)(=O)=O)cc21
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX28633,CCCCCCCCCCCCCCCC(=O)O[C@@H]1CC(C)=C(/C=C/C(C)=...
4,0.0,0.0,1.0,,1.0,0.0,0.0,0.0,1.0,0.0,,0.0,TOX27264,Oc1ccc2c(c1)OC[C@@H](N1CCC(O)(c3ccc(F)cc3)CC1)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,1.0,0.0,TOX28548,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...
223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX25342,CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](N)[C@H]2SC1
224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX4721,CC1(C)[C@@H]2CC[C@@]1(C)C(=O)C2
225,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,,,,TOX27852,OC[C@H](Cc1cccc(O)c1)[C@H](CO)Cc1cccc(O)c1


In [9]:
# expand every remaining molecule into its marker combinations and stack the per-molecule frames;
# each frame keeps its own 0..k index, so index labels repeat in the stacked result
df_collection = pd.concat([
    worker_create_all_combinations(row)
    for row in tqdm(df_collection.iterrows(), total=len(df_collection))
])

100%|██████████| 227/227 [00:00<00:00, 11947.93it/s]


In [10]:
# inspect the enumerated SMILES variants
df_collection

Unnamed: 0,smiles
0,CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
2,CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
3,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
0,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...
...,...
3,OC[C@@H](Cc1cccc(O)c1)[C@@H](CO)Cc1cccc(O)c1
0,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c(...
1,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)c...
2,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c...


In [11]:
# remove rows whose SMILES string already occurred (first occurrence is kept); reassign instead of
# mutating in place so the cell reads as an explicit transformation step
df_collection = df_collection.drop_duplicates(subset=['smiles'])

In [12]:
# inspect the table after removing duplicate SMILES
df_collection

Unnamed: 0,smiles
0,CN[C@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
1,CN[C@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
2,CN[C@@H]1C[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
3,CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
0,CN(C)CCn1nnnc1SCC1=C(C(=O)O)N2C(=O)[C@H](NC(=O...
...,...
3,OC[C@@H](Cc1cccc(O)c1)[C@@H](CO)Cc1cccc(O)c1
0,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c(...
1,O=C(O[C@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)c...
2,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@H]1c1cc(O)c(O)c...


In [None]:
def assign_lu_label(df_entry):
    # unpack passed object
    idx, entry = df_entry

    # extract smiles string
    smiles = entry['smiles']