# Creation of E/Z dataset

## Import section

In [1]:
import os
import pickle
from builtins import int

## Loading of dataset.

choice: rs dataset as most datasets with only one double bond with e/z label

In [2]:
mode = 'val'
with open(os.path.join("hyperoptimization/src/rs/raw/", f"{mode}.pickle"), 'rb') as f:
    df = pickle.load(f)

In [3]:
df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
0,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000002DC27A...,R,0
1,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000002DC2EE...,R,0
2,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000002DC2EE...,R,0
3,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000002DC2EE...,R,0
4,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000002DC2EE...,R,0
...,...,...,...,...,...
70094,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000002DC659...,R,0
70095,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000002DC659...,R,0
70096,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000002DC659...,R,0
70097,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000002DC659...,R,0


## Filtering only molecules with one double bond that is a stereocenter (and has 4 constituents)

In [4]:
def filter_has_one_marked_stereo(df_entry, smiles_name: str = 'ID') -> bool:
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit import Chem

    # unpack passed object
    idx, entry = df_entry

    # define allowed stereo types
    allowed_stereo = [
        Chem.rdchem.BondStereo.STEREOZ,
        Chem.rdchem.BondStereo.STEREOE
    ]

    # extract smiles string
    smiles = entry[smiles_name]

    # get molecule
    molecule = smiles_to_3d_mol(
        smiles=smiles,
        max_number_of_atoms=100,
        max_number_of_attempts=100
    )
    if molecule is None:
        return idx, df_entry, -1

    # determine whether molecule has exactly one double bond with marked E/Z
    from rdkit import Chem
    bonds = [
        bond
        for bond in molecule.GetBonds()
        if (bond.GetBondType() == Chem.rdchem.BondType.DOUBLE) and
           (bond.GetStereo() in allowed_stereo) and
           (bond.GetBeginAtom().GetDegree() == 3) and
           (bond.GetEndAtom().GetDegree() == 3) # reduces to 1/6th of items with previous ones
    ]

    #if len(bonds) == 1:
    #    entry['EZ_label'] = "E" if bonds[0].GetStereo() == Chem.rdchem.BondStereo.STEREOE else "Z"
    #    entry['EZ_label_binary'] = 1 if bonds[0].GetStereo() == Chem.rdchem.BondStereo.STEREOE else 0

    return idx, entry, len(bonds)

In [7]:
from tqdm import tqdm
from multiprocess.pool import Pool

with Pool(processes=os.cpu_count()-4) as p:
    df_collection = list(p.imap(
        filter_has_one_marked_stereo,
        tqdm(df.iterrows(), total=len(df))
    ))

100%|██████████| 70099/70099 [06:12<00:00, 188.38it/s]


In [8]:
df_collection

[(0,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo                              BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC713...
  RS_label                                                                     R
  RS_label_binary                                                              0
  Name: 0, dtype: object,
  0),
 (1,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo                              BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC713...
  RS_label                                                                     R
  RS_label_binary                                                              0
  Name: 1, dtype: object,
  0),
 (2,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo             

## Filter according to returned value, need exactly one and build new dataframe

In [9]:
filtered_list = list(filter(
    lambda x: x[2] == 1,
    df_collection
))

In [10]:
len(filtered_list)

3970

In [11]:
filtered_list = sorted(filtered_list, key=lambda x: x[0])

In [12]:
filtered_list

[(506,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC717...
  RS_label                                                                     S
  RS_label_binary                                                              1
  Name: 506, dtype: object,
  1),
 (507,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC717...
  RS_label                                                                     S
  RS_label_binary                                                              1
  Name: 507, dtype: object,
  1),
 (508,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo   

In [13]:
# remove first and last element from tuples
filtered_list = [
    elem[1]
    for elem in filtered_list
]

In [14]:
filtered_list

[ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC717...
 RS_label                                                                     S
 RS_label_binary                                                              1
 Name: 506, dtype: object,
 ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000002DC717...
 RS_label                                                                     S
 RS_label_binary                                                              1
 Name: 507, dtype: object,
 ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)

In [15]:
import pandas as pd

In [16]:
type(filtered_list[0])

pandas.core.series.Series

In [17]:
filtered_list[0][1]

  filtered_list[0][1]


'C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)cc1'

In [18]:
new_df = pd.DataFrame(filtered_list)
new_df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
506,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
507,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
508,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
509,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
510,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
...,...,...,...,...,...
69850,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
69851,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
69852,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
69853,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0


In [19]:
new_df.reset_index(drop='index', inplace=True)

## Label generation is integrated into processing

In [20]:
new_df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
0,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
1,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
3,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
4,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000002DC717...,S,1
...,...,...,...,...,...
3965,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
3966,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
3967,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0
3968,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000002DC239...,R,0


## Remove duplicate elements hailing from @/@@ i.e. R/S

In [21]:
new_df.groupby('SMILES_nostereo').count()

Unnamed: 0_level_0,ID,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
SMILES_nostereo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)cc1,16,16,16,16
C#CCOc1ccc(C=c2sc3n(c2=O)C(c2cccc(OC)c2OCC)C(C(=O)OCCOC)=C(C)N=3)cc1,19,19,19,19
C#CCn1cc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)c2ccccc21,20,20,20,20
C1=CC=C(C2C=C(c3cccc(-c4ccc5c6c(c7ccccc7c5c4)N=CC=CCC6)c3)C=C(c3ccccc3)N2)CC=C1,10,10,10,10
C=C(C)C(=C(N)N=C(C)C(=C)c1ccc(F)cc1NCc1ccccc1F)C(C)(C=O)c1cnn(CC(C)(C)C(=O)O)c1,10,10,10,10
...,...,...,...,...
OC1(C(F)(F)F)CC(C=Cc2cccs2)=NO1,10,10,10,10
OCC=CC1=CC2=C(CC1)C(NCCc1c[nH]c3ccccc13)CCC2,10,10,10,10
[N-]=[N+]=NC(C=Cc1ccc(O)c(O)c1)[N+](=O)[O-],10,10,10,10
[N-]=[N+]=NC1=Cc2ccccc2C1CC=CCBr,10,10,10,10


In [22]:
new_df.groupby('SMILES_nostereo')['RS_label_binary'].sum()

SMILES_nostereo
C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)cc1              8
C#CCOc1ccc(C=c2sc3n(c2=O)C(c2cccc(OC)c2OCC)C(C(=O)OCCOC)=C(C)N=3)cc1                9
C#CCn1cc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)c2ccccc21         10
C1=CC=C(C2C=C(c3cccc(-c4ccc5c6c(c7ccccc7c5c4)N=CC=CCC6)c3)C=C(c3ccccc3)N2)CC=C1     2
C=C(C)C(=C(N)N=C(C)C(=C)c1ccc(F)cc1NCc1ccccc1F)C(C)(C=O)c1cnn(CC(C)(C)C(=O)O)c1     5
                                                                                   ..
OC1(C(F)(F)F)CC(C=Cc2cccs2)=NO1                                                     5
OCC=CC1=CC2=C(CC1)C(NCCc1c[nH]c3ccccc13)CCC2                                        5
[N-]=[N+]=NC(C=Cc1ccc(O)c(O)c1)[N+](=O)[O-]                                         4
[N-]=[N+]=NC1=Cc2ccccc2C1CC=CCBr                                                    5
[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1                                      5
Name: RS_label_binary, Length: 285, dt

In [23]:
new_df['EZ_label_binary'].sum()/len(new_df)

KeyError: 'EZ_label_binary'

In [None]:
from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol

smiles = new_df.iloc[66]['ID']

test_mol = smiles_to_3d_mol(
    smiles=smiles,
    max_number_of_atoms=100,
    max_number_of_attempts=100
)

In [None]:
test_mol

In [None]:
smiles

In [None]:
# transform all to one sign
smiles = smiles.replace("\\", "/")
smiles

In [None]:
# split string
split_smiles = smiles.split("/")
split_smiles

In [None]:
assert len(split_smiles) == 3
# produce all variations
new_smiles = [
    split_smiles[0] + "/" + split_smiles[1] + "/" + split_smiles[2],
    split_smiles[0] + "/" + split_smiles[1] + "\\" + split_smiles[2],
    split_smiles[0] + "\\" + split_smiles[1] + "/" + split_smiles[2],
    split_smiles[0] + "\\" + split_smiles[1] + "\\" + split_smiles[2],
]
new_smiles

In [None]:
# init new smiles
new_smiles = [split_smiles[0]]

for i in range(1, len(split_smiles)):
    # duplicate elements
    new_smiles = new_smiles * 2

    # append new content
    new_smiles[:int(len(new_smiles)/2)] = [
        elem + "/" + split_smiles[i]
        for elem in new_smiles[:int(len(new_smiles)/2)]
    ]
    new_smiles[int(len(new_smiles)/2):] = [
        elem + "\\" + split_smiles[i]
        for elem in new_smiles[int(len(new_smiles)/2):]
    ]

In [None]:
new_smiles

In [None]:
# problem more than 2 such signs in molecules - options
# 1. ignore and add even more elements
# 2. modify molecule to set bond type and generate smiles

In [None]:
from rdkit import Chem

Chem.MolToSmiles(test_mol)

In [26]:
def create_all_ez_isomers(df_entry):

    idx, entry = df_entry

    # fetch smiles string
    smiles = entry.ID

    # transform all to one sign
    smiles = smiles.replace("\\", "/")

    # split string
    split_smiles = smiles.split("/")

    # init new smiles
    new_smiles = [split_smiles[0]]

    for i in range(1, len(split_smiles)):
        # duplicate elements
        new_smiles = new_smiles * 2

        # append new content
        new_smiles[:int(len(new_smiles)/2)] = [
            elem + "/" + split_smiles[i]
            for elem in new_smiles[:int(len(new_smiles)/2)]
        ]
        new_smiles[int(len(new_smiles)/2):] = [
            elem + "\\" + split_smiles[i]
            for elem in new_smiles[int(len(new_smiles)/2):]
        ]

    return pd.DataFrame({
        "ID": new_smiles,
        "SMILES_nostereo": [entry['SMILES_nostereo']]*len(new_smiles)
    })

In [27]:
df_collection2 = pd.concat(list(map(create_all_ez_isomers, tqdm(new_df.iterrows(), total=len(new_df)))))
df_collection2.reset_index(inplace=True, drop='index')
df_collection2

100%|██████████| 3970/3970 [00:00<00:00, 10621.89it/s]


Unnamed: 0,ID,SMILES_nostereo
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
4,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
...,...,...
16515,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16516,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16517,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16518,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1


In [28]:
# remove duplicates
df_collection2.drop_duplicates(subset=['ID', 'SMILES_nostereo'], inplace=True)
df_collection2

Unnamed: 0,ID,SMILES_nostereo
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
32,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@H](c2ccc(OC)cc2)...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
...,...,...
16483,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@@H]2C(=O)c2ccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16500,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16501,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16502,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1


In [43]:
# label molecules
def generate_labels(df_entry):
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit import Chem

    idx, entry = df_entry

    # render molecule
    molecule = smiles_to_3d_mol(entry.ID, max_number_of_atoms=100, max_number_of_attempts=100)

    if molecule is None:
        return None

    # iterate over label until the E/Z double bond is found.
    for bond in molecule.GetBonds():
        if bond.GetBondType() == Chem.rdchem.BondType.DOUBLE:
            if bond.GetStereo() == Chem.rdchem.BondStereo.STEREOE:
                entry['EZ_label'] = "E"
                entry['EZ_label_binary'] = 1
            elif bond.GetStereo() == Chem.rdchem.BondStereo.STEREOZ:
                entry['EZ_label'] = "Z"
                entry['EZ_label_binary'] = 0
            else:
                continue

    return entry

In [46]:
with Pool(processes=os.cpu_count()) as p:
    df_collection3 = list(p.imap(generate_labels, tqdm(df_collection2.iterrows(), total=len(df_collection2))))
df_collection3

100%|██████████| 2404/2404 [01:01<00:00, 39.28it/s] 


[ID                 C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           E
 EZ_label_binary                                                    1
 Name: 0, dtype: object,
 ID                 C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           Z
 EZ_label_binary                                                    0
 Name: 1, dtype: object,
 ID                 C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           Z
 EZ_label_binary                                                    0
 Name: 2, dtype: object,
 ID                 C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostere

In [49]:
df_collection3 = pd.DataFrame([elem for elem in df_collection3 if elem is not None])
df_collection3

Unnamed: 0,ID,SMILES_nostereo,EZ_label,EZ_label_binary
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,Z,0
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,Z,0
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
32,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@H](c2ccc(OC)cc2)...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
...,...,...,...,...
16483,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@@H]2C(=O)c2ccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,Z,0
16500,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,Z,0
16501,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,E,1
16502,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,E,1


In [50]:
df_collection3.EZ_label_binary.sum()

1188

In [1]:
from ptgnn.dataset.ez_dataset import EZDataset

In [2]:
val_ds = EZDataset(root="hyperoptimization/src/ez", split='val')

Processing...
Split: val: 100%|██████████| 2374/2374 [00:36<00:00, 64.87it/s] 
Done!


In [5]:
test_ds = EZDataset(root="hyperoptimization/src/ez", split='test')

Starting download and dataset generation for split: test


test: filter: 100%|██████████| 11680/11680 [00:57<00:00, 204.51it/s]
test: isomer creation: 100%|██████████| 606/606 [00:08<00:00, 73.29it/s] 
test: label generation: 100%|██████████| 2640/2640 [00:28<00:00, 93.52it/s] 


Done!


Processing...
Split: test: 100%|██████████| 2640/2640 [00:39<00:00, 66.26it/s] 
Done!


In [6]:
train_ds = EZDataset(root="hyperoptimization/src/ez", split='train')

Starting download and dataset generation for split: train


train: filter: 100%|██████████| 55084/55084 [04:22<00:00, 209.84it/s]
train: isomer creation: 100%|██████████| 2804/2804 [00:11<00:00, 242.96it/s]
train: label generation: 100%|██████████| 11808/11808 [02:03<00:00, 95.71it/s] 


Done!


Processing...
Split: train: 100%|██████████| 11784/11784 [02:23<00:00, 82.23it/s] 
Done!


In [3]:
for elem in val_ds:
    display(elem)
    break

Data(x=[140, 118], edge_index=[2, 370], edge_attr=[370, 80], pos=[140, 6], parallel_node_index=[140], circle_index=[140], y=[1])

In [4]:
elem.y

tensor([1])