# Creation of E/Z dataset

## Import section

In [1]:
import os
import pickle
from builtins import int

## Loading of dataset.

Choice: the RS dataset, as it provides the most entries with only one double bond carrying an E/Z label.

In [2]:
# split of the RS dataset that is converted in this notebook
mode = 'val'
raw_pickle_path = os.path.join("hyperoptimization/src/rs/raw/", f"{mode}.pickle")

# NOTE: unpickling can execute arbitrary code - only load dataset dumps from a trusted source
with open(raw_pickle_path, 'rb') as f:
    df = pickle.load(f)

In [3]:
df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
0,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000001C12B5...,R,0
1,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000001C12A8...,R,0
2,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000001C12A8...,R,0
3,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000001C12A8...,R,0
4,Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1,BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1,<rdkit.Chem.rdchem.Mol object at 0x000001C12A8...,R,0
...,...,...,...,...,...
70094,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000001C1613...,R,0
70095,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000001C1613...,R,0
70096,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000001C1613...,R,0
70097,c1csc(CNCC[C@H]2CCCNC2)n1,c1csc(CNCCC2CCCNC2)n1,<rdkit.Chem.rdchem.Mol object at 0x000001C1613...,R,0


## Keeping only molecules with exactly one double bond that is marked as E/Z (and carries 4 substituents)

In [4]:
def filter_has_one_marked_stereo(df_entry, smiles_name: str = 'ID') -> tuple:
    """
    Counts the E/Z-marked double bonds of one dataframe row.

    A bond is counted if it is a double bond, its stereo tag is STEREOE or STEREOZ and both
    of its atoms have degree 3.

    :param df_entry: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
    :param smiles_name: key of the row entry that holds the SMILES string.
    :return: ``(index, row, count)``; ``count`` is ``-1`` if no molecule could be created
        from the SMILES string.
    """
    # imported inside the function so that it is self-contained when run in a worker process
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit import Chem

    # unpack passed object
    idx, entry = df_entry

    # define allowed stereo types
    allowed_stereo = (
        Chem.rdchem.BondStereo.STEREOZ,
        Chem.rdchem.BondStereo.STEREOE,
    )

    # get molecule
    molecule = smiles_to_3d_mol(
        smiles=entry[smiles_name],
        max_number_of_atoms=100,
        max_number_of_attempts=100
    )
    if molecule is None:
        # same tuple layout as the regular return value: the row itself, not the (index, row) pair
        return idx, entry, -1

    # count the double bonds with marked E/Z whose atoms both have degree 3
    # (original author's note on the degree check: reduces to 1/6th of items with previous ones)
    n_marked_bonds = sum(
        1
        for bond in molecule.GetBonds()
        if (bond.GetBondType() == Chem.rdchem.BondType.DOUBLE) and
           (bond.GetStereo() in allowed_stereo) and
           (bond.GetBeginAtom().GetDegree() == 3) and
           (bond.GetEndAtom().GetDegree() == 3)
    )

    return idx, entry, n_marked_bonds

In [5]:
from tqdm import tqdm
from multiprocess.pool import Pool

# keep a few cores free for the rest of the system, but always use at least one worker:
# os.cpu_count() may return None and "cpu_count - 4" is <= 0 on machines with up to 4 cores,
# which Pool rejects
n_workers = max(1, (os.cpu_count() or 1) - 4)

with Pool(processes=n_workers) as p:
    df_collection = list(p.imap(
        filter_has_one_marked_stereo,
        tqdm(df.iterrows(), total=len(df))
    ))

100%|██████████| 70099/70099 [05:58<00:00, 195.69it/s]


In [6]:
# peek at the first results only - printing the whole list floods the notebook output
df_collection[:3]

[(0,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo                              BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C12F8...
  RS_label                                                                     R
  RS_label_binary                                                              0
  Name: 0, dtype: object,
  0),
 (1,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo                              BrC1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C12F8...
  RS_label                                                                     R
  RS_label_binary                                                              0
  Name: 1, dtype: object,
  0),
 (2,
  ID                                       Br[C@@]1(c2cccc(-c3ccccc3)c2)C=CC=CC1
  SMILES_nostereo             

## Filter by the returned count: keep the entries with exactly one such bond and build a new dataframe

In [7]:
# keep only the results whose third element (the bond count) is exactly one
filtered_list = [
    result
    for result in df_collection
    if result[2] == 1
]

In [8]:
len(filtered_list)

3970

In [9]:
# order the results by their first element, the row index of the source dataframe
filtered_list = list(filtered_list)
filtered_list.sort(key=lambda result: result[0])

In [10]:
# peek at the first results only - printing the whole list floods the notebook output
filtered_list[:3]

[(506,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C15E2...
  RS_label                                                                     S
  RS_label_binary                                                              1
  Name: 506, dtype: object,
  1),
 (507,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
  rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C138A...
  RS_label                                                                     S
  RS_label_binary                                                              1
  Name: 507, dtype: object,
  1),
 (508,
  ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
  SMILES_nostereo   

In [11]:
# keep only the middle element (the dataframe row) of each (index, row, count) tuple
filtered_list = [row for _, row, _ in filtered_list]

In [12]:
# peek at the first rows only - printing the whole list floods the notebook output
filtered_list[:3]

[ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C15E2...
 RS_label                                                                     S
 RS_label_binary                                                              1
 Name: 506, dtype: object,
 ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 rdkit_mol_cistrans_stereo    <rdkit.Chem.rdchem.Mol object at 0x000001C138A...
 RS_label                                                                     S
 RS_label_binary                                                              1
 Name: 507, dtype: object,
 ID                           C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo              C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)

In [13]:
import pandas as pd

In [14]:
type(filtered_list[0])

pandas.core.series.Series

In [15]:
# positional access to the second field of the first row; an integer key on a
# string-indexed Series is deprecated, .iloc states the positional intent explicitly
filtered_list[0].iloc[1]

  filtered_list[0][1]


'C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O)OCC)=C(c2ccccc2)N=3)cc1'

In [16]:
# every Series of the list becomes one row of the new dataframe
new_df = pd.DataFrame(filtered_list)
new_df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
506,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C15E2...,S,1
507,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C138A...,S,1
508,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C15E2...,S,1
509,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C13FC...,S,1
510,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C13FC...,S,1
...,...,...,...,...,...
69850,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
69851,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
69852,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
69853,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0


In [17]:
# renumber the rows 0..n-1 and discard the old index; `drop` expects a bool
# (the former 'index' string only worked because it is truthy)
new_df = new_df.reset_index(drop=True)

## Label generation is integrated into processing

In [18]:
new_df

Unnamed: 0,ID,SMILES_nostereo,rdkit_mol_cistrans_stereo,RS_label,RS_label_binary
0,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C15E2...,S,1
1,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C138A...,S,1
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C15E2...,S,1
3,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C13FC...,S,1
4,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,<rdkit.Chem.rdchem.Mol object at 0x000001C13FC...,S,1
...,...,...,...,...,...
3965,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
3966,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
3967,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0
3968,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x000001C1220...,R,0


## Remove duplicate elements hailing from @/@@ i.e. R/S

In [19]:
def create_all_ez_isomers(df_entry):
    """
    Builds every combination of bond direction markers for the SMILES string of one row.

    The SMILES string (entry ``ID``) is cut at each ``/`` or ``\\`` marker and put together
    again once for every possible assignment of ``/`` and ``\\`` to the marker positions.

    :param df_entry: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.
    :return: dataframe with the columns ``ID`` (one generated SMILES string per row) and
        ``SMILES_nostereo`` (copied from the input row).
    """
    _, row = df_entry

    # cut the SMILES string at every direction marker, regardless of its kind
    fragments = row.ID.replace("\\", "/").split("/")

    # grow the variants marker by marker: first all previous strings continued with a
    # forward slash, then the same strings continued with a backslash
    variants = fragments[:1]
    for fragment in fragments[1:]:
        variants = (
            [prefix + "/" + fragment for prefix in variants]
            + [prefix + "\\" + fragment for prefix in variants]
        )

    return pd.DataFrame({
        "ID": variants,
        "SMILES_nostereo": [row['SMILES_nostereo']] * len(variants)
    })

In [20]:
# create the isomer frame of every row and stack them; ignore_index numbers the rows
# 0..n-1 directly (the former reset_index(drop='index') passed a string to a bool parameter)
df_collection2 = pd.concat(
    list(map(create_all_ez_isomers, tqdm(new_df.iterrows(), total=len(new_df)))),
    ignore_index=True
)
df_collection2

100%|██████████| 3970/3970 [00:00<00:00, 12743.22it/s]


Unnamed: 0,ID,SMILES_nostereo
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
4,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
...,...,...
16515,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16516,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16517,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16518,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1


In [21]:
# remove duplicates: of every identical (ID, SMILES_nostereo) pair only the first row is kept
df_collection2 = df_collection2.drop_duplicates(subset=['ID', 'SMILES_nostereo'])
df_collection2

Unnamed: 0,ID,SMILES_nostereo
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
32,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@H](c2ccc(OC)cc2)...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
...,...,...
16483,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@@H]2C(=O)c2ccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16500,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16501,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1
16502,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1


In [22]:
# label molecules
def generate_labels(df_entry):
    """
    Adds the E/Z label of the molecule to one dataframe row.

    :param df_entry: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``; the row
        holds the SMILES string in ``ID``.
    :return: copy of the row extended by ``EZ_label`` ("E"/"Z") and ``EZ_label_binary``
        (1 for E, 0 for Z), or ``None`` if no molecule could be created or the molecule has
        no double bond marked as E or Z.
    """
    # imported inside the function so that it is self-contained when run in a worker process
    from ptgnn.features.chienn.molecule3d import smiles_to_3d_mol
    from rdkit import Chem

    _, entry = df_entry

    # render molecule
    molecule = smiles_to_3d_mol(entry.ID, max_number_of_atoms=100, max_number_of_attempts=100)

    if molecule is None:
        return None

    # look for double bonds with a marked E/Z configuration
    # NOTE(review): if several bonds are marked, the last one determines the label - confirm
    # that this is intended
    label = None
    for bond in molecule.GetBonds():
        if bond.GetBondType() != Chem.rdchem.BondType.DOUBLE:
            continue

        stereo = bond.GetStereo()
        if stereo == Chem.rdchem.BondStereo.STEREOE:
            label = ("E", 1)
        elif stereo == Chem.rdchem.BondStereo.STEREOZ:
            label = ("Z", 0)

    # without a marked bond there is nothing to label; returning None (instead of the
    # unlabelled row) prevents rows with missing labels in the resulting dataframe
    if label is None:
        return None

    # work on a copy so that the passed row is left untouched
    entry = entry.copy()
    entry['EZ_label'], entry['EZ_label_binary'] = label

    return entry

In [23]:
# label all isomers in parallel worker processes, showing a progress bar over the rows
isomer_rows = tqdm(df_collection2.iterrows(), total=len(df_collection2))
with Pool(processes=os.cpu_count()) as pool:
    df_collection3 = list(pool.imap(generate_labels, isomer_rows))
df_collection3

100%|██████████| 2404/2404 [00:50<00:00, 47.49it/s] 


[ID                 C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           E
 EZ_label_binary                                                    1
 Name: 0, dtype: object,
 ID                 C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           Z
 EZ_label_binary                                                    0
 Name: 1, dtype: object,
 ID                 C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostereo    C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...
 EZ_label                                                           Z
 EZ_label_binary                                                    0
 Name: 2, dtype: object,
 ID                 C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...
 SMILES_nostere

In [24]:
# drop the entries for which no result was returned (None) before building the dataframe
labelled_rows = [row for row in df_collection3 if row is not None]
df_collection3 = pd.DataFrame(labelled_rows)
df_collection3

Unnamed: 0,ID,SMILES_nostereo,EZ_label,EZ_label_binary
0,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
1,C#CCOc1ccc(\C=c2/sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,Z,0
2,C#CCOc1ccc(/C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,Z,0
3,C#CCOc1ccc(\C=c2\sc3n(c2=O)[C@@H](c2ccc(OC)cc2...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
32,C#CCOc1ccc(/C=c2/sc3n(c2=O)[C@H](c2ccc(OC)cc2)...,C#CCOc1ccc(C=c2sc3n(c2=O)C(c2ccc(OC)cc2)C(C(=O...,E,1
...,...,...,...,...
16483,[N-]=[N+]=Nc1ccc(\C=C2\C=CC=C[C@@H]2C(=O)c2ccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,Z,0
16500,[N-]=[N+]=Nc1ccc(/C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,Z,0
16501,[N-]=[N+]=Nc1ccc(\C=C2/C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,E,1
16502,[N-]=[N+]=Nc1ccc(/C=C2\C=CC=C[C@H]2C(=O)c2cccc...,[N-]=[N+]=Nc1ccc(C=C2C=CC=CC2C(=O)c2ccccc2)cc1,E,1


# Import finalized dataset

In [1]:
from ptgnn.dataset.ez_dataset import EZDataset

In [2]:
# instantiate the 'val' split of the E/Z dataset (root is presumably the dataset directory - confirm)
val_ds = EZDataset(root="hyperoptimization/src/ez", split='val')

Processing...
Split: val: 100%|██████████| 2374/2374 [00:36<00:00, 64.87it/s] 
Done!


In [5]:
# instantiate the 'test' split of the E/Z dataset (root is presumably the dataset directory - confirm)
test_ds = EZDataset(root="hyperoptimization/src/ez", split='test')

Starting download and dataset generation for split: test


test: filter: 100%|██████████| 11680/11680 [00:57<00:00, 204.51it/s]
test: isomer creation: 100%|██████████| 606/606 [00:08<00:00, 73.29it/s] 
test: label generation: 100%|██████████| 2640/2640 [00:28<00:00, 93.52it/s] 


Done!


Processing...
Split: test: 100%|██████████| 2640/2640 [00:39<00:00, 66.26it/s] 
Done!


In [6]:
# instantiate the 'train' split of the E/Z dataset (root is presumably the dataset directory - confirm)
train_ds = EZDataset(root="hyperoptimization/src/ez", split='train')

Starting download and dataset generation for split: train


train: filter: 100%|██████████| 55084/55084 [04:22<00:00, 209.84it/s]
train: isomer creation: 100%|██████████| 2804/2804 [00:11<00:00, 242.96it/s]
train: label generation: 100%|██████████| 11808/11808 [02:03<00:00, 95.71it/s] 


Done!


Processing...
Split: train: 100%|██████████| 11784/11784 [02:23<00:00, 82.23it/s] 
Done!


In [3]:
# display only the first sample of the validation split; the loop is left right away and
# `elem` stays bound to that sample for later inspection
for elem in val_ds:
    display(elem)
    break

Data(x=[140, 118], edge_index=[2, 370], edge_attr=[370, 80], pos=[140, 6], parallel_node_index=[140], circle_index=[140], y=[1])

In [4]:
elem.y

tensor([1])