In [1]:
import os
import openeye.oechem as oechem

In [2]:
from collections import defaultdict
import re

def checkConsecutive(l):
    """Return True if the integers in ``l`` form a single unbroken run.

    The values may be given in any order; they are sorted before checking.
    Duplicated values make the result False, because the sorted values are
    compared against a gap-free, duplicate-free range.

    Parameters
    ----------
    l : iterable of int
        Values to check. One-shot iterators are accepted.

    Returns
    -------
    bool
        True when the sorted values equal ``range(min, max + 1)``. An empty
        input is treated as (vacuously) consecutive and returns True.
    """
    # Source: https://tinyurl.com/y4h2dtd7
    # Sort once and reuse the result: this avoids re-reading ``l`` (which
    # would fail for iterators) and lets the empty case be handled explicitly
    # instead of crashing inside min()/max().
    ordered = sorted(l)
    if not ordered:
        return True
    return ordered == list(range(ordered[0], ordered[-1] + 1))

def natural_sort(l):
    """Return the strings in ``l`` sorted in natural (human) order.

    Runs of ASCII digits are compared as integers and everything else is
    compared case-insensitively, so ``'full_2'`` sorts before ``'full_10'``.

    Parameters
    ----------
    l : iterable of str
        Strings to sort.

    Returns
    -------
    list of str
        A new sorted list; the input is not modified.
    """
    # Source: https://tinyurl.com/y7kfa964
    def alphanum_key(key):
        # re.split with a capturing group alternates text, digits, text, ...
        # so the odd positions are exactly the [0-9]+ runs. Deciding by
        # position rather than str.isdigit() keeps non-ASCII digit characters
        # (e.g. a superscript two) as text instead of handing them to int().
        return [int(token) if position % 2 else token.lower()
                for position, token in enumerate(re.split('([0-9]+)', key))]

    return sorted(l, key=alphanum_key)

## Open the set of molecules written from QCArchive

In [3]:
infile = 'whole_01.sdf'

ifs = oechem.oemolistream()

# The conformer test is set on the stream before opening the file.
# Available tests:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if not ifs.open(infile):
    raise FileNotFoundError(f"Unable to open {infile} for reading")

# GetOEMols() can only be iterated once and is tied to the open stream.
# Copy every molecule into a list so the cells below can be re-run without
# silently looping over an exhausted reader, and close the stream when done.
try:
    mols = [oechem.OEMol(mol) for mol in ifs.GetOEMols()]
finally:
    ifs.close()

## Create a dictionary to identify separated molecules

Here we evaluate the SMILES string of each structure (dictionary key).

Then for each SMILES seen, we store a list of indices for where it's found in the whole QCArchive dataset (dictionary value). 

In [4]:
# smi_dict: SMILES string -> positions (in reading order) of every conformer
#           that has that SMILES
# tlist:    conformer titles, in the same reading order
smi_dict = defaultdict(list)
tlist = []
index = 0  # running position of the current conformer across the whole file

for mol in mols:
    for conf in mol.GetConfs():
        smiles = oechem.OEMolToSmiles(conf)
        title = conf.GetTitle()
        smi_dict[smiles].append(index)
        tlist.append(title)
        index += 1

## Determine which structures are split from non-consecutive indices

If the indices are non-consecutive, such as `5 6 7 30 31 32`, the same structure appears in more than one place in the file: here its six conformers are split into two separate runs. We want to group them together.

In [5]:
# Keep only the SMILES whose conformer positions do not form one contiguous
# run, i.e. the structures whose conformers are scattered through the file.
fix_dict = defaultdict(list)
fix_dict.update(
    (smiles, positions)
    for smiles, positions in smi_dict.items()
    if not checkConsecutive(positions)
)

In [6]:
# preview the first two entries of the dictionary
dict(list(fix_dict.items())[:2])

{'c1cc(ccc1C2C[NH2+]CCc3c2cc(c(c3Cl)O)O)O': [13, 14, 15, 4752, 4753, 4754],
 'c1cc(ccc1C2C[NH2+]CCc3c2cc(c(c3Cl)[O-])O)O': [16, 17, 18, 4755, 4756, 4757]}

## Dataset details
1. total conformers,
2. total molecules,
3. molecules with split up conformers

In [7]:
# Counts, in order: total conformers, distinct SMILES (molecules),
# molecules whose conformers are split up.
n_confs, n_mols, n_split = len(tlist), len(smi_dict), len(fix_dict)
print('\n', n_confs, n_mols, n_split)


 26313 3683 159


## Write out lists of conformer titles by split mols and non-split mols

We want to reorganize and group the split molecules, but the non-split molecules are good to go.

In [8]:
# The title-based bookkeeping (the set difference below and the title lookups
# in later cells) is only valid if every conformer title is unique; a
# duplicate would be silently collapsed by set(). Fail loudly instead.
assert len(set(tlist)) == len(tlist), "conformer titles must be unique"

# Titles of all conformers belonging to split molecules, kept grouped per
# molecule in the order the molecules appear in fix_dict.
tlist_redo = [tlist[v] for val in fix_dict.values() for v in val]

# Every remaining title, restored to natural (numeric) order because the set
# difference does not preserve order.
tlist_good = natural_sort(list(set(tlist).difference(tlist_redo)))
print(len(tlist_redo), len(tlist_good))

2022 24291


In [9]:
# First six "redo" titles. These are grouped per structure, so the numbering
# in the titles can jump between runs.
tlist_redo[:6]

['full_18', 'full_19', 'full_20', 'full_5108', 'full_5109', 'full_5110']

In [10]:
# First six "good" titles, in natural-sorted order.
tlist_good[:6]

['full_1', 'full_2', 'full_3', 'full_4', 'full_5', 'full_6']

## Write titles to text files

In [11]:
# One title per line, in the grouped order of tlist_redo.
with open('titles_redo.txt', 'w') as f:
    f.writelines("%s\n" % item for item in tlist_redo)

In [12]:
# One title per line, in the natural-sorted order of tlist_good.
with open('titles_good.txt', 'w') as f:
    f.writelines("%s\n" % item for item in tlist_good)

## Extract molecules for each set of "redo" and "good"
I did this using molextract.py from OpenEye. The script can be found here:  
https://docs.eyesopen.com/toolkits/python/oechemtk/oechem_examples_summary.html

## For the molextract-ed SDF file of "titles_redo," group conformers by sorting indices

In [13]:
infile = 'whole_02_redo.sdf'

ifs = oechem.oemolistream()

# The conformer test is set on the stream before opening the file.
# Available tests:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if not ifs.open(infile):
    raise FileNotFoundError(f"Unable to open {infile} for reading")

# GetOEMols() can only be iterated once and is tied to the open stream.
# Copy every molecule into a list so the cells below can be re-run without
# silently looping over an exhausted reader, and close the stream when done.
try:
    mols = [oechem.OEMol(mol) for mol in ifs.GetOEMols()]
finally:
    ifs.close()

## Get indices of how conformers should be sorted using the order of the molecule titles

First get a list of all the conformers in this file, which are sorted numerically by title but not yet grouped by structure (i.e., identical structures are not all adjacent).

In [14]:
# Parallel lists in file order: a standalone copy of each conformer and the
# title of that conformer.
list_mols = []
list_names = []

for mol in mols:
    for conf in mol.GetConfs():
        conf_copy = oechem.OEGraphMol(conf)
        conf_title = conf.GetTitle()
        list_mols.append(conf_copy)
        list_names.append(conf_title)

In [15]:
# Titles of the first six conformers as read from the file. Slicing instead
# of indexing with range(6) avoids an IndexError when fewer than six exist.
[mol.GetTitle() for mol in list_mols[:6]]

['full_18', 'full_19', 'full_20', 'full_21', 'full_22', 'full_23']

Then for the molecule-grouped names in `tlist_redo`, get the list indices of how the molecules *should* be grouped.

In [16]:
# Map each title to its first position in list_names once, rather than
# scanning the list with list.index() for every title (quadratic).
# setdefault keeps the first occurrence, matching list.index().
name_to_position = {}
for position, name in enumerate(list_names):
    name_to_position.setdefault(name, position)

# As list.index() did, fail with ValueError if a requested title is absent.
missing_titles = [t for t in tlist_redo if t not in name_to_position]
if missing_titles:
    raise ValueError(f"{missing_titles[0]!r} is not in list_names "
                     f"({len(missing_titles)} title(s) missing)")

# Positions in list_mols/list_names, in the structure-grouped order of
# tlist_redo.
index_redo = [name_to_position[t] for t in tlist_redo]

In [17]:
# First six positions into list_mols, in the grouped order of tlist_redo.
index_redo[:6]

[0, 1, 2, 1497, 1498, 1499]

## Sort the conformers by the numeric indices of the titles

In [18]:
# Reorder the conformers so that all conformers of one structure are adjacent.
sort_mols = []
for position in index_redo:
    sort_mols.append(list_mols[position])

In [19]:
# Titles of the first six reordered conformers. Slicing instead of indexing
# with range(6) avoids an IndexError when fewer than six exist.
[mol.GetTitle() for mol in sort_mols[:6]]

['full_18', 'full_19', 'full_20', 'full_5108', 'full_5109', 'full_5110']

## Write output file with mols sorted by structure (titles no longer in numeric order)

In [20]:
outfile = 'whole_03_redosort.sdf'

ofs = oechem.oemolostream()

if not ofs.open(outfile):
    oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)

# Write the conformers in their structure-grouped order. The finally clause
# closes the output stream even if a write raises part-way through.
try:
    for mol in sort_mols:
        oechem.OEWriteConstMolecule(ofs, mol)
finally:
    ofs.close()

## Concatenate the "good" set written out previously with this fixed/grouped subset
`cat whole_02_good.sdf whole_03_redosort.sdf > whole_04_combine.sdf`

## Read in the whole set with proper conformers

In [None]:
infile = 'whole_04_combine.sdf'

ifs = oechem.oemolistream()

# The conformer test is set on the stream before opening the file.
# Available tests:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if not ifs.open(infile):
    raise FileNotFoundError(f"Unable to open {infile} for reading")

# GetOEMols() can only be iterated once and is tied to the open stream.
# Copy every molecule into a list so the cells below can be re-run without
# silently looping over an exhausted reader, and close the stream when done.
try:
    mols = [oechem.OEMol(mol) for mol in ifs.GetOEMols()]
finally:
    ifs.close()

## Rename titles for numeric order and write to file

In [None]:
# open an outstream file
outfile = 'whole_05_renew.sdf'
ofs = oechem.oemolostream()

if not ofs.open(outfile):
    oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)

# Retitle by molecule: every conformer of the i-th molecule gets the same
# title 'full_<i+1>' (numbered from 1 in file order) and is written out as
# its own record. The finally clause closes the output stream even if a
# write raises part-way through.
try:
    for i, mol in enumerate(mols):
        new_title = f'full_{i+1}'
        for conf in mol.GetConfs():
            conf.SetTitle(new_title)
            oechem.OEWriteConstMolecule(ofs, conf)
finally:
    ofs.close()