In [1]:
import os
import openeye.oechem as oechem

In [2]:
from collections import defaultdict
import re

def checkConsecutive(l):
    """Return True when the integers in ``l`` form one gap-free run.

    The values may be given in any order. A duplicate value or a missing
    value between the minimum and maximum makes the result False.

    Parameters
    ----------
    l : sequence of int
        Indices to test.

    Returns
    -------
    bool
        True if ``sorted(l)`` equals ``range(min(l), max(l) + 1)``.
        An empty sequence has no gaps and is reported as True
        (``min``/``max`` would otherwise raise ValueError on it).
    """
    # Source: https://tinyurl.com/y4h2dtd7
    if len(l) == 0:
        return True
    return sorted(l) == list(range(min(l), max(l) + 1))

def natural_sort(l):
    """Return ``l`` sorted so that embedded ASCII numbers compare numerically.

    For example ``'full_2'`` sorts before ``'full_10'``. Text between the
    numbers is compared case-insensitively.

    Parameters
    ----------
    l : iterable of str
        Strings to sort.

    Returns
    -------
    list of str
        A new sorted list; the input is not modified.
    """
    # Source: https://tinyurl.com/y7kfa964
    def alphanum_key(key):
        # re.split with one capturing group alternates text / digit-run, so
        # odd positions are always ASCII digit runs. Converting by position
        # (rather than str.isdigit) avoids int() failing on characters such
        # as superscript digits, which isdigit() accepts but int() rejects.
        parts = re.split('([0-9]+)', key)
        return [int(part) if pos % 2 else part.lower()
                for pos, part in enumerate(parts)]

    return sorted(l, key=alphanum_key)

## Open the set of molecules

In [3]:
# Input SDF to analyze (the variable is named `outfile` but is opened for reading).
# Alternative inputs used previously:
#outfile = 'archive/checkconfs.sdf'
#outfile = 'whole_04_combine.sdf'
outfile = 'whole.sdf'

ifs = oechem.oemolistream()

# The conformer test must be set on the stream before molecules are read.
# Available tests:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if not ifs.open(outfile):
    raise FileNotFoundError(f"Unable to open {outfile} for reading")
# NOTE(review): GetOEMols() presumably yields lazily, so `ifs` must stay open
# until a later cell has iterated over `mols` -- confirm.
mols = ifs.GetOEMols()

## Create dictionary with SMILES key and index value

In [4]:
# smi_dict: SMILES string -> list of running conformer positions (file order)
# tlist:    conformer titles, in the same running order
smi_dict = defaultdict(list)
tlist = []
index = 0

for mol in mols:
    for conf in mol.GetConfs():
        smiles = oechem.OEMolToSmiles(conf)
        # len(tlist) is the running position of this conformer
        smi_dict[smiles].append(len(tlist))
        tlist.append(conf.GetTitle())

# total number of conformers seen
index = len(tlist)

## Determine which molecules have conformers split across non-consecutive indices

In [5]:
# Keep only the SMILES whose conformer positions do not form one contiguous run
fix_dict = defaultdict(list, {
    smiles: positions
    for smiles, positions in smi_dict.items()
    if not checkConsecutive(positions)
})

In [6]:
# Display the SMILES -> conformer-position lists that are not contiguous
fix_dict

defaultdict(list,
            {'c1cc(ccc1C2C[NH2+]CCc3c2cc(c(c3Cl)O)O)O': [13,
              14,
              15,
              4752,
              4753,
              4754],
             'c1cc(ccc1C2C[NH2+]CCc3c2cc(c(c3Cl)[O-])O)O': [16,
              17,
              18,
              4755,
              4756,
              4757],
             'COc1ccccc1OCC(CO)O': [38,
              39,
              40,
              41,
              42,
              43,
              44,
              45,
              46,
              47,
              48,
              49,
              50,
              51,
              52,
              53,
              54,
              55,
              56,
              57,
              5856,
              5857,
              5858,
              5859,
              5860,
              5861,
              5862,
              5863,
              5864,
              5865,
              5866,
              5867,
              5868,
              5869,
 

## Numbers of: total conformers, total molecules, molecules with split up conformers

In [7]:
# Counts, in order: conformers, distinct SMILES, SMILES with non-contiguous conformers
print('\n', *(len(group) for group in (tlist, smi_dict, fix_dict)))


 26313 3683 159


## Partition all conformer titles into split (redo) and non-split (good)

In [8]:
# Titles of every conformer that belongs to a split molecule, in fix_dict order
tlist_redo = [tlist[v] for positions in fix_dict.values() for v in positions]

# All remaining titles (de-duplicated through a set), restored to natural order
remaining_titles = set(tlist).difference(tlist_redo)
tlist_good = natural_sort(list(remaining_titles))
print(len(tlist_redo), len(tlist_good))

2022 24291


In [9]:
# Display the titles of conformers that belong to split molecules
tlist_redo

['full_18',
 'full_19',
 'full_20',
 'full_5108',
 'full_5109',
 'full_5110',
 'full_21',
 'full_22',
 'full_23',
 'full_5111',
 'full_5112',
 'full_5113',
 'full_44',
 'full_45',
 'full_46',
 'full_47',
 'full_48',
 'full_49',
 'full_50',
 'full_51',
 'full_52',
 'full_53',
 'full_54',
 'full_55',
 'full_56',
 'full_57',
 'full_58',
 'full_59',
 'full_60',
 'full_61',
 'full_62',
 'full_63',
 'full_6228',
 'full_6229',
 'full_6230',
 'full_6231',
 'full_6232',
 'full_6233',
 'full_6234',
 'full_6235',
 'full_6236',
 'full_6237',
 'full_6238',
 'full_6239',
 'full_6240',
 'full_6241',
 'full_6242',
 'full_6243',
 'full_6244',
 'full_6245',
 'full_6246',
 'full_6247',
 'full_82',
 'full_250',
 'full_256',
 'full_83',
 'full_87',
 'full_245',
 'full_246',
 'full_247',
 'full_248',
 'full_249',
 'full_251',
 'full_252',
 'full_253',
 'full_254',
 'full_255',
 'full_257',
 'full_258',
 'full_259',
 'full_260',
 'full_261',
 'full_262',
 'full_263',
 'full_264',
 'full_265',
 'full_266',
 '

In [10]:
# Display the naturally sorted titles of conformers that need no regrouping
tlist_good

['full_1',
 'full_2',
 'full_3',
 'full_4',
 'full_5',
 'full_6',
 'full_7',
 'full_8',
 'full_9',
 'full_10',
 'full_14',
 'full_16',
 'full_17',
 'full_24',
 'full_25',
 'full_26',
 'full_27',
 'full_28',
 'full_29',
 'full_30',
 'full_31',
 'full_32',
 'full_33',
 'full_34',
 'full_35',
 'full_36',
 'full_37',
 'full_38',
 'full_40',
 'full_41',
 'full_42',
 'full_43',
 'full_64',
 'full_65',
 'full_66',
 'full_67',
 'full_68',
 'full_69',
 'full_70',
 'full_71',
 'full_72',
 'full_73',
 'full_74',
 'full_75',
 'full_76',
 'full_77',
 'full_78',
 'full_79',
 'full_80',
 'full_81',
 'full_84',
 'full_85',
 'full_86',
 'full_88',
 'full_105',
 'full_112',
 'full_114',
 'full_115',
 'full_116',
 'full_117',
 'full_118',
 'full_119',
 'full_120',
 'full_121',
 'full_122',
 'full_123',
 'full_124',
 'full_125',
 'full_126',
 'full_127',
 'full_128',
 'full_129',
 'full_130',
 'full_131',
 'full_132',
 'full_133',
 'full_135',
 'full_136',
 'full_137',
 'full_138',
 'full_139',
 'full_140

## Write titles to text files

In [11]:
# One title per line
with open('titles_redo.txt', 'w') as f:
    f.writelines("%s\n" % item for item in tlist_redo)

In [12]:
# One title per line
with open('titles_good.txt', 'w') as f:
    f.writelines("%s\n" % item for item in tlist_good)

## For molextract-ed SDF of titles_redo, sort to group conformers

In [13]:
# SDF holding the molecules extracted by the titles in titles_redo
outfile = 'whole_02_redo.sdf'

ifs = oechem.oemolistream()

# Conformer test choices:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if ifs.open(outfile):
    mols = ifs.GetOEMols()
else:
    raise FileNotFoundError(f"Unable to open {outfile} for reading")

## Get indices of how confs should be sorted using the order of the titles

In [14]:
# list_mols:  one OEGraphMol per conformer, in file order
# list_names: the matching conformer titles, same order
list_mols = []
list_names = []

for mol in mols:
    for conf in mol.GetConfs():
        # NOTE(review): OEGraphMol(conf) presumably makes an independent
        # single-conformer copy that outlives the stream iteration -- confirm.
        single_conf_mol = oechem.OEGraphMol(conf)
        list_mols.append(single_conf_mol)
        list_names.append(conf.GetTitle())

In [15]:
# Peek at the first few titles; slicing avoids an IndexError when fewer than
# four conformers were read.
[mol.GetTitle() for mol in list_mols[:4]]

['full_18', 'full_19', 'full_20', 'full_21']

In [16]:
# Map each title to the position of its FIRST occurrence in list_names.
# This gives the same answer as list_names.index(t) without rescanning the
# list for every title (which is quadratic in the number of conformers).
first_position = {}
for position, name in enumerate(list_names):
    first_position.setdefault(name, position)

index_redo = []
for t in tlist_redo:
    if t not in first_position:
        # Same exception type list.index() raises for a missing value
        raise ValueError(f"{t!r} is not in list")
    index_redo.append(first_position[t])

In [17]:
# Display, for each title in tlist_redo, its position within list_names
index_redo

[0,
 1,
 2,
 1497,
 1498,
 1499,
 3,
 4,
 5,
 1500,
 1501,
 1502,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 1907,
 1908,
 1909,
 1910,
 1911,
 1912,
 1913,
 1914,
 1915,
 1916,
 1917,
 1918,
 1919,
 1920,
 1921,
 1922,
 1923,
 1924,
 1925,
 1926,
 26,
 34,
 40,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 35,
 36,
 37,
 38,
 39,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 1010,
 1011,
 1012,
 1013,
 1014,
 1015,
 1016,
 1017,
 1018,
 1019,
 1020,
 1021,
 1022,
 1023,
 1024,
 1025,
 1026,
 1027,
 1028,
 1029,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 1030,
 1031,
 1032,
 1033,
 1034,
 1035,
 1036,
 1037,
 1038,
 1039,
 1040,
 1041,
 1042,
 1043,
 1044,
 1045,
 1046,
 1047,
 1048,
 1049,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 533,
 534,
 535,
 536,
 537,
 538,
 539,
 540,
 541,
 542,
 543,
 92,
 93,
 94,
 689,
 

## Sort the confs by the indices of the titles

In [18]:
# Reorder the conformers so they follow the order of tlist_redo
sort_mols = []
for position in index_redo:
    sort_mols.append(list_mols[position])

In [19]:
# Peek at the first few titles after sorting; slicing avoids an IndexError
# when fewer than four conformers are present.
[mol.GetTitle() for mol in sort_mols[:4]]

['full_18', 'full_19', 'full_20', 'full_5108']

## Write output file with mols sorted by conformer (titles no longer in numeric order)

In [20]:
# open an outstream file
outfile = 'whole_03_redosort.sdf'
ofs = oechem.oemolostream()
#if os.path.exists(outfile):
#    raise FileExistsError("Output file {} already exists in {}".format(
#        outfile, os.getcwd()))
if not ofs.open(outfile):
    oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)


for mol in sort_mols:
    # write molecule to file
    oechem.OEWriteConstMolecule(ofs, mol)
ofs.close()

## Read in the whole set with proper conformers

In [27]:
# Combined SDF in which conformers of the same molecule are adjacent
outfile = 'whole_04_combine.sdf'

ifs = oechem.oemolistream()

# Conformer test choices:
# OEDefaultConfTest, OEIsomericConfTest, OEOmegaConfTest, OEAbsoluteConfTest, OEAbsCanonicalConfTest
# https://docs.eyesopen.com/toolkits/python/oechemtk/oemol.html
ifs.SetConfTest(oechem.OEAbsCanonicalConfTest())

if ifs.open(outfile):
    mols = ifs.GetOEMols()
else:
    raise FileNotFoundError(f"Unable to open {outfile} for reading")

## Rename titles for numeric order and write to file

In [28]:
# open an outstream file
outfile = 'whole_05_renew.sdf'
ofs = oechem.oemolostream()
#if os.path.exists(outfile):
#    raise FileExistsError("Output file {} already exists in {}".format(
#        outfile, os.getcwd()))
if not ofs.open(outfile):
    oechem.OEThrow.Fatal("Unable to open %s for writing" % outfile)


for i, mol in enumerate(mols):
    for j, conf in enumerate(mol.GetConfs()):
        conf.SetTitle(f'full_{i+1}')
        oechem.OEWriteConstMolecule(ofs, conf)
ofs.close()