# Collect set of molecules which were included in benchmark

(Author: David L. Mobley)

As I understand it, this directory contains the molecules in our full benchmark set (in `set`), as well as molecules which were removed from that set due to various failures (in the `issues` directory). We would like to know how many molecules are in the final set, which is information not currently available in this repo. Here, I will attempt to build an authoritative list of only those molecules in the final set.

## Grab list of initial molecules

In [14]:
from openeye import oechem


# Build `full_set_smiles`: the unique canonical isomeric SMILES of every
# molecule listed in the initial benchmark file.
full_set_smiles = set()
ifile = 'set/full_opt_benchmark1.smi'
# The context manager closes the file even if reading raises.
with open(ifile, 'r') as handle:
    text = handle.readlines()

# Read SMILES strings
init_smiles = set()
for line in text:
    # Store line less trailing index/conformer number thingie
    line = line.strip()
    # Walk back over the trailing run of digits. The bounds check keeps blank
    # or all-digit lines from raising IndexError.
    end = len(line)
    while end > 0 and line[end - 1].isdigit():
        end -= 1
    if end == 0:
        # Nothing precedes the trailing number, so there is no SMILES to keep.
        continue
    # Drop the digits plus the single separator character in front of them.
    # NOTE(review): a line with no trailing number also loses its last
    # character here (unchanged from the original logic) -- confirm that every
    # line in the input file carries an index.
    init_smiles.add(line[:end - 1])

# Make into canonical isomeric SMILES, unique
for smi in init_smiles:
    mol = oechem.OEMol()
    oechem.OESmilesToMol(mol, smi)
    # Strings that yield no atoms are skipped rather than stored.
    if mol.NumAtoms() > 0:
        oechem.OEAssignAromaticFlags(mol)
        full_set_smiles.add(oechem.OEMolToSmiles(mol))

print("Obtained %s molecules" % len(full_set_smiles))

Obtained 3859 molecules


## Obtain and remove those molecules which were cleaned out by various things

In [15]:
import glob
# Collect the SMILES of every molecule stored in the per-issue SDF files.
sdffiles = glob.glob('issues/*/*.sdf')

removed_smiles = set()
for sdf_path in sdffiles:
    reader = oechem.oemolistream(sdf_path)
    for entry in reader.GetOEMols():
        # Assign aromaticity flags before writing the SMILES string.
        oechem.OEAssignAromaticFlags(entry)
        removed_smiles.add(oechem.OEMolToSmiles(entry))

# Number of distinct SMILES strings found across all issue files.
print(len(removed_smiles))

121


In [16]:
# In-place set difference: every SMILES in removed_smiles that is present in
# full_set_smiles is dropped; entries that are not present are ignored.
full_set_smiles.difference_update(removed_smiles)

print("Final number of molecules %s" % len(full_set_smiles))

Final number of molecules 3806


It seems slightly odd that I'm claiming to have removed 121 molecules when only 53 of them were actually found in the set (3859 - 3806 = 53); the rest are not in the set. It's possible I'm having some kind of canonicalization issue.

In [17]:
# Obtain and remove molecules which were in OpenFF 1.2 training set
smifiles = glob.glob('overlapping-set/*.smi')

# Collect the SMILES of every molecule read from the overlap files.
removed_smiles = set()
for smi_path in smifiles:
    reader = oechem.oemolistream(smi_path)
    for entry in reader.GetOEMols():
        # Assign aromaticity flags before writing the SMILES string.
        oechem.OEAssignAromaticFlags(entry)
        removed_smiles.add(oechem.OEMolToSmiles(entry))

# Number of distinct SMILES strings found in the overlap files.
print(len(removed_smiles))

# In-place set difference: only SMILES actually present in full_set_smiles
# are dropped; the others are ignored.
full_set_smiles.difference_update(removed_smiles)

print("Final number of molecules %s" % len(full_set_smiles))

419
Final number of molecules 3574


This is also odd; I seem to be having the same canonicalization issue as above, where it's not removing the number of molecules I expect to have removed (419 candidates, but only 3806 - 3574 = 232 were actually removed).

## Store PDFs of the final set

In [18]:
# Generate PDF files visualizing the molecules

from openeye import oedepict
import shutil
import os
# Recreate the output directory so that it starts out empty.
if os.path.isdir('molecule_PDFs'):
    shutil.rmtree('molecule_PDFs')
os.mkdir('molecule_PDFs')


def _mol_from_smiles(smiles):
    """Return a new OEMol populated by parsing *smiles*."""
    parsed = oechem.OEMol()
    oechem.OESmilesToMol(parsed, smiles)
    return parsed


# One molecule object per SMILES string remaining in the final set.
oemols = [_mol_from_smiles(smi) for smi in full_set_smiles]

itf = oechem.OEInterface()
PageByPage = True
suppress_h = True

# Report layout: a rows x cols grid of cells, with header, footer, cell gap
# and page margins set explicitly.
rows = 10
cols = 6
report_opts = oedepict.OEReportOptions(rows, cols)
report_opts.SetHeaderHeight(25)
report_opts.SetFooterHeight(25)
report_opts.SetCellGap(2)
report_opts.SetPageMargins(10)
report = oedepict.OEReport(report_opts)

# Display options are sized from the report's cell dimensions.
cellwidth = report.GetCellWidth()
cellheight = report.GetCellHeight()
display_opts = oedepict.OE2DMolDisplayOptions(
    cellwidth, cellheight, oedepict.OEScale_Default * 0.5
)
display_opts.SetAromaticStyle(oedepict.OEAromaticStyle_Circle)
bond_pen = oedepict.OEPen(oechem.OEBlack, oechem.OEBlack, oedepict.OEFill_On, 1.0)
display_opts.SetDefaultBondPen(bond_pen)
oedepict.OESetup2DMolDisplayOptions(display_opts, itf)

# Render each molecule into its own new report cell.
for mol in oemols:
    cell = report.NewCell()
    # Depiction is prepared on a copy; presumably this keeps the molecules
    # stored in oemols unmodified -- confirm against the OEDepict docs.
    depicted = oechem.OEMol(mol)
    oedepict.OEPrepareDepiction(depicted, False, suppress_h)
    mol_display = oedepict.OE2DMolDisplay(depicted, display_opts)
    oedepict.OERenderMolecule(cell, mol_display)

oedepict.OEWriteReport("molecule_PDFs/molecules.pdf" , report)

True

In [20]:
# Write every molecule in oemols to final_set.smi, then close the stream.
smi_out = oechem.oemolostream('final_set.smi')
for mol in oemols:
    oechem.OEWriteConstMolecule(smi_out, mol)
smi_out.close()