In [10]:
import pandas as pd
import numpy as np
import os
import re
import time

from rdkit import Chem
from openbabel import pybel

In [11]:
def check_missing_files(data_dir: str = './data') -> bool:
    """Report whether every expected dataset file is present.

    Note that, despite the name, the result is ``True`` when *nothing* is
    missing and ``False`` when at least one file is absent.

    Args:
        data_dir: Directory expected to contain the SDF files. Defaults to
            ``'./data'``, so existing calls without arguments keep checking
            the same location as before.

    Returns:
        ``True`` if all expected files exist inside ``data_dir``, otherwise
        ``False``.
    """
    files = (
        'edrug3d.sdf',
        'qm9-1.sdf',
        'qm9-2.sdf',
        'qm9-3.sdf',
        'qm9-4.sdf',
        'qm9-5.sdf',
        'qm9-6.sdf',
        'qm9-7.sdf',
        'qm9-8.sdf',
    )

    # all() stops at the first absent file, like the original early return
    return all(os.path.exists(os.path.join(data_dir, name)) for name in files)

In [12]:
# Download data
#
# The archive is only fetched when check_missing_files() reports that at least
# one of the expected SDF files is absent (it returns True when all are present).
# NOTE(review): presumably the archive unpacks into ./data, which is where
# check_missing_files() looks - confirm against the archive layout.

if not check_missing_files():
    # -nc: do not clobber an already existing download; -O: save the response as data.zip
    !wget -nc -O data.zip "https://hochschulebonnrheinsieg-my.sharepoint.com/:u:/g/personal/nico_piel_365h-brs_de1/ESuGOTn_IflEk7I5HkOFpbwBZKeOk9Qf2nL5JEcq2om6_Q?e=BuUnkT&download=1"
    # -u: only extract entries that are new or newer than the copy already on disk
    !unzip -u data.zip
    # The archive itself is not needed once it has been unpacked
    !rm data.zip

In [13]:
def ac_to_series(smiles: str, filename: str) -> pd.DataFrame:
    """Collect the trailing field of every ``ATOM`` line of an ``.ac`` file.

    Each line of ``filename`` that contains the text ``ATOM`` contributes its
    last whitespace-separated field (presumably the atom type column of the
    antechamber output this notebook generates). The fields are joined with
    ``|`` in file order.

    Args:
        smiles: SMILES string of the molecule the file belongs to; it is
            stored unchanged in the result.
        filename: Path of the ``.ac`` file to read.

    Returns:
        A one-row DataFrame (not a Series, despite the function name) with the
        columns ``smiles`` and ``types``. ``types`` is an empty string when the
        file has no ``ATOM`` lines.
    """
    with open(filename) as file:
        # Take the last whitespace-delimited token instead of a fixed
        # three-character slice: the slice returned an empty or truncated
        # value whenever a line carried trailing whitespace or a field longer
        # than two characters.
        atom_types = [line.split()[-1] for line in file if 'ATOM' in line]

    return pd.DataFrame.from_dict({'smiles': [smiles], 'types': ['|'.join(atom_types)]})

In [14]:
def sdf_to_list(filename: str) -> list:
    """Read an SDF file and cut its text at every ``$$$$`` delimiter.

    The pieces are returned verbatim: the delimiter itself is dropped, but the
    surrounding newlines stay attached to the neighbouring pieces, and the
    text after the last delimiter is returned as a final element.
    """
    with open(filename, "rt") as sdf_handle:
        contents = sdf_handle.read()

    records = contents.split(r'$$$$')
    return records

In [15]:
# Empty result table; the conversion loop adds one row per molecule
df = pd.DataFrame(columns=['smiles', 'types'])

# Raw text pieces of edrug3d.sdf, as returned by sdf_to_list
mols = sdf_to_list('./data/edrug3d.sdf')

In [16]:
for mol in mols:
    # Strip leading new lines
    mol = mol.lstrip()

    # Split on new lines to correct for mistakes made by splitting the sdf molecules
    split = mol.split('\n')
    curr_split_len = len(split)

    if curr_split_len > 5:
        # Inserts a new line if line 4 isn't in the correct place
        if re.compile(r'\s*\d+\s*\d+\s*\d+\s*\d+\s*').match(split[3]) is None:
            mol = '\n' + mol

        # Writes molecule to a file so antechamber can read it
        with open('mol.sdf', 'w') as file:
            file.write(mol)

        # Create an openbabel molecule
        py_mol = pybel.readstring('sdf', mol)

        # Run antechamber and divert output to a file (temporary)
        !antechamber -i mol.sdf -fi mdl -o mol.ac -fo ac -at gaff2 -pf y > .log

        # Sleep so antechamber can finish (requires experimentation)
        time.sleep(0.5)

        smiles = py_mol.write('smi')

        smiles_only = re.compile(r'\s+').split(smiles)[0]

        ac_df = ac_to_series(smiles_only, 'mol.ac')

        df = pd.concat([df, ac_df], ignore_index=True)
    else:
        print('end')

try:
    os.remove('mol.ac')
    os.remove('mol.sdf')
except IOError:
    print('Something went wrong.')

/home/caigh/miniconda3/envs/mm/bin/wrapped_progs/antechamber: Fatal Error!
Weird atomic valence (2) for atom (ID: 6, Name: N4).
       Possible open valence.
/home/caigh/miniconda3/envs/mm/bin/wrapped_progs/antechamber: Fatal Error!
Weird atomic valence (2) for atom (ID: 9, Name: N5).
       Possible open valence.
/home/caigh/miniconda3/envs/mm/bin/wrapped_progs/antechamber: Fatal Error!
GAFF does not have sufficient parameters for molecules having unusual
       elements (those other than H,C,N,O,S,P and halogens).
       To ensure antechamber works properly, one may need to designate
       bond types for bonds involved with unusual elements.
       To do so, simply freeze the bond types by appending "F" or "f" 
       to the corresponding bond types in ac or mol2 files
       and rerun antechamber without unusual element checking via:
       antechamber -dr no 
       Alternatively for metals, see metalpdb2mol2.py in MCPB. 

/home/caigh/miniconda3/envs/mm/bin/wrapped_progs/antechamb

In [18]:
# Write the collected smiles/types table to ./data/edrug3d.csv;
# index=False leaves the row index out of the file.
df.to_csv('./data/edrug3d.csv', index=False)