In [36]:
import pandas as pd
import numpy as np
import os
import re
import time
import threading
import subprocess

from rdkit import Chem
from openbabel import pybel

if not os.path.exists('./temp'):
    os.mkdir('./temp')

In [37]:
files = [
    'edrug3d.sdf',
    'qm9-1.sdf',
    'qm9-2.sdf',
    'qm9-3.sdf',
    'qm9-4.sdf',
    'qm9-5.sdf',
    'qm9-6.sdf',
    'qm9-7.sdf',
    'qm9-8.sdf'
]


def check_missing_files():
    """Checks for missing files. Returns true, if all files are present."""
    for file in files:
        if not os.path.exists('./data/' + file):
            return False

    return True

In [38]:
# Download data

if not check_missing_files():
    !wget -nc -O data.zip "https://hochschulebonnrheinsieg-my.sharepoint.com/:u:/g/personal/nico_piel_365h-brs_de1/ESuGOTn_IflEk7I5HkOFpbwBZKeOk9Qf2nL5JEcq2om6_Q?e=BuUnkT&download=1"
    !unzip -u data.zip
    !rm data.zip

In [39]:
def ac_to_series(smiles: str, filename: str) -> pd.DataFrame:
    with open(filename) as file:
        lines = file.readlines()

        out = []

        for line in lines:
            if ('ATOM' in line):
                strng = str(line[-3:]).strip()
                out.append(strng)

        output = '|'.join(out)

        return pd.DataFrame.from_dict({'smiles': [smiles], 'types': [output]})

In [40]:
def sdf_to_list(filename: str) -> list:
    """Converts an sdf file to a Python list."""
    with open(filename, "rt") as file:
        return file.read().split(r'$$$$')

In [41]:
mol_df = pd.DataFrame(columns=['smiles', 'types'])

In [42]:
def sdf_to_df(file_name: str, data_frame: pd.DataFrame) -> pd.DataFrame:
    mols = sdf_to_list('./data/' + file_name)
    df = pd.DataFrame(columns=['smiles', 'types'])
    file_name_split = file_name.split('.')[0]

    if not os.path.exists('./temp/' + file_name_split):
        os.mkdir('./temp/' + file_name_split)

    new_path = './temp/' + file_name_split + '/'

    for mol in mols:
        # Strip leading new lines
        mol = mol.lstrip()

        # Split on new lines to correct for mistakes made by splitting the sdf molecules
        split = mol.split('\n')
        curr_split_len = len(split)

        if curr_split_len > 5:
            # Inserts a new line if line 4 isn't in the correct place
            if re.compile(r'\s*\d+\s*\d+\s*\d+\s*\d+\s*').match(split[3]) is None:
                mol = '\n' + mol

            # Writes molecule to a file so antechamber can read it
            with open(new_path + 'mol.sdf', 'w') as file:
                file.write(mol)

            # Create an openbabel molecule
            py_mol = pybel.readstring('sdf', mol)

            # Run antechamber and divert output to a file (temporary)

            subprocess.getoutput(
                f"antechamber -i {new_path}mol.sdf -fi mdl -o {new_path}mol.ac -fo ac -at gaff2 -pf y"
            )

            # Sleep so antechamber can finish (requires experimentation)
            # time.sleep(0.5)
            smiles = py_mol.write('smi')

            smiles_only = re.compile(r'\s+').split(smiles)[0]

            ac_df = ac_to_series(smiles_only, new_path + 'mol.ac')

            df = pd.concat([df, ac_df], ignore_index=True)
        else:
            print('end')

    try:
        os.remove('mol.ac')
        os.remove('mol.sdf')
    except IOError:
        print('Something went wrong.')

    data_frame = pd.concat([data_frame, df], ignore_index=True)

    return df

In [43]:
class PreprocessingThread(threading.Thread):
    def __init__(self, file_name, data_frame):
        threading.Thread.__init__(self)
        self.file_name = file_name
        self.data_frame = data_frame

    def run(self):
        print("Starting " + self.file_name)
        sdf_to_df(self.file_name, self.data_frame)
        print("Exiting " + self.file_name)

In [44]:
threads = []
for file in files:
    thread = PreprocessingThread(file, mol_df)
    threads.append(thread)
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

Starting edrug3d.sdf
Starting qm9-1.sdf
Starting qm9-2.sdf
Starting qm9-3.sdf
Starting qm9-4.sdf
Starting qm9-5.sdf
Starting qm9-6.sdf
Starting qm9-7.sdf
Starting qm9-8.sdf


Exception in thread Thread-43:
Traceback (most recent call last):
  File "/home/caigh/miniconda3/envs/mm/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-49:
Traceback (most recent call last):
  File "/home/caigh/miniconda3/envs/mm/lib/python3.9/threading.py", line 973, in _bootstrap_inner
Exception in thread Thread-41:
Traceback (most recent call last):
  File "/home/caigh/miniconda3/envs/mm/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/tmp/ipykernel_928/2876586803.py", line 9, in run
Exception in thread Thread-47:
Traceback (most recent call last):
  File "/home/caigh/miniconda3/envs/mm/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/tmp/ipykernel_928/2876586803.py", line 9, in run
    self.run()
  File "/tmp/ipykernel_928/2876586803.py", line 9, in run
  File "/tmp/ipykernel_928/4002033763.py", line 43, in sdf_to_df
    self.run()
  File "/tmp/ipykernel_928/2876586803.py", li

In [45]:
mol_df.to_csv('./data/data.csv', index=False)