In [24]:
import pandas as pd
import numpy as np
import os
import re
import time
import threading
import subprocess

from rdkit import Chem
from openbabel import pybel

# Scratch directory for the per-file working folders created during preprocessing.
# makedirs(..., exist_ok=True) avoids the check-then-create race of
# "exists() followed by mkdir()" and is a no-op when the directory is already there.
os.makedirs('./temp', exist_ok=True)

In [25]:
# Names of the input SDF files: edrug3d.sdf followed by qm9-1.sdf ... qm9-8.sdf.
files = ['edrug3d.sdf'] + [f'qm9-{part}.sdf' for part in range(1, 9)]


def check_missing_files(file_names=None, data_dir='./data'):
    """Report whether every expected data file is present.

    Despite the name, the result is True when NOTHING is missing.

    Args:
        file_names: Iterable of file names to look for. When omitted, the
            notebook-level ``files`` list is used.
        data_dir: Directory the files are expected in. Defaults to ``./data``.

    Returns:
        True if every file exists inside ``data_dir`` (also for an empty
        list of names), False as soon as one file is missing.
    """
    if file_names is None:
        file_names = files

    return all(
        os.path.exists(os.path.join(data_dir, name)) for name in file_names
    )

In [26]:
# Download data
# Runs only when check_missing_files() reports that at least one expected
# file is absent. The commands are IPython shell escapes ("!"), so this cell
# needs wget, unzip and rm to be available on the PATH.
# NOTE(review): presumably the archive unpacks into ./data, which is where
# check_missing_files() looks -- confirm against the archive layout.

if not check_missing_files():
    # Fetch the archive as data.zip; -nc keeps an already existing data.zip.
    !wget -nc -O data.zip "https://hochschulebonnrheinsieg-my.sharepoint.com/:u:/g/personal/nico_piel_365h-brs_de1/ESuGOTn_IflEk7I5HkOFpbwBZKeOk9Qf2nL5JEcq2om6_Q?e=BuUnkT&download=1"
    # Extract; -u only creates new files or replaces ones older than the archive copy.
    !unzip -u data.zip
    # Remove the archive once its contents have been extracted.
    !rm data.zip

In [27]:
def ac_to_series(smiles: str, filename: str) -> pd.DataFrame:
    """Collect the atom types of an antechamber ``.ac`` file into a one-row frame.

    Args:
        smiles: SMILES string to store in the ``smiles`` column.
        filename: Path of the ``.ac`` file to read.

    Returns:
        A single-row DataFrame with the columns ``smiles`` and ``types``.
        ``types`` holds the last field of every line containing ``ATOM``,
        in file order, joined with ``|`` (an empty string if there is no
        such line).

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    with open(filename) as file:
        # The atom type is the last whitespace-separated field of an ATOM line.
        # Taking the last token instead of the last three characters keeps
        # types longer than two characters intact and does not depend on the
        # line ending in exactly one newline with no trailing blanks.
        atom_types = [line.split()[-1] for line in file if 'ATOM' in line]

    return pd.DataFrame.from_dict(
        {'smiles': [smiles], 'types': ['|'.join(atom_types)]}
    )

In [28]:
def sdf_to_list(filename: str) -> list:
    """Read an sdf file and return its text split at every ``$$$$`` delimiter.

    The pieces are returned exactly as they appear between the delimiters,
    so they keep their surrounding newlines and the text after the last
    delimiter is included as the final element.
    """
    record_delimiter = '$$$$'

    with open(filename, mode='r') as sdf_handle:
        contents = sdf_handle.read()

    return contents.split(record_delimiter)

In [29]:
# Empty frame with the two result columns; it is handed to every worker thread
# and later rebound to the concatenated per-file results.
mol_df = pd.DataFrame(data=None, columns=['smiles', 'types'])

In [30]:
df_list = []  # One result frame per processed file; sdf_to_df appends to it.

# Pattern for the counts line of an SDF record (expected on the fourth line).
# Compiled once here instead of once per molecule inside the loop.
_COUNTS_LINE = re.compile(r'\s*\d+\s*\d+\s*\d+\s*\d+\s*')


def sdf_to_df(file_name: str, data_frame: pd.DataFrame) -> pd.DataFrame:
    """Convert every molecule of an SDF file into a SMILES / atom-type row.

    Each molecule is written to ``./temp/<name>/mol.sdf``, typed with
    antechamber (GAFF2 atom types, ``.ac`` output) and converted to SMILES
    with Open Babel. The resulting frame is appended to the module-level
    ``df_list`` and also returned.

    Args:
        file_name: Name of the SDF file inside ``./data``. The part before
            the first ``.`` names the working directory under ``./temp``.
        data_frame: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame with the columns ``smiles`` and ``types``, one row per
        molecule for which antechamber produced an ``.ac`` file.

    Raises:
        FileNotFoundError: If the input file is missing or the
            ``antechamber`` executable cannot be found.
    """
    records = sdf_to_list('./data/' + file_name)

    work_dir = './temp/' + file_name.split('.')[0] + '/'
    os.makedirs(work_dir, exist_ok=True)
    sdf_path = work_dir + 'mol.sdf'
    ac_path = work_dir + 'mol.ac'

    # Rows are collected here and concatenated once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    frames = []

    for mol in records:
        # Strip leading new lines
        mol = mol.lstrip()

        # Split on new lines to correct for mistakes made by splitting the sdf molecules
        mol_lines = mol.split('\n')

        if len(mol_lines) <= 5:
            # Too short to be a molecule, e.g. the text after the last delimiter.
            print('end')
            continue

        # lstrip() also removes an empty title line, which shifts the counts
        # line up by one; put the blank line back in that case.
        if _COUNTS_LINE.match(mol_lines[3]) is None:
            mol = '\n' + mol

        # Write molecule to a file so antechamber can read it
        with open(sdf_path, 'w') as file:
            file.write(mol)

        # Create an openbabel molecule
        py_mol = pybel.readstring('sdf', mol)

        # Drop the previous molecule's result first. Otherwise a failed
        # antechamber run would leave the old mol.ac in place and its atom
        # types would silently be attached to the current molecule.
        if os.path.exists(ac_path):
            os.remove(ac_path)

        # Run antechamber inside the working directory. An argument list with
        # cwd= avoids building a shell command from the path; the tool's
        # console output is discarded.
        subprocess.run(
            [
                'antechamber',
                '-i', 'mol.sdf', '-fi', 'mdl',
                '-o', 'mol.ac', '-fo', 'ac',
                '-at', 'gaff2',
                '-pf', 'y',
            ],
            cwd=work_dir,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        if not os.path.exists(ac_path):
            print(f'antechamber produced no output for a molecule in {file_name}; skipping it.')
            continue

        # Write molecule to a SMILES string. Open Babel appends the molecule
        # name after whitespace, so keep only the first token.
        smiles_only = re.split(r'\s+', py_mol.write('smi'))[0]

        # Convert ac atom types to a one-row dataframe
        frames.append(ac_to_series(smiles_only, ac_path))

    # Clean up remaining files. Each file is handled on its own so that a
    # missing mol.ac does not prevent mol.sdf from being removed.
    for leftover in (ac_path, sdf_path):
        try:
            os.remove(leftover)
        except FileNotFoundError:
            pass  # nothing to clean up
        except OSError:
            print('Something went wrong.')

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=['smiles', 'types'])

    # Attach completed dataframe to the output list.
    df_list.append(df)

    return df

In [31]:
class PreprocessingThread(threading.Thread):
    """Thread that runs ``sdf_to_df`` for a single input file."""

    def __init__(self, file_name, data_frame):
        """Remember the file to process and the frame passed on to ``sdf_to_df``."""
        super().__init__()
        self.data_frame = data_frame
        self.file_name = file_name

    def run(self):
        """Process the file, printing a line before and after the work."""
        start_message = "Starting " + self.file_name
        print(start_message)

        sdf_to_df(self.file_name, self.data_frame)

        end_message = "Exiting " + self.file_name
        print(end_message)

In [None]:
# Start from an empty accumulator: df_list lives at notebook level, so without
# this a second run of the cell would concatenate every file's rows twice.
df_list.clear()

# Create a thread for each file; the list lets us start and join them below.
threads = [PreprocessingThread(file, mol_df) for file in files]

# Start threads
for thread in threads:
    thread.start()
# Wait for them to finish
for thread in threads:
    thread.join()

# A worker that raised never appends its frame, so make the gap visible
# instead of silently writing an incomplete data set.
if len(df_list) != len(threads):
    print(f'Warning: only {len(df_list)} of {len(threads)} files produced a result.')

# Concatenate dfs in the list. Frames are appended as the workers finish, so
# the order of the files within mol_df can differ between runs.
mol_df = pd.concat(df_list, ignore_index=True)

Starting edrug3d.sdf
Starting qm9-1.sdf
Starting qm9-2.sdf
Starting qm9-3.sdf
Starting qm9-4.sdf
Starting qm9-5.sdf
Starting qm9-6.sdf
Starting qm9-7.sdf
Starting qm9-8.sdf


In [None]:
# Persist the combined SMILES / atom-type table next to the input data,
# leaving out the row index.
mol_df.to_csv(path_or_buf='./data/data.csv', index=False)