In [None]:
import pandas as pd
import numpy as np
import os
import re
import time
import threading
import subprocess

from tqdm.notebook import tqdm
from rdkit import Chem
from openbabel import pybel
from pqdm.processes import pqdm

# Scratch directory for the per-file antechamber working folders.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs('./temp', exist_ok=True)

In [None]:
# Input SDF files expected under ./data/: the eDrug3D set followed by the
# eight QM9 parts (qm9-1.sdf ... qm9-8.sdf).
files = ['edrug3d.sdf'] + [f'qm9-{part}.sdf' for part in range(1, 9)]


def check_missing_files():
    """Report whether every entry of ``files`` exists under ``./data/``.

    Returns:
        True when all expected files are present, False as soon as one is
        missing.
    """
    expected_paths = ('./data/' + name for name in files)
    return all(os.path.exists(path) for path in expected_paths)

In [None]:
# Download data
# The archive is only fetched when check_missing_files() reports that at least
# one expected input file is absent, so re-running this cell is cheap.
# NOTE(review): presumably the archive unpacks into ./data/ - confirm.

if not check_missing_files():
    # -nc: do not overwrite an existing data.zip; -O: save the download under that name.
    !wget -nc -O data.zip "https://hochschulebonnrheinsieg-my.sharepoint.com/:u:/g/personal/nico_piel_365h-brs_de1/ESuGOTn_IflEk7I5HkOFpbwBZKeOk9Qf2nL5JEcq2om6_Q?e=sHYsTk&download=1"
    # -u: only extract files that are new or newer than what is already on disk.
    !unzip -u data.zip
    # Remove the archive once its contents have been extracted.
    !rm data.zip

In [None]:
def ac_to_series(filename: str) -> pd.DataFrame:
    """Parse an antechamber ``.ac`` file into one row per atom.

    Every line containing ``ATOM`` contributes a row holding the atom name
    (whitespace-separated field 2), its atom type (field 9) and, for each
    element column, the number of bonds to a partner atom of that element.
    Bonds come from lines containing ``BOND``, whose fields 5 and 6 are the
    names of the two bonded atoms. A partner's element is its name with the
    trailing digits removed (``Cl1`` -> ``Cl``); anything outside the known
    element columns is counted under ``Other``.

    Args:
        filename: Path of the ``.ac`` file to read.

    Returns:
        A DataFrame with the columns ``name``, ``type``, ``C``, ``H``, ``N``,
        ``O``, ``S``, ``F``, ``Cl``, ``Br``, ``I`` and ``Other``. It is empty
        (but keeps these columns) when the file has no usable ATOM lines.

    Raises:
        OSError: If ``filename`` cannot be opened.
    """
    elements = ('C', 'H', 'N', 'O', 'S', 'F', 'Cl', 'Br', 'I')
    count_columns = [*elements, 'Other']
    columns = ['name', 'type', *count_columns]

    with open(filename) as file:
        lines = file.readlines()

    def element_of(atom_name):
        # 'Cl12' -> 'Cl'; a name without a known element prefix is 'Other'.
        symbol = re.split(r'\d+', atom_name)[0]
        return symbol if symbol in elements else 'Other'

    # Read every BOND line once and tally, per atom name, the elements of its
    # bonding partners. This replaces re-scanning all bonds for every atom.
    bond_counts = {}

    def tally(atom_name, partner_name):
        per_atom = bond_counts.setdefault(atom_name, dict.fromkeys(count_columns, 0))
        per_atom[element_of(partner_name)] += 1

    for line in lines:
        if 'BOND' not in line:
            continue
        b_split = line.split()
        if len(b_split) < 7:
            # Report and skip a malformed bond instead of losing every atom.
            print(f'Index out of range in {filename}, line: {line.rstrip()}')
            print(f'b_split only has {len(b_split)} elements.')
            continue
        first, second = b_split[5], b_split[6]
        tally(first, second)
        if second != first:
            tally(second, first)

    # Collect plain records and build the frame once; growing a DataFrame
    # row by row with pd.concat is quadratic.
    records = []
    for line in lines:
        if 'ATOM' not in line:
            continue
        a_split = line.split()
        if len(a_split) < 10:
            print(f'Index out of range in {filename}, line: {line.rstrip()}')
            print(f'a_split only has {len(a_split)} elements.')
            continue
        atom_name = a_split[2]
        record = {'name': atom_name, 'type': a_split[9]}
        record.update(bond_counts.get(atom_name, dict.fromkeys(count_columns, 0)))
        records.append(record)

    return pd.DataFrame(records, columns=columns)

In [None]:
def sdf_to_list(filename: str) -> list:
    """Read an sdf file and return its text split on the ``$$$$`` delimiter.

    The pieces are returned untouched, so each one keeps the surrounding
    newlines and the text after the final delimiter is the last element.
    """
    record_separator = r'$$$$'

    with open(filename, "rt") as handle:
        contents = handle.read()

    return contents.split(record_separator)

In [None]:
# Frames produced by sdf_to_df in THIS process. When sdf_to_df runs in a worker
# process, the append happens in the worker's copy of this list, so callers
# should rely on the function's return value instead.
df_list = []


def sdf_to_df(file_name: str) -> pd.DataFrame:
    """Run antechamber on every molecule of an sdf file and collect atom rows.

    Each molecule of ``./data/<file_name>`` is written to a scratch folder
    ``./temp/<file_name up to the first dot>/`` as ``mol.sdf``, converted to
    ``mol.ac`` by antechamber (gaff2 atom types) and parsed with
    ``ac_to_series``. Molecules for which antechamber produces no ``mol.ac``
    are reported and skipped.

    Args:
        file_name: Name of the sdf file inside ``./data/``.

    Returns:
        The concatenated atom rows of all converted molecules; an empty
        DataFrame when none could be converted. The same frame is also
        appended to the module-level ``df_list``.

    Raises:
        OSError: If the input cannot be read, the scratch folder cannot be
            written, or the ``antechamber`` executable cannot be started.
    """
    work_dir = os.path.join('./temp', file_name.split('.')[0])
    os.makedirs(work_dir, exist_ok=True)
    sdf_path = os.path.join(work_dir, 'mol.sdf')
    ac_path = os.path.join(work_dir, 'mol.ac')

    # The fourth line of a well-formed mol block starts with integer fields.
    counts_line = re.compile(r'\s*\d+\s*\d+\s*\d+\s*\d+\s*')
    frames = []

    for mol in sdf_to_list('./data/' + file_name):
        # Strip leading new lines
        mol = mol.lstrip()

        # Split on new lines to correct for mistakes made by splitting the sdf molecules
        mol_lines = mol.split('\n')

        if len(mol_lines) <= 5:
            # Too short to be a molecule, e.g. the text after the last delimiter.
            print('end')
            continue

        # Insert a new line if line 4 isn't in the correct place
        if counts_line.match(mol_lines[3]) is None:
            mol = '\n' + mol

        # Write molecule to a file so antechamber can read it
        with open(sdf_path, 'w') as file:
            file.write(mol)

        # Drop the previous molecule's result first: if antechamber fails now,
        # a stale mol.ac would otherwise be parsed again and duplicate rows.
        if os.path.exists(ac_path):
            os.remove(ac_path)

        # Argument list + cwd instead of a shell string, so the file-derived
        # folder name is never interpreted by a shell. Output is discarded.
        subprocess.run(
            ['antechamber', '-i', 'mol.sdf', '-fi', 'mdl',
             '-o', 'mol.ac', '-fo', 'ac', '-at', 'gaff2', '-pf', 'y'],
            cwd=work_dir,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )

        if not os.path.exists(ac_path):
            print(f'antechamber produced no mol.ac for a molecule in {file_name}; skipping it.')
            continue

        # Convert ac atom types to a new dataframe
        ac_df = ac_to_series(ac_path)

        # Tolerate a parser that yields nothing for this molecule.
        if ac_df is not None and not ac_df.empty:
            frames.append(ac_df)

    # Clean up remaining files; each one independently, so a missing mol.ac
    # does not leave mol.sdf behind.
    for leftover in (ac_path, sdf_path):
        try:
            os.remove(leftover)
        except FileNotFoundError:
            pass  # nothing to clean up
        except OSError as err:
            print(f'Something went wrong. Could not remove {leftover}: {err}')

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Attach completed dataframe to the output frame.
    df_list.append(df)

    return df

In [None]:
class PreprocessingThread(threading.Thread):
    """Thread that runs ``sdf_to_df`` for a single sdf file."""

    def __init__(self, file_name):
        """Remember which file this thread is going to process."""
        super().__init__()
        self.file_name = file_name

    def run(self):
        """Process the file, printing a message before and after."""
        target = self.file_name
        print("Starting " + target)
        sdf_to_df(target)
        print("Exiting " + self.file_name)

In [None]:
threads = []  # only needed for the thread-based alternative (PreprocessingThread)

# pqdm is imported from pqdm.processes, so sdf_to_df runs in worker processes.
# Whatever a worker appends to the module-level df_list stays in that worker,
# which is why the frames are taken from pqdm's return value here.
results = pqdm(files, sdf_to_df, n_jobs=len(files))

# A job that did not hand back a DataFrame is reported rather than passed on
# to pd.concat. NOTE(review): with pqdm's default settings a failed job is
# presumably returned as its exception object - confirm.
for result in results:
    if not isinstance(result, pd.DataFrame):
        print(f'A preprocessing job did not return a DataFrame: {result!r}')

df_list = [result for result in results if isinstance(result, pd.DataFrame)]

if not df_list:
    raise RuntimeError('No input file could be processed; see the messages above.')

# Concatenate dfs in the list
mol_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Persist the combined atom table next to the input data; index=False keeps
# the row index out of the CSV.
mol_df.to_csv('./data/data.csv', index=False)