In [2]:
import pandas as pd
import numpy as np
import os
import re
import time
import threading
import subprocess

from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors, Descriptors3D, ChemicalFeatures, GraphDescriptors, Lipinski, rdchem
from rdkit.Chem.rdchem import Mol, Atom, Bond
from rdkit.Chem.rdmolfiles import SDMolSupplier, SDWriter
from openbabel import pybel

if not os.path.exists('./temp'):
    os.mkdir('./temp')

In [3]:
def in_ipython():
    try:
        return __IPYTHON__
    except NameError:
        return False

In [4]:
files = [
    'edrug3d.sdf',
    'qm9-1.sdf',
    'qm9-2.sdf',
    'qm9-3.sdf',
    'qm9-4.sdf',
    'qm9-5.sdf',
    'qm9-6.sdf',
    'qm9-7.sdf',
    'qm9-8.sdf'
]


def check_missing_files():
    """Checks for missing files. Returns true, if all files are present."""
    for file in files:
        if not os.path.exists('./data/' + file):
            return False

    return True

In [5]:
# Download data

if not check_missing_files():
    !wget -nc -O data.zip "https://hochschulebonnrheinsieg-my.sharepoint.com/:u:/g/personal/nico_piel_365h-brs_de1/ESuGOTn_IflEk7I5HkOFpbwBZKeOk9Qf2nL5JEcq2om6_Q?e=sHYsTk&download=1"
    !unzip -u data.zip
    !rm data.zip

In [7]:
for file_name in tqdm(files):
    mols = SDMolSupplier('./data/' + file_name)

    file_name_split = file_name.split('.')[0]

    if not os.path.exists('./temp/' + file_name_split):
        os.mkdir('./temp/' + file_name_split)

    new_path = './temp/' + file_name_split + '/'

    sdwriter = SDWriter(new_path + 'mol.sdf')

    for mol in tqdm(mols):
        if mol is None:
            continue

        rdInfo = {}

        rdInfo['hvyAtCnt'] = Lipinski.HeavyAtomCount(mol)
        rdInfo['nhohC'] = Lipinski.NHOHCount(mol)
        rdInfo['noC'] = Lipinski.NOCount(mol)
        rdInfo['aliCC'] = Lipinski.NumAliphaticCarbocycles(mol)
        rdInfo['aliHC'] = Lipinski.NumAliphaticHeterocycles(mol)
        rdInfo['aliR'] = Lipinski.NumAliphaticRings(mol)
        rdInfo['aroCC'] = Lipinski.NumAromaticCarbocycles(mol)
        rdInfo['aroR'] = Lipinski.NumAromaticRings(mol)
        rdInfo['HA'] = Lipinski.NumHAcceptors(mol)
        rdInfo['HD'] = Lipinski.NumHDonors(mol)
        rdInfo['HAt'] = Lipinski.NumHeteroatoms(mol)
        rdInfo['rotatableBonds'] = Lipinski.NumRotatableBonds(mol)
        rdInfo['satCC'] = Lipinski.NumSaturatedCarbocycles(mol)
        rdInfo['satHC'] = Lipinski.NumSaturatedHeterocycles(mol)
        rdInfo['satR'] = Lipinski.NumSaturatedRings(mol)
        rdInfo['rC'] = Lipinski.RingCount(mol)
        rdInfo['molWt'] = Descriptors.MolWt(mol)
        rdInfo['hvyAtMolWt'] = Descriptors.HeavyAtomMolWt(mol)
        rdInfo['molMaxPC'] = Descriptors.MaxPartialCharge(mol)
        rdInfo['molMinPC'] = Descriptors.MinPartialCharge(mol)
        rdInfo['molRadElec'] = Descriptors.NumRadicalElectrons(mol)
        rdInfo['molValElec'] = Descriptors.NumValenceElectrons(mol)
        rdInfo['asph'] = Descriptors3D.Asphericity(mol)
        rdInfo['ecc'] = Descriptors3D.Eccentricity(mol)
        rdInfo['isf'] = Descriptors3D.InertialShapeFactor(mol)
        rdInfo['npr1'] = Descriptors3D.NPR1(mol)
        rdInfo['npr2'] = Descriptors3D.NPR2(mol)
        rdInfo['pmi'] = Descriptors3D.PMI1(mol)
        rdInfo['rog'] = Descriptors3D.RadiusOfGyration(mol)
        rdInfo['j'] = GraphDescriptors.BalabanJ(mol)
        rdInfo['ct'] = GraphDescriptors.BertzCT(mol)

        sdwriter.write(mol)

        # Run antechamber and divert output to a file (temporary)
        subprocess.getoutput(
            f"cd {new_path} && antechamber -i mol.sdf -fi mdl -o mol.ac -fo ac -at gaff2 -pf y"
        )

        with open(new_path + 'mol.ac') as file:
            lines = [a for a in file.readlines() if 'ATOM' in a]

            for atom in tqdm(Mol.GetAtoms(mol)):
                out = {}

                idx = Atom.GetIdx(atom)
                
                line = lines[idx]

                a_split = re.compile(r'\s+').split(line.strip())
                atom_name = a_split[2]
                if len(a_split) > 9:
                    atom_type = a_split[9]
                else:
                    atom_type = a_split[8]

                out['type'] = atom_type

                print(f'RDKit atom: {Atom.GetSymbol(atom)}')
                print(f'antechamber atom: {atom_name}')

                bonds = {
                    'C': 0,
                    'H': 0,
                    'N': 0,
                    'O': 0,
                    'S': 0,
                    'F': 0,
                    'Cl': 0,
                    'Br': 0,
                    'I': 0,
                    'Other': 0
                }

                for bond in tqdm(Atom.GetBonds(atom)):
                    bond_name = Atom.GetSymbol(Bond.GetEndAtom(bond))

                    if bond_name in bonds.keys():
                        bonds[bond_name] += 1
                    else:
                        bonds['Other'] += 1

                out.update(bonds)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/2028 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

S


  0%|          | 0/4 [00:00<?, ?it/s]

Begin atom: S
End atom: O
Begin atom: S
End atom: O
Begin atom: S
End atom: N
Begin atom: S
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: S
End atom: O
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: S
End atom: O
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: S
End atom: N
Begin atom: N
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: N
End atom: C
N


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: S
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: N
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C


  0%|          | 0/95 [00:00<?, ?it/s]

C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: N
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: C
C


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: O
Begin atom: C
End atom: N
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
N


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
Begin atom: N
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: N
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: N
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: N
N


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: C
End atom: N
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: N
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: O
Begin atom: C
End atom: C
N


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: N
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: N
Begin atom: C
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: N
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: N
Begin atom: N
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


  0%|          | 0/3 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: C
End atom: C
Begin atom: C
End atom: O
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
O


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: C
End atom: O


  0%|          | 0/8 [00:00<?, ?it/s]

N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: N
End atom: C
N


  0%|          | 0/1 [00:00<?, ?it/s]

Begin atom: N
End atom: C
N


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: N
End atom: C
Begin atom: N
End atom: C
C


  0%|          | 0/2 [00:00<?, ?it/s]

Begin atom: C
End atom: C
Begin atom: C
End atom: C
C


KeyboardInterrupt: 