In [1]:
import pandas as pd
import numpy as np
import os
import re
import subprocess
from glob import glob

from tqdm.notebook import tqdm
from rdkit import Chem
from rdkit.Chem import Descriptors, Descriptors3D, ChemicalFeatures, GraphDescriptors, Lipinski, rdchem
from rdkit.Chem.rdchem import Mol, Atom, Bond
from rdkit.rdBase import BlockLogs
import datamol as dm
import multiprocessing
from random import randrange
from sklearn.preprocessing import LabelEncoder

temp_path = ''

try:
    temp_path = os.environ['TMPDIR']
except KeyError:
    temp_path = './temp'

print(temp_path)

data_path = './data'
graph_data_path = './graph_data'

if not os.path.exists(temp_path):
    os.mkdir(temp_path)

if not os.path.exists(data_path):
    os.mkdir(data_path)

if not os.path.exists(graph_data_path):
    os.mkdir(graph_data_path)

./temp


In [2]:
def check_missing_files(self) -> None:
    """
    Checks for missing .xyz QM9 files

    Return
        True | False: True, if all files are accounted for
    """
    for file in tqdm(xyz_filepath_list):
        if not os.path.exists(file):
            print('Retrieving QM9..')
            !wget https://figshare.com/ndownloader/files/3195389 -O ./data/data.bz2
            !tar -xfu ./data/data.bz2
            !rm ./data/*.bz2

    if not os.path.exists(f'{data_path}/edrug3d.sdf'):
        print('Retrieving edrug3d..')
        !wget https://chemoinfo.ipmc.cnrs.fr/TMP/tmp.33880/e-Drug3D_2056.sdf -O ./data/edrug3d.sdf

    return None

In [3]:
def in_ipython():
    try:
        return __IPYTHON__
    except NameError:
        return False

In [4]:
def init():
    global index
    index = 0

    global line_split_re
    line_split_re = re.compile(r'\s+')

    manager = multiprocessing.Manager()

    global xyz_filepath_list
    xyz_filepath_list = list(glob(f'{data_path}/*.xyz'))

    xyz_filepath_list.sort()
    print('total xyz filepath # ', len(xyz_filepath_list))
    xyz_filepath_list[0]

    global molecules
    molecules = manager.list()
    global graphs_list
    graphs_list = manager.list()
    global nodes_list
    nodes_list = manager.list()
    global bonds_list
    bonds_list = manager.list()

In [5]:
def xyz2mol(m):
    """Convert the SMILES string inside QM9 .xyz files to an RDKit molecule."""
    blockedLogs = BlockLogs()

    with open(m) as file:
        lines = file.readlines()

        smiles = lines[len(lines) - 2].split()[0]
        smiles = dm.standardize_smiles(smiles)

        mol = dm.to_mol(smiles, add_hs=True, sanitize=True, ordered=True)
        mol = dm.fix_mol(mol)
        dm.align.compute_2d_coords(mol)
        # dm.conformers.generate(mol)

        global molecules
        molecules.append(mol)


def e3d2mol():
    """Convert edrug3d to RDKit molecules."""
    blockedLogs = BlockLogs()

    print('Converting edrug3d to mols..')

    mols = dm.read_sdf(f'{data_path}/edrug3d.sdf', remove_hs=False)
    global molecules
    molecules.extend(mols)

    print('Done!')

In [6]:
def preprocess_mol(mol: rdchem.Mol):
    global index
    index += 1

    mol = Chem.AddHs(mol)
    mol.Compute2DCoords()

    mol_nodes_df = None
    mol_bonds_df = None

    Chem.MolToPDBFile(mol, f'{temp_path}/{index}_mol.pdb')

    # Run antechamber and divert output to a file (temporary)
    subprocess.getoutput(
        f'cd {temp_path} && antechamber -i {index}_mol.pdb -fi pdb -o {index}_mol.ac -fo ac -at gaff2 -pf y'
    )

    # Compute molecule-level info
    graphs_args = {
        'graph_id': index
    }

    mol.ComputeGasteigerCharges()

    mol_graph_df = pd.DataFrame(graphs_args, index=[0])

    with open(f'{temp_path}/{index}_mol.ac') as file:
        lines = [a for a in file.readlines() if 'ATOM' in a]

        for atom in mol.GetAtoms():
            atom: rdchem.Atom

            atom_idx = atom.GetIdx()

            line = lines[atom_idx]

            a_split = line_split_re.split(line.strip())

            if len(a_split) > 9:
                atom_type = a_split[9]
            else:
                atom_type = a_split[8]

            atom_args = {
                'graph_id': index,
                'node_id': atom_idx,
                'type': atom_type,
                'label': atom.GetSymbol(),
                'depDeg': atom.GetDegree(),
                'forC': atom.GetProp('_GasteigerCharge'),
                'isA': float(atom.GetIsAromatic()),
                'totDeg': atom.GetTotalDegree(),
                'totH': atom.GetTotalNumHs(),
                'totV': atom.GetTotalValence(),
                'isR': float(atom.IsInRing()),
            }

            # Append molecule info to every atom
            atom_df = pd.DataFrame(atom_args, index=[0])

            if mol_nodes_df is None:
                mol_nodes_df = pd.DataFrame(columns=atom_df.columns)

            mol_nodes_df = pd.concat([mol_nodes_df, atom_df], ignore_index=True)

    for bond in mol.GetBonds():
        bond: rdchem.Bond

        bond_args = {
            'graph_id': index,
            'src_id': bond.GetBeginAtomIdx(),
            'dst_id': bond.GetEndAtomIdx(),
            # 'label': bond.GetBondType(),
            'bond_type': bond.GetBondTypeAsDouble()
        }

        atom_bonds_df = pd.DataFrame(bond_args, index=[0])

        if mol_bonds_df is None:
            mol_bonds_df = pd.DataFrame(columns=atom_bonds_df.columns)

        mol_bonds_df = pd.concat([mol_bonds_df, atom_bonds_df], ignore_index=True)

    # print(mol_bonds_df)
    # print(mol_nodes_df)

    global graphs_list
    global nodes_list
    global bonds_list

    graphs_list.append(mol_graph_df)
    nodes_list.append(mol_nodes_df)
    bonds_list.append(mol_bonds_df)

    # Clean up remaining files
    try:
        os.remove(f'{temp_path}/{index}_mol.ac')
        os.remove(f'{temp_path}/{index}_mol.pdb')
    except IOError as e:
        print(e)
        print('Something went wrong.')

In [7]:
def preprocess():
    global molecules
    global graphs_list
    global nodes_list
    global bonds_list

    init()

    """Do preprocessing."""
    print('Checking missing files..')

    # check_missing_files()

    print('Converting .xyz to mols..')

    dm.parallelized(
        xyz2mol,
        xyz_filepath_list,
        n_jobs=-1,
        progress=True,
        arg_type='arg',
        total=len(xyz_filepath_list)
    )

    print('Done!')

    print('Starting preprocessing..')

    dm.parallelized(
        preprocess_mol,
        molecules,
        n_jobs=1,
        progress=True,
        arg_type='arg',
        total=len(molecules)
    )

    graphs_df = pd.concat(graphs_list, ignore_index=True)
    nodes_df = pd.concat(nodes_list, ignore_index=True)
    bonds_df = pd.concat(bonds_list, ignore_index=True)

    graphs_df.to_csv(f'{graph_data_path}/graphs.csv', index=False)
    nodes_df.to_csv(f'{graph_data_path}/nodes_raw.csv', index=False)
    bonds_df.to_csv(f'{graph_data_path}/bonds.csv', index=False)

In [8]:
def main():
    preprocess()

    result_df = pd.read_csv('./graph_data/nodes_raw.csv')

    encoder = LabelEncoder()
    result_df['type'] = encoder.fit_transform(result_df['type'].to_numpy())
    result_df['label'] = encoder.fit_transform(result_df['label'].to_numpy())
    result_df.to_csv(f'{graph_data_path}/nodes.csv', index=False)

In [9]:
if __name__ == "__main__":
    if not in_ipython():
        root_dir = os.path.dirname(os.path.realpath(__file__))

        main()
    else:
        main()

total xyz filepath #  133885
Checking missing files..
Converting .xyz to mols..


  0%|          | 0/133885 [00:00<?, ?it/s]

Done!
Starting preprocessing..


  0%|          | 0/133885 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: './temp/1_mol.ac'