In [None]:
import pickle
import os
import os.path as osp
from rdkit import Chem
from rdkit.Chem import AllChem 
from rdkit.Chem.rdMolAlign import CalcRMS
import numpy as np
from rdkit.Chem import rdMMPA
import sys
sys.path.append('/home/haotian/molecules_confs/Protein_test/SurfGen')

from utils.chem import remove_dummys_mol, mol_with_atom_index, transfer_coord, linkerize_mol, fragmentize_mol
from utils.chem import check_frags, check_frag, Murcko_decompose, combine_mols

def read_sdf(sdf_file, sanitize=False):
    """Load every record of an SDF file into a list.

    Args:
        sdf_file: Path handed to ``Chem.SDMolSupplier``.
        sanitize: Forwarded to the supplier's ``sanitize`` option
            (defaults to ``False``).

    Returns:
        A list holding each item yielded by the supplier, in file order.
    """
    supplier = Chem.SDMolSupplier(sdf_file, sanitize=sanitize)
    molecules = []
    # NOTE(review): RDKit suppliers are documented to yield None for records
    # they cannot parse; such entries are kept as-is here — confirm callers
    # that index [0] can cope with that.
    for entry in supplier:
        molecules.append(entry)
    return molecules
    
def qsmi(smi):
    """Shorthand for ``Chem.MolFromSmiles``: parse a SMILES string and return the result."""
    parsed = Chem.MolFromSmiles(smi)
    return parsed

from rdkit.Chem import rdmolfiles
def write_sdf(mol_list,file):
    """Write a sequence of molecules to a single SDF file without kekulizing them.

    Args:
        mol_list: Iterable of molecules; each one is passed to ``SDWriter.write``.
        file: Destination passed to ``Chem.SDWriter``.

    Raises:
        Whatever ``SDWriter.write`` raises for an invalid entry. The writer is
        closed before the exception propagates.
    """
    writer = Chem.SDWriter(file)
    try:
        # Keep the bond representation as stored instead of kekulizing on output.
        writer.SetKekulize(False)
        for mol in mol_list:
            writer.write(mol)
    finally:
        # Always release the file handle, even if one molecule fails to write;
        # otherwise the handle leaks and the output may stay unflushed.
        writer.close()

def read_pkl(file):
    """Unpickle and return the object stored in ``file``.

    Args:
        file: Path of the pickle file; opened in binary read mode.

    Returns:
        The object produced by ``pickle.load``.
    """
    # Security note: unpickling can execute arbitrary code, so only use this
    # on files from a trusted source.
    with open(file, 'rb') as handle:
        return pickle.load(handle)
def write_pkl(list,file):
    """Pickle an object to ``file`` and print a confirmation message.

    Args:
        list: Object to serialize. The name shadows the builtin but is kept
            because it is part of the function's signature.
        file: Destination path; opened in binary write mode (overwrites).
    """
    with open(file, 'wb') as handle:
        pickle.dump(list, handle)
        # Reported only after the dump has succeeded.
        message = 'pkl file saved at {}'.format(file)
        print(message)

In [None]:
import torch
from copy import deepcopy
import shutil
from tqdm.auto import tqdm
from utils.transforms import *
from utils.misc import load_config
from utils.reconstruct import *
from utils.datasets.surfdata import SurfGenDataset
from models.surfgen import SurfGen
from utils.protein_ligand import read_ply, parse_rdmol
from utils.sample import get_init, get_next, logp_to_rank_prob
from utils.sample import STATUS_FINISHED, STATUS_RUNNING
from utils.data import torchify_dict

def pdb_to_pocket_data(ply_file, keep_frag):
    '''
    Build a ProteinLigandData object from a surface file and a fragment SDF.

    ply_file:  path passed to read_ply; its result is used as the protein dict.
    keep_frag: path of an SDF file; only its first molecule is used. That
               molecule is sanitized and parsed with parse_rdmol to form the
               ligand dict.

    Returns the object created by ProteinLigandData.from_protein_ligand_dicts
    from the two torchified dicts.

    NOTE(review): the original note here said the SDF is used "as the center" —
    presumably the pocket center for generation; confirm against the sampler.
    '''
    surface_dict = read_ply(ply_file)
    fragment = read_sdf(keep_frag)[0]
    # read_sdf does not sanitize by default, so do it before parsing.
    Chem.SanitizeMol(fragment)
    fragment_dict = parse_rdmol(fragment)
    return ProteinLigandData.from_protein_ligand_dicts(
        protein_dict=torchify_dict(surface_dict),
        ligand_dict=torchify_dict(fragment_dict),
    )

 ### Extract scaffold for crossdock molecules
 To submit a scaffold-hopping task to Delete, you must at least mark the scaffold in the molecules first (or provide user-defined structures).
 For simplicity, we extract the scaffolds from the BindingMOAD test split to illustrate the data-preparation step.

In [17]:
from rdkit import RDLogger
# Import Draw explicitly: `from rdkit import Chem` alone does not load the
# `Chem.Draw` submodule, so `Chem.Draw.MolToFile` only works if some other
# module happened to import it first.
from rdkit.Chem import Draw

# Scaffold-hopping data preparation.
# For every ligand of the test split, split it with Murcko_decompose into a
# scaffold and side chains. The side chains are the fragment that is KEPT
# (written to frag_fragments/), the scaffold is the part to be regenerated.

# Silence everything below CRITICAL from RDKit.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

draw2d = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/draw2d'
# split definition; only split['test'] is used below
split = torch.load('/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_filter_split_by_name.pt')
# directory containing the ligand files
ligand_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'
# directory containing the surface files
surface_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_surface_8'
# output directory for everything produced by this cell
transform_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds'
index = []
all_mols = []
kept_frags = []
growing_frags = []
# ligands dropped because sanitization or decomposition raised
skipped_ligands = []
os.makedirs(osp.join(transform_path,'frag_surface'), exist_ok=True)
os.makedirs(osp.join(transform_path,'frag_mols'),exist_ok=True)
os.makedirs(osp.join(transform_path,'frag_fragments'),exist_ok=True)
# Create exactly the directory the drawings are written to.
os.makedirs(draw2d,exist_ok=True)

for surf_nm, ligand_nm in split['test']:
    ligand_file = osp.join(ligand_path, ligand_nm)
    surface_file = osp.join(surface_path,surf_nm)
    # base name of the ligand file up to its first '.'
    file_name = ligand_nm.split('/')[-1].split('.')[0]
    mol = read_sdf(ligand_file)[0]
    try:
        # Sanitization is inside the try so that one unparsable/invalid ligand
        # is skipped instead of aborting the whole preparation run.
        Chem.SanitizeMol(mol)
        scaffold, side_chains = Murcko_decompose(mol)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        skipped_ligands.append(ligand_nm)
        continue
    if len(side_chains) == 0:
        # nothing to keep: the ligand has no side chains
        continue
    all_mols.append(mol)
    # presumably merges the individual side chains into one molecule object
    side_chains = combine_mols(side_chains)
    kept_frags.append(side_chains)
    growing_frags.append(scaffold)

    shutil.copy(surface_file,osp.join(transform_path,'frag_surface'))
    write_sdf([mol],osp.join(transform_path,'frag_mols',f'{file_name}.sdf'))
    write_sdf([side_chains],osp.join(transform_path,'frag_fragments',f'{file_name}.sdf'))
    Draw.MolToFile(mol, osp.join(draw2d,f'{file_name}.png'))
    Draw.MolToFile(side_chains, osp.join(draw2d,f'{file_name}_kept.png'))

    index.append((surf_nm, ligand_nm,f'{file_name}.sdf'))

if skipped_ligands:
    print('skipped {} ligands that failed sanitization or Murcko decomposition'.format(len(skipped_ligands)))

write_sdf(all_mols,osp.join(transform_path,'frag_mols.sdf'))
write_sdf(kept_frags,osp.join(transform_path,'frag_fragments.sdf'))
write_pkl(index, osp.join(transform_path,'frag_index.pkl'))


### make all the test data for further generation
all_data = []
for surf_fn, lig_fn, frag_fn in index:
    surf_file = osp.join(surface_path, surf_fn)
    frag_file = osp.join(transform_path,'frag_fragments',frag_fn)
    try:
        data = pdb_to_pocket_data(surf_file,frag_file)
        # remember which files this entry came from
        data.surf_fn = surf_fn
        data.lig_fn = lig_fn
        data.frag_fn = frag_fn
        all_data.append(data)
    except Exception:
        # best effort: report the fragment that could not be converted and go on
        print('failed parse rdmol {}'.format(frag_file))
        continue
write_pkl(all_data, osp.join(transform_path,'index_data.pkl'))
print('Merko Scaffold',len(all_data))

pkl file saved at /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_index.pkl
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/3eks_1.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/3eku_1.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/2x2r_5.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/2i3i_0.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/2x2r_1.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/2x2r_3.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaffolds/frag_fragments/2x7d_1.sdf
failed parse rdmol /home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_scaf

In [None]:
from rdkit import RDLogger
# Import Draw explicitly: `from rdkit import Chem` alone does not load the
# `Chem.Draw` submodule, so `Chem.Draw.MolToFile` only works if some other
# module happened to import it first.
from rdkit.Chem import Draw

# Side-chain data preparation.
# For every ligand of the test split, split it with Murcko_decompose into a
# scaffold and side chains. Here the scaffold is the fragment that is KEPT
# (written to frag_fragments/), and the side chains are the part to be grown.

# Silence everything below CRITICAL from RDKit.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

draw2d = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_sidechains/draw2d'
# split definition; only split['test'] is used below
split = torch.load('/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_filter_split_by_name.pt')
# directory containing the ligand files
ligand_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'
# directory containing the surface files
surface_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_surface_8'
# output directory for everything produced by this cell
transform_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_sidechains'
index = []
all_mols = []
kept_frags = []
growing_frags = []
# ligands dropped because sanitization or decomposition raised
skipped_ligands = []
os.makedirs(osp.join(transform_path,'frag_surface'), exist_ok=True)
os.makedirs(osp.join(transform_path,'frag_mols'),exist_ok=True)
os.makedirs(osp.join(transform_path,'frag_fragments'),exist_ok=True)
# Create exactly the directory the drawings are written to.
os.makedirs(draw2d,exist_ok=True)

for surf_nm, ligand_nm in split['test']:
    ligand_file = osp.join(ligand_path, ligand_nm)
    surface_file = osp.join(surface_path,surf_nm)
    # base name of the ligand file up to its first '.'
    file_name = ligand_nm.split('/')[-1].split('.')[0]
    mol = read_sdf(ligand_file)[0]
    try:
        # Sanitization is inside the try so that one unparsable/invalid ligand
        # is skipped instead of aborting the whole preparation run.
        Chem.SanitizeMol(mol)
        scaffold, side_chains = Murcko_decompose(mol)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        skipped_ligands.append(ligand_nm)
        continue
    if len(side_chains) == 0:
        # no side chains to grow for this ligand
        continue
    all_mols.append(mol)
    # presumably merges the individual side chains into one molecule object
    side_chains = combine_mols(side_chains)
    # The kept fragment in this task is the scaffold, matching the per-ligand
    # files written to frag_fragments/ below; previously the side chains were
    # collected here, so frag_fragments.sdf disagreed with those files.
    kept_frags.append(scaffold)
    growing_frags.append(side_chains)

    shutil.copy(surface_file,osp.join(transform_path,'frag_surface'))
    write_sdf([mol],osp.join(transform_path,'frag_mols',f'{file_name}.sdf'))
    write_sdf([scaffold],osp.join(transform_path,'frag_fragments',f'{file_name}.sdf'))
    Draw.MolToFile(mol, osp.join(draw2d,f'{file_name}.png'))
    Draw.MolToFile(scaffold, osp.join(draw2d,f'{file_name}_kept.png'))

    index.append((surf_nm, ligand_nm,f'{file_name}.sdf'))

if skipped_ligands:
    print('skipped {} ligands that failed sanitization or Murcko decomposition'.format(len(skipped_ligands)))

write_sdf(all_mols,osp.join(transform_path,'frag_mols.sdf'))
write_sdf(kept_frags,osp.join(transform_path,'frag_fragments.sdf'))
write_pkl(index, osp.join(transform_path,'frag_index.pkl'))


### make all the test data for further generation
all_data = []
for surf_fn, lig_fn, frag_fn in index:
    surf_file = osp.join(surface_path, surf_fn)
    frag_file = osp.join(transform_path,'frag_fragments',frag_fn)
    try:
        data = pdb_to_pocket_data(surf_file,frag_file)
        # remember which files this entry came from
        data.surf_fn = surf_fn
        data.lig_fn = lig_fn
        data.frag_fn = frag_fn
        all_data.append(data)
    except Exception:
        # best effort: report the fragment that could not be converted and go on
        print('failed parse rdmol {}'.format(frag_file))
        continue
write_pkl(all_data, osp.join(transform_path,'index_data.pkl'))
print(len(all_data))