This notebook relies on an LMDB file, so we first process the raw data into a `.lmdb` file.

In [1]:
from glob import glob
import sys
res2mol_path = '../../SurfGen/'
sys.path.append(res2mol_path)

import os
import pickle
import lmdb
import torch
from torch.utils.data import Dataset
from tqdm.auto import tqdm
import os.path as osp
from rdkit import Chem
from utils.datasets.surfdata import SurfGenDataset
from utils.transforms import *
from utils.misc import *

def read_sdf(sdf_file, sanitize=False):
    """Load every record of an SDF file into a list.

    Args:
        sdf_file: Path handed to ``Chem.SDMolSupplier``.
        sanitize: Forwarded unchanged to the supplier (defaults to False).

    Returns:
        A list with each item yielded by the supplier, in iteration order.
    """
    supplier = Chem.SDMolSupplier(sdf_file, sanitize=sanitize)
    molecules = []
    for mol in supplier:
        molecules.append(mol)
    return molecules
def write_pkl(list,file):
    """Pickle ``list`` into ``file`` and print where it was saved.

    Args:
        list: Object to serialise (any picklable object is accepted).
        file: Destination path, opened in binary write mode.
    """
    with open(file, mode='wb') as handle:
        pickle.dump(list, handle)
        # Report the destination while the handle is still open, as before.
        print('pkl file saved at {}'.format(file))
def write_sdf(mol_list,file):
    """Write each molecule in ``mol_list`` to an SDF file.

    Args:
        mol_list: Iterable of molecules passed one by one to ``SDWriter.write``.
        file: Output path handed to ``Chem.SDWriter``.

    Raises:
        Whatever ``SDWriter.write`` raises; the writer is still closed first.
    """
    writer = Chem.SDWriter(file)
    try:
        for mol in mol_list:
            writer.write(mol)
    finally:
        # Always close the writer, even if one of the writes fails, so the
        # underlying file handle is not leaked.
        writer.close()

import torch
from plyfile import PlyData
from torch_geometric.data import Data
from torch_geometric.transforms import FaceToEdge, Cartesian
from utils.data import ProteinLigandData, torchify_dict
from utils.protein_ligand import parse_sdf_file, parse_rdmol

def read_ply(path, read_face=None):
    """Read the vertex data of a PLY surface file into tensors.

    Every vertex property except the normals (``nx``, ``ny``, ``nz``) is
    loaded. The first three remaining properties are stacked into ``pos`` and
    the rest into ``feature``.
    NOTE(review): this assumes the first three properties are x, y, z —
    confirm against the surface generator.

    Args:
        path: Path of the PLY file.
        read_face: When truthy and the file has a ``face`` element, the face
            vertex indices are also returned under the ``'face'`` key. The
            default (None) skips the faces: the previous code always built
            the face tensor and then discarded it.

    Returns:
        dict with ``'feature'`` and ``'pos'`` tensors, plus ``'face'`` (a long
        tensor stacked along the last dimension) when requested and available.
    """
    with open(path, 'rb') as f:
        data = PlyData.read(f)

    vertex = data['vertex']
    columns = [
        torch.tensor(vertex[prop.name])
        for prop in vertex.properties
        if prop.name not in ('nx', 'ny', 'nz')
    ]
    pos = torch.stack(columns[:3], dim=-1)
    features = torch.stack(columns[3:], dim=-1)

    result = {'feature': features, 'pos': pos}

    # Only pay for the per-face tensor conversion when a caller asks for it.
    if read_face and 'face' in data:
        faces = [torch.tensor(fa, dtype=torch.long) for fa in data['face']['vertex_indices']]
        result['face'] = torch.stack(faces, dim=-1)

    return result

In [2]:
# Notebook configuration expressed as argparse defaults.
# argparse is not imported explicitly anywhere above in this notebook
# (presumably it leaked in through one of the star-imports), so import it here.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='../configs/train_surflinker_moad_oncross.yml')
parser.add_argument('--device', type=str, default='cuda')
parser.add_argument('--base_path', type=str, default='/home/haotian/molecules_confs/Protein_test/SurfGen')
# Parse an empty argv so the defaults above are used inside the notebook.
args = parser.parse_args([])
config = load_config(args.config)
# splitext drops only a real extension; the previous rfind('.') slice cut off
# the last character whenever the file name contained no dot.
config_name = os.path.splitext(os.path.basename(args.config))[0]
seed_all(config.train.seed)

In [6]:
# Build the individual transform objects from the loaded config, then chain
# them into a single `transform` with Compose.
# NOTE(review): the classes come from the star-imports of utils.transforms /
# utils.misc; their exact semantics are not visible in this notebook.
protein_featurizer = FeaturizeProteinAtom()
ligand_featurizer = FeaturizeLigandAtom()
masking = get_mask(config.train.transform.mask)
# The composer is given both featurizers' feature dimensions and the encoder knn setting.
composer = AtomComposer(protein_featurizer.feature_dim, ligand_featurizer.feature_dim, config.model.encoder.knn)

edge_sampler = EdgeSample(config.train.transform.edgesampler)
cfg_ctr = config.train.transform.contrastive
contrastive_sampler = ContrastiveSample(cfg_ctr.num_real, cfg_ctr.num_fake, cfg_ctr.pos_real_std, cfg_ctr.pos_fake_std, config.model.field.knn)
# Order of application: refinement, neighbor counting, featurization, masking,
# composition, focal building, then edge and contrastive sampling.
transform = Compose([
    RefineData(),
    LigandCountNeighbors(),
    protein_featurizer,
    ligand_featurizer,
    masking,
    composer,

    FocalBuilder(),
    edge_sampler,
    contrastive_sampler,
])

In [2]:
import pickle
def read_pkl(file):
    """Return the object stored in the pickle file at ``file``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you produced yourself or otherwise trust.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle)
def write_pkl(list,file):
    """Serialise ``list`` with pickle to ``file`` and print the saved path.

    NOTE(review): this repeats the ``write_pkl`` defined in the first cell;
    the later definition is the one in effect once this cell has run.
    """
    with open(file, mode='wb') as handle:
        pickle.dump(list, handle)
        print('pkl file saved at {}'.format(file))

### filter the molecules

In [3]:
import shutil
def check_atom_type(mol, allowed_atomic_numbers=(6, 7, 8, 9, 15, 16, 17)):
    """Tell whether every atom of ``mol`` has an allowed atomic number.

    Args:
        mol: Object exposing ``GetAtoms()``, whose items expose
            ``GetAtomicNum()``.
        allowed_atomic_numbers: Iterable of accepted atomic numbers. The
            default is the previously hard-coded list
            (6, 7, 8, 9, 15, 16, 17).

    Returns:
        True when all atoms are allowed (including a molecule without atoms),
        False as soon as one atom is not.
    """
    allowed = set(allowed_atomic_numbers)
    # all() stops at the first disallowed atom, like the original early break.
    return all(int(atom.GetAtomicNum()) in allowed for atom in mol.GetAtoms())

# Copy every ligand whose atoms all pass check_atom_type into one folder.
transer_path = ''
atomic_numbers = [6,7,8,9,15,16,17]
all_ligand_dir = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad/all_ligand'
# If the destination directory were missing, shutil.copy would treat the path
# as a file name and overwrite that single file on every iteration.
os.makedirs(all_ligand_dir, exist_ok=True)
ligfiles = glob('/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad/*.sdf')
for ligfile in tqdm(ligfiles):
    mols = read_sdf(ligfile)
    # An empty SDF gives no records, and an unparsable record comes back as
    # None; skip both instead of crashing the whole loop.
    mol = mols[0] if mols else None
    if mol is None:
        print('Skipping unreadable ligand file: {}'.format(ligfile))
        continue
    flag = check_atom_type(mol)
    if flag:
        shutil.copy(ligfile, all_ligand_dir)

  0%|          | 0/36532 [00:00<?, ?it/s]

### Remove all hydrogens from the molecules, because molecular generation should focus on the heavy atoms.

In [9]:
# Strip hydrogens from every filtered ligand and write the result to new_lig_path.
new_lig_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/new_lig_path'
# Create the output folder up front; otherwise every write fails and each
# file is merely reported as an error.
os.makedirs(new_lig_path, exist_ok=True)
ligfiles = glob('/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/*.sdf')
for ligfile in tqdm(ligfiles):
    try:
        mol = read_sdf(ligfile)[0]
        name = os.path.basename(ligfile)
        mol = Chem.RemoveAllHs(mol)
        flag = check_atom_type(mol)
        if flag is not True:
            # Report ligands that still contain a disallowed element; they are
            # written out anyway, as before.
            print(ligfile)
        write_sdf([mol], os.path.join(new_lig_path, name))
    except Exception:
        # Best-effort: report the failing file and carry on. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the loop.
        print(ligfile)

  0%|          | 0/19307 [00:00<?, ?it/s]

[02:52:41] Explicit valence for atom # 20 O, 3, is greater than permitted


/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/5tdm_3.sdf


[02:52:44] Explicit valence for atom # 9 O, 3, is greater than permitted
[02:52:44] Explicit valence for atom # 1 O, 5, is greater than permitted
[02:52:45] Explicit valence for atom # 10 O, 3, is greater than permitted


/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/3ilr_1.sdf
/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/3nu3_1.sdf
/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/1c39_0.sdf


[02:52:48] Explicit valence for atom # 5 O, 3, is greater than permitted
[02:52:49] Explicit valence for atom # 3 O, 4, is greater than permitted
[02:52:50] Explicit valence for atom # 2 C, 5, is greater than permitted
[02:52:51] Explicit valence for atom # 12 O, 3, is greater than permitted


/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/6cig_1.sdf
/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/11bg_0.sdf
/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/4mty_2.sdf
/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/2gyu_0.sdf


### Create the index file: a pickle file holding a list of (surface_ply_filename, ligand_sdf_filename, protein_pdb_filename) tuples, one per ligand

In [6]:
# Folder with the filtered ligand SDF files; the index pickle is stored inside it.
raw_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'
index_path = osp.join(raw_path, 'index.pkl')

In [8]:
# Build one (surface file, ligand file, protein file) triple per ligand SDF.
ligfiles = glob('/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/*.sdf')
moad_index = []
for ligfile in tqdm(ligfiles):
    file_name = osp.basename(ligfile)
    # NOTE(review): the fixed 6-character prefix assumes names shaped like
    # '5tdm_3.sdf'; a two-digit ligand suffix would be truncated — confirm.
    pocket_tag = file_name[:6]
    pdb_tag = file_name[:4]
    ply_file = '{}_pocket_8.0_res_1.5.ply'.format(pocket_tag)
    moad_index.append((ply_file, file_name, '{}_protein.pdb'.format(pdb_tag)))

  0%|          | 0/35739 [00:00<?, ?it/s]

In [9]:
# Persist the list of index triples as a pickle at index_path.
write_pkl(moad_index, index_path)

pkl file saved at /home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter/index.pkl


### create lmdb and lmdb-lock file

In [10]:
raw_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'
index_path = osp.join(raw_path, 'index.pkl')
# The derived files sit next to the raw folder and reuse its name as a prefix.
_parent_dir, _dataset_name = osp.split(raw_path)
processed_path = osp.join(_parent_dir, _dataset_name + '_mol.lmdb')
name2id_path = osp.join(_parent_dir, _dataset_name + '_molname2id.pt')

In [11]:
# Raise the RDKit logger threshold to CRITICAL for the cells that follow.
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

In [12]:
# Parse every (surface, ligand) pair listed in the index and store the pickled
# ProteinLigandData objects in the LMDB file, keyed by their index position.
surf_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_surface_8'
base_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'
lig_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter'

db = lmdb.open(
    processed_path,
    map_size=10*(1024*1024*1024),   # 10GB
    create=True,
    subdir=False,
    readonly=False, # Writable
)
# The index pickle was written by this notebook; do not load untrusted pickles.
with open(index_path, 'rb') as f:
    index = pickle.load(f)
num_skipped = 0
try:
    with db.begin(write=True, buffers=True) as txn:
        for i, (pocket_fn, ligand_fn, protein_fn) in enumerate(tqdm(index)):
            if pocket_fn is None:
                continue
            try:
                sdf_file = osp.join(lig_path, ligand_fn)
                ply_file = osp.join(surf_path, pocket_fn)

                pocket_dict = read_ply(ply_file)
                ligand_dict = parse_sdf_file(sdf_file)
                data = ProteinLigandData.from_protein_ligand_dicts(
                    protein_dict=torchify_dict(pocket_dict),
                    ligand_dict=torchify_dict(ligand_dict),
                )
                data.protein_filename = protein_fn
                data.ligand_filename = ligand_fn
                data.surface_filename = pocket_fn
                data.mol = ligand_dict['mol']
                payload = pickle.dumps(data)
            except Exception:
                # Best-effort parsing: count the failed entry and move on.
                # Exception (not a bare except) keeps Ctrl-C working.
                num_skipped += 1
                if num_skipped%100 == 0:
                    print('Skipping (%d) %s' % (num_skipped, ligand_fn, ))
                continue
            # The write stays outside the try block so that database errors
            # (for example a full map) stop the run instead of being counted
            # as ordinary skipped entries.
            txn.put(
                key = str(i).encode(),
                value = payload
            )
finally:
    # Release the environment even when the loop aborts.
    db.close()
print('Skipped %d of %d entries' % (num_skipped, len(index)))

  0%|          | 0/35739 [00:00<?, ?it/s]

In [13]:
# Reopen the freshly written LMDB file read-only and collect every key.
db = lmdb.open(
    processed_path,
    map_size=5*(1024*1024*1024),   # 5GB
    create=False,
    subdir=False,
    readonly=True,
    lock=False,
    readahead=False,
    meminit=False,
)
with db.begin() as txn:
    # Walk the cursor over keys only (values=False) and keep them in a list.
    keys = [stored_key for stored_key in txn.cursor().iternext(values=False)]

In [14]:
# Map each (surface_filename, ligand_filename) pair to its position in `keys`.
name2id = {}
# Use a single read transaction for the whole scan; the previous code opened
# a new transaction for every record and never closed any of them.
with db.begin() as txn:
    for i, key in enumerate(tqdm(keys, desc='Indexing')):
        data = pickle.loads(txn.get(key))
        data.id = i
        # Explicit check instead of `assert`, which vanishes under `python -O`.
        if data.protein_pos.size(0) == 0:
            print(i, 'empty protein_pos')
            continue
        name = (data.surface_filename, data.ligand_filename)
        name2id[name] = i

Indexing:   0%|          | 0/35672 [00:00<?, ?it/s]

In [15]:
# Store the (surface_filename, ligand_filename) -> id mapping at name2id_path.
torch.save(name2id, name2id_path)

### Create the split file: {'train': [(surface_file, lig_file), ...], 'test': [(surface_file, lig_file), ...], 'all': [...]}. Combined with index.pkl, this lets us split the data into a training set and a test set.

In [16]:
surf_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_surface_8'
base_path = '/home/haotian/molecules_confs/Protein_test/SurfGen/data/bindingmoad_filter2'

# 'train' and 'test' start empty; 'all' lists every key of name2id.
split = {
    'train': [],
    'test': [],
    'all': list(name2id.keys()),
}

# Derive the (surface file, ligand file) name pair of every held-out test ligand.
test_raw = []
sdffiles = glob('/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad/test_newpdb/*.sdf')
for ligfile in sdffiles:
    sdffile = osp.basename(ligfile)
    # NOTE(review): same fixed 6-character prefix as used when the index was
    # built; it assumes a single-digit ligand suffix — confirm.
    name = sdffile[:6]
    pdbid = name[:4]
    surffile = '{}_pocket_8.0_res_1.5.ply'.format(name)
    test_name = (surffile, sdffile)
    test_raw.append(test_name)

In [17]:
# Partition every known (surface, ligand) pair into test or train.
# The lists are rebuilt from scratch, so re-running this cell no longer
# appends duplicates, and the set lookup replaces the nested scan over test_raw.
test_lookup = set(test_raw)
split['test'] = [pair for pair in split['all'] if pair in test_lookup]
split['train'] = [pair for pair in split['all'] if pair not in test_lookup]

In [18]:
# Number of (surface, ligand) pairs assigned to the training split.
len(split['train'])

35516

In [19]:
# Number of (surface, ligand) pairs assigned to the test split.
len(split['test'])

156

In [20]:
# Total number of indexed (surface, ligand) pairs.
len(split['all'])

35672

In [21]:
# Sanity check: train + test should add up to the size of 'all'.
len(split['train']) + len(split['test'])

35672

In [22]:
# Save the split dictionary ('train', 'test', 'all') for later use.
torch.save(split, '/home/haotian/molecules_confs/Protein_test/SurfGen/data/moad_filter_split_by_name.pt')

In [27]:
# Scratch check: load the first record of the first globbed ligand file.
mol = read_sdf(ligfiles[0])[0]

In [30]:
# Scratch check: load the crossdocked index pickle. This rebinds `index`,
# which an earlier cell used for the MOAD index.
index = read_pkl('/home/haotian/molecules_confs/Protein_test/Res2mol/data/crossdocked_pocket10/index.pkl')

In [38]:
# Scratch check: read the ligand named by the second field of the first
# crossdocked index entry.
i = 0
base_path = '/home/haotian/molecules_confs/Protein_test/Res2mol/data/crossdocked_pocket10'
ligfile = osp.join(base_path,index[i][1])
mol = read_sdf(ligfile)[0]