The dataset is read from an LMDB file, so we first process the raw data into a `.lmdb` file.

In [3]:
import lmdb
import pickle
import os
import sys
from tqdm.auto import tqdm
# Add the Res2Mol checkout to the import path. The path is relative, so it is
# resolved against the current working directory of the notebook kernel.
# NOTE(review): the `utils.*` imports in the next cell presumably resolve
# from this directory -- confirm the relative location matches your layout.
res2mol_path = '../../Res2Mol/'
sys.path.append(res2mol_path)

In [8]:
from utils.protein_ligand import PDBProtein, parse_sdf_file
from utils.data import ProteinLigandData, torchify_dict
from utils.feats.protein import process_PDB_v2

In [None]:
# Convert the raw pocket/ligand files listed in `index.pkl` into a single LMDB
# file. Each successfully parsed (pocket, ligand) pair is pickled and stored
# under the key str(i), where i is the pair's position in the index; pairs
# that fail to parse are counted and skipped, so keys may have gaps.
raw_path = './crossdocked_pocket10'
index_path = os.path.join(raw_path, 'index.pkl')
processed_path = os.path.join(
    os.path.dirname(raw_path),
    os.path.basename(raw_path) + '_processed.lmdb',
)

# Load the index before creating the database so that a missing or corrupt
# index does not leave an empty LMDB file behind.
# NOTE(review): pickle.load executes arbitrary code from the file; only use an
# index.pkl obtained from a trusted source.
with open(index_path, 'rb') as f:
    index = pickle.load(f)

db = lmdb.open(
    processed_path,
    map_size=10 * (1024 * 1024 * 1024),   # 10GB
    create=True,
    subdir=False,
    readonly=False,  # Writable
)

num_skipped = 0
try:
    # All records are written in one write transaction; leaving the `with`
    # block normally commits it, leaving it via an exception aborts it.
    with db.begin(write=True, buffers=True) as txn:
        for i, (pocket_fn, ligand_fn, _, rmsd_str) in enumerate(tqdm(index)):
            if pocket_fn is None:
                continue
            try:
                pocket_dict = process_PDB_v2(os.path.join(raw_path, pocket_fn))
                ligand_dict = parse_sdf_file(os.path.join(raw_path, ligand_fn))
                data = ProteinLigandData.from_protein_ligand_dicts(
                    protein_dict=torchify_dict(pocket_dict),
                    ligand_dict=torchify_dict(ligand_dict),
                )
                data.protein_filename = pocket_fn
                data.ligand_filename = ligand_fn
                payload = pickle.dumps(data)
            except Exception:
                # Best effort: an entry that cannot be parsed or serialized is
                # skipped. `Exception` (rather than a bare `except`) lets
                # KeyboardInterrupt / SystemExit stop the run.
                num_skipped += 1
                if num_skipped % 1000 == 0:
                    print('skipping {}'.format(num_skipped))
                continue
            # The write is deliberately outside the try block: a storage
            # failure (e.g. the map being full) is not a bad sample and must
            # not be silently counted as a skip for every remaining entry.
            txn.put(
                key=str(i).encode(),
                value=payload,
            )
finally:
    # Release the environment even when processing is interrupted.
    db.close()

print('Done. Skipped {} entries in total.'.format(num_skipped))