In [None]:
from tqdm import tqdm
import glob
import os
from deepchem.feat import RdkitGridFeaturizer
import oddt
from joblib import Parallel,delayed
from oddt.fingerprints import PLEC
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import scipy
import deepchem as dc

# PLEC 

In [None]:
# Load the receptor once at notebook level; parallel_plec() reads this global
# for every ligand it featurizes. Only the first molecule in the PDB is used.
# NOTE(review): ODDT examples usually set `protein.protein = True` after
# loading a receptor -- confirm whether that is needed for PLEC here.
protein_pdb_path = '/home/pathway/protein.pdb'
protein = next(oddt.toolkit.readfile('pdb', protein_pdb_path))

def parallel_plec(lig):
    """Compute the dense PLEC fingerprint of one ligand against the global ``protein``.

    Parameters
    ----------
    lig : str
        Path to an SDF file. Only the first molecule in the file is used.

    Returns
    -------
    The value returned by ``oddt.fingerprints.PLEC`` (called with
    ``sparse=False``), or ``None`` when no molecule could be read from ``lig``.
    """
    print(f"process ligand：{lig}")
    # next() without a default raises StopIteration on an empty file, which
    # would abort the whole Parallel run instead of reaching the None check
    # below. Supplying a default makes the "skip this ligand" path reachable.
    ligand = next(oddt.toolkit.readfile('sdf', lig), None)
    if ligand is None:
        print(f"Unable to read ligand file：{lig}")
        return None
    # NOTE(review): size=4092 is not a power of two (4096 is the nearby one);
    # confirm the value is intentional before changing it, since it defines
    # the feature-vector length of every saved fingerprint.
    return PLEC(
        ligand,
        protein=protein,
        size=4092,
        depth_protein=4,
        depth_ligand=2,
        distance_cutoff=4.5,
        sparse=False,
    )

### Train_PLEC

In [None]:
# Collect the training ligands, ordered by the integer formed from all digits
# in each file name (int('') raises ValueError if a name contains no digit).
# The names are relative to the directory entered here, so they only resolve
# while the working directory is still Train_data.
os.chdir('/home/pathway/Train_data/')
train_sdf = sorted(
    glob.glob('*.sdf'),
    key=lambda name: int(''.join(ch for ch in name if ch.isdigit())),
)
train_sdf

In [None]:
%time
train_plec = Parallel(n_jobs = 16, backend = "multiprocessing")(delayed(parallel_plec)(sdf) for sdf in tqdm(train_sdf))

In [None]:
# Drop the entries for which parallel_plec() returned None (ligand could not
# be read) and report how many fingerprints remain.
train_plec_filtered = list(filter(lambda fp: fp is not None, train_plec))
print(len(train_plec_filtered))

In [None]:
# Stack the remaining fingerprints into a single array (one row per ligand)
# and write it out as plain text.
train_plec_matrix = np.array(train_plec_filtered)
np.savetxt(fname='/home/pathway/train_plec.txt', X=train_plec_matrix)

### Test_PLEC

In [None]:
# Collect the test ligands, ordered by the integer formed from all digits in
# each file name (int('') raises ValueError if a name contains no digit).
# Entering Test_data here means relative names globbed from any other
# directory no longer resolve from this point on.
os.chdir('/home/pathway/Test_data/')
test_sdf = sorted(
    glob.glob('*.sdf'),
    key=lambda name: int(''.join(ch for ch in name if ch.isdigit())),
)
test_sdf

In [None]:
%time
test_plec = Parallel(n_jobs = 16, backend = "multiprocessing")(delayed(parallel_plec)(sdf) for sdf in tqdm(test_sdf))

In [None]:
# Drop the entries for which parallel_plec() returned None (ligand could not
# be read) and report how many fingerprints remain.
test_plec_filtered = list(filter(lambda fp: fp is not None, test_plec))
print(len(test_plec_filtered))

In [None]:
# Stack the remaining fingerprints into a single array (one row per ligand)
# and write it out as plain text.
test_plec_matrix = np.array(test_plec_filtered)
np.savetxt(fname='/home/pathway/test_plec.txt', X=test_plec_matrix)

# ConvMol

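### Execute the following cells once for the training set and once for the test set (switch between `train_sdf` and `test_sdf` in the loading loop, and between the train and test output file names in the final `np.savez` cell).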

In [None]:
# Choose the split to featurize; switch both names together.
sdf_files, sdf_dir = train_sdf, '/home/pathway/Train_data/'
# sdf_files, sdf_dir = test_sdf, '/home/pathway/Test_data/'

# Read every molecule of every SDF file, skipping records RDKit could not
# parse (the supplier yields None for those).
mols = []
for sdf in sdf_files:
    # The file lists were globbed as bare names relative to the directory that
    # was current at the time, and the working directory has been changed to
    # Test_data since train_sdf was built. Anchor each name to its own
    # directory so the wrong (or a missing) file is never opened.
    # os.path.join leaves an already-absolute path untouched.
    suppl = Chem.SDMolSupplier(os.path.join(sdf_dir, sdf))
    mols.extend(mol for mol in suppl if mol is not None)

In [None]:
# Featurize every loaded molecule with DeepChem's ConvMolFeaturizer (default
# settings). X holds one featurized object per entry of `mols`; the cell below
# reads each one's adjacency list and atom features.
featurizer_convmol=dc.feat.ConvMolFeaturizer()
X=featurizer_convmol.featurize(mols)

In [None]:
# For each featurized molecule, expand its adjacency list into a dense square
# 0/1 matrix and collect it together with the per-atom feature array.
adjacency_matrices = []
node_features = []
for mol_idx, conv_mol in enumerate(X):
    neighbour_lists = conv_mol.get_adjacency_list()
    n_atoms = len(neighbour_lists)

    # Row = atom, column = neighbour; entries default to 0 (float).
    adjacency = np.zeros((n_atoms, n_atoms))
    for atom_idx, neighbours in enumerate(neighbour_lists):
        for neighbour_idx in neighbours:
            adjacency[atom_idx, neighbour_idx] = 1
    adjacency_matrices.append(adjacency)

    features = conv_mol.get_atom_features()
    node_features.append(features)

    print(f'Molecule {mol_idx} adjancency dimensions:{adjacency.shape}')
    print(f'Molecule {mol_idx} atom features dimensions:{features.shape}')

In [None]:
def _ragged_to_object_array(arrays):
    """Pack a list of arrays into a 1-D object array, one element per input array."""
    packed = np.empty(len(arrays), dtype=object)
    # Assign element by element so NumPy never tries to broadcast the inner
    # arrays into one rectangular block.
    for idx, arr in enumerate(arrays):
        packed[idx] = arr
    return packed


# Each per-molecule matrix is sized by that molecule's atom count, so the list
# entries generally differ in shape. np.savez converts plain lists with
# np.asanyarray, which raises ValueError for such ragged input on
# NumPy >= 1.24 (and only warned before). Packing into explicit object arrays
# keeps one entry per molecule on every NumPy version.
# Object arrays are stored pickled: read back with np.load(path, allow_pickle=True).
np.savez(
    '/home/pathway/train_ConvMol.npz',
    adjacency_matrices=_ragged_to_object_array(adjacency_matrices),
    node_features=_ragged_to_object_array(node_features),
)
# For the test set, write to '/home/pathway/test_ConvMol.npz' instead.