In [1]:
import h5py
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

In [14]:
# HDF5 file that collects every feature set, one group per PDB ID.
output_file = "hdpy-bind-pdbbind.h5"

# Root directory of the pre-computed feature files (kept as a plain string).
root_data_dir = "/g/g13/jones289/workspace/hd-cuda-master/datasets/"

# Join with `Path /` and *relative* fragments: the previous string concatenation
# mixed "root/" + "sub" with "root/" + "/sub", producing "//" in some inputs.
deepchem_aa_feat_path = Path(root_data_dir) / "deepchem_features" / "deepchem_baseline_feats.pkl"
fusion_ml_3dcnn_feat_path = Path(root_data_dir) / "fusion_ml" / "3dcnn" / "3dcnn_data.pkl"
fusion_ml_sgcnn_feat_path = Path(root_data_dir) / "fusion_ml" / "sgcnn" / "sgcnn_data.pkl"
fusion_ml_fusion_feat_path = Path(root_data_dir) / "fusion_ml" / "fusion" / "fusion_data.pkl"
pdbbind_2016_fps_feat_path = (
    Path(root_data_dir)
    / "pdbbind_fingerprints"
    / "pdbbind_2016_fps_new"
    / "v_2016_refined_pdbid_list.csv"
)

In [15]:
# Per-complex metadata table. Its 'pdbid' and '-logKd/Ki' columns are what the
# affinity lookup is built from.
# NOTE(review): this is an absolute, machine-specific path -- consider moving it
# into the configuration cell alongside the other input paths.
#
# Earlier variant, kept for reference: concatenate the 2016 train/test metadata CSVs.
# pdbbind_meta_df = pd.concat([pd.read_csv("/g/g13/jones289/workspace/hd-cuda-master/pdbbind_2016_train_metadata.csv", index_col=0),
#                             pd.read_csv("/g/g13/jones289/workspace/hd-cuda-master/pdbbind_2016_test_metadata.csv")])
pdbbind_meta_df = pd.read_csv("/p/lustre2/jones289/data/pdbbind/metadata/pdbbind_v2019_metadata_no_duplicates.csv")
pdbbind_meta_df

Unnamed: 0.1,Unnamed: 0,index,file,pdbid,-logKd/Ki,set,name,bind
0,0,0,v2019/3v2n/3v2n_ligand.mol2,3v2n,6.10,train,3v2n,2
1,1,1,v2019/5i43/5i43_ligand.mol2,5i43,7.20,train,5i43,2
2,2,2,v2019/5ot9/5ot9_ligand.mol2,5ot9,4.07,train,5ot9,0
3,3,4,v2019/5zkc/5zkc_ligand.mol2,5zkc,8.19,train,5zkc,1
4,4,5,v2019/2bgn/2bgn_ligand.mol2,2bgn,5.70,train,2bgn,0
...,...,...,...,...,...,...,...,...
17628,17628,18451,v2019/1qxy/1qxy_ligand.mol2,1qxy,4.80,train,1qxy,0
17629,17629,18452,v2019/6eo9/6eo9_ligand.mol2,6eo9,6.80,train,6eo9,2
17630,17630,18453,v2019/5hjc/5hjc_ligand.mol2,5hjc,4.35,train,5hjc,0
17631,17631,18454,v2019/4anq/4anq_ligand.mol2,4anq,6.22,val,4anq,2


In [16]:
# Lookup table: PDB ID -> binding affinity (-logKd/Ki).
# Zipping the two columns directly avoids DataFrame.iterrows(), which builds a
# Series for every row. If a PDB ID appears more than once, the last row wins,
# exactly as it did with the row-wise comprehension.
affinity_map = dict(zip(pdbbind_meta_df['pdbid'], pdbbind_meta_df['-logKd/Ki']))

In [17]:
# Display the mapping as a visual sanity check (the whole dict is rendered).
affinity_map

{'3v2n': 6.1,
 '5i43': 7.2,
 '5ot9': 4.07,
 '5zkc': 8.19,
 '2bgn': 5.7,
 '5ncz': 9.1,
 '6e8m': 6.28,
 '4y46': 7.96,
 '4ipj': 4.87,
 '5mxv': 4.92,
 '5f4r': 7.02,
 '1k06': 5.34,
 '5j1x': 5.52,
 '1q6m': 7.89,
 '4n9d': 7.72,
 '5ypo': 7.1,
 '3fv2': 8.11,
 '1akq': 9.1,
 '5boy': 5.6,
 '4n7j': 5.47,
 '5am5': 5.74,
 '2ya8': 5.75,
 '2k7l': 6.15,
 '2p83': 7.96,
 '4jbo': 7.62,
 '4qsw': 1.68,
 '2vx9': 7.44,
 '3qri': 9.1,
 '5nt4': 7.77,
 '5e80': 8.3,
 '3dst': 9.08,
 '2mpa': 8.04,
 '4w4z': 7.52,
 '5mo8': 5.79,
 '4dum': 7.02,
 '5yyf': 6.48,
 '3hqy': 7.94,
 '5wic': 4.64,
 '5oxl': 3.25,
 '5tw3': 8.7,
 '4kn7': 7.62,
 '3qw5': 6.45,
 '4y8c': 7.96,
 '4jyt': 7.4,
 '2nn1': 5.82,
 '6cmr': 6.61,
 '4qok': 5.19,
 '2xjg': 7.82,
 '3t2t': 2.4,
 '1oif': 7.72,
 '4kww': 5.55,
 '4c71': 3.42,
 '1g7g': 6.6,
 '1r6z': 5.0,
 '4x8n': 6.62,
 '6g6y': 6.66,
 '3sut': 7.85,
 '5yls': 5.29,
 '2vtl': 4.01,
 '6msy': 7.0,
 '5csh': 3.57,
 '5vl2': 8.42,
 '2p3a': 7.96,
 '4y6m': 7.15,
 '2xcn': 6.92,
 '2zva': 7.96,
 '3sl4': 7.57,
 '2g94': 9

In [18]:
def load_deepchem_feats(feat_path):
    """Unpickle and return the object stored at ``feat_path``.

    Args:
        feat_path: Path (``str`` or ``pathlib.Path``) of the pickle file.

    Returns:
        Whatever object the pickle contains, unmodified.
        NOTE(review): presumably a dict keyed by PDB ID -- confirm against
        the file's producer.
    """
    # pickle.load can execute arbitrary code: only point this at trusted files.
    with open(feat_path, mode='rb') as pkl_file:
        feats = pickle.load(pkl_file)
    return feats

def load_ml_fusion_feats(feat_path):
    """Load fusion-model features and key each row by its ligand ID.

    The pickle at ``feat_path`` is unpacked into four items
    ``(x_train, y_train, x_test, y_test)``; only the two feature arrays are
    used. Row ``i`` of each array is paired with the ``i``-th ``ligand_id``
    value of the matching metadata CSV, which is read from the same directory
    as the pickle (``pdbbind_2016_train_metadata.csv`` /
    ``pdbbind_2016_test_metadata.csv``).

    Args:
        feat_path: ``pathlib.Path`` of the pickle file (``with_name`` is
            called on it, so a plain string will not work).

    Returns:
        dict mapping ligand ID -> feature row. Train entries come first; a
        ligand ID that also occurs in the test split takes the test row.
    """
    # pickle.load can execute arbitrary code: only point this at trusted files.
    with open(feat_path, 'rb') as pkl_file:
        train_feats, _train_labels, test_feats, _test_labels = pickle.load(pkl_file)

    train_meta = pd.read_csv(feat_path.with_name("pdbbind_2016_train_metadata.csv"),
                             index_col=0)
    test_meta = pd.read_csv(feat_path.with_name("pdbbind_2016_test_metadata.csv"),
                            index_col=0)

    feats_by_ligand = {}
    # Train split first, then test, so test rows overwrite duplicated IDs.
    for meta, feats in ((train_meta, train_feats), (test_meta, test_feats)):
        for row, ligand_id in enumerate(meta['ligand_id']):
            feats_by_ligand[ligand_id] = feats[row, :]
    return feats_by_ligand
    
def load_fp_feats(feat_path, n_bits=1024):
    """Load fingerprint vectors from a CSV and key them by PDB ID.

    The CSV is read with its first column as the index. It must contain a
    ``pdbid`` column plus one column per fingerprint position, named
    ``"0"`` ... ``str(n_bits - 1)``.

    Args:
        feat_path: Path of the fingerprint CSV.
        n_bits: Number of fingerprint columns to read. Defaults to 1024,
            the width that was previously hard-coded.

    Returns:
        dict mapping each ``pdbid`` value to a 1-D array of length ``n_bits``.
        If a PDB ID occurs on several rows, the last row wins.

    Raises:
        KeyError: if ``pdbid`` or any of the expected bit columns is missing.
    """
    df = pd.read_csv(feat_path, index_col=0)
    bit_columns = [str(bit) for bit in range(n_bits)]
    fps = df[bit_columns].values
    return dict(zip(df['pdbid'].values, fps))

In [19]:
# DeepChem baseline features: the pickle content is used exactly as stored.
deepchem_aa_feat_dict = load_deepchem_feats(deepchem_aa_feat_path)

# The three fusion-model feature sets share one loader; load them in the
# order 3D-CNN, SG-CNN, fusion.
(
    fusion_ml_3dcnn_feat_dict,
    fusion_ml_sgcnn_feat_dict,
    fusion_ml_fusion_feat_dict,
) = map(
    load_ml_fusion_feats,
    (
        fusion_ml_3dcnn_feat_path,
        fusion_ml_sgcnn_feat_path,
        fusion_ml_fusion_feat_path,
    ),
)

# Fingerprints read from the CSV listing.
fp_feat_dict = load_fp_feats(pdbbind_2016_fps_feat_path)

In [20]:
def write_feats(dataset_name, data_dict):
    """Write one feature set into the shared HDF5 file, one group per PDB ID.

    The file named by the module-level ``output_file`` is opened in append
    mode. For every key of ``data_dict`` that also appears in the module-level
    ``affinity_map``, the group named after the PDB ID is created if needed,
    its ``affinity`` attribute is set, and the features are stored as a
    float32 dataset called ``dataset_name``. Keys without an affinity entry
    are skipped.

    Args:
        dataset_name: Name of the dataset created inside each PDB ID group.
        data_dict: Mapping of PDB ID -> array-like feature values.

    Raises:
        TypeError: raised by h5py when a dataset with this name already exists
            with an incompatible shape or dtype.
    """
    with h5py.File(output_file, 'a') as f:
        for pdbid in tqdm(data_dict,
                          desc=f"writing {dataset_name} feats.."):

            # Only complexes with a known affinity label are stored.
            if pdbid not in affinity_map:
                continue

            pdbid_group = f.require_group(pdbid)
            pdbid_group.attrs['affinity'] = affinity_map[pdbid]

            data = np.asarray(data_dict[pdbid], dtype=np.float32)
            dset = pdbid_group.require_dataset(dataset_name,
                                               shape=data.shape,
                                               dtype=np.float32)
            # require_dataset only uses a `data=` argument when it creates the
            # dataset; for an existing one the old contents would be kept
            # silently. Writing explicitly makes re-runs store current values.
            dset[...] = data

In [21]:
# Store the DeepChem baseline features as the "deepchem-aa-feat" dataset of each PDB ID group.
write_feats("deepchem-aa-feat", deepchem_aa_feat_dict)

writing deepchem-aa-feat feats..: 100%|██████████| 17652/17652 [00:12<00:00, 1389.70it/s]


In [22]:
# Store the 3D-CNN fusion-model features as the "fusion-3dcnn-feat" dataset of each PDB ID group.
write_feats("fusion-3dcnn-feat", fusion_ml_3dcnn_feat_dict)

writing fusion-3dcnn-feat feats..: 100%|██████████| 3680/3680 [00:03<00:00, 1167.90it/s]


In [23]:
# Store the SG-CNN fusion-model features as the "fusion-sgcnn-feat" dataset of each PDB ID group.
write_feats("fusion-sgcnn-feat", fusion_ml_sgcnn_feat_dict)

writing fusion-sgcnn-feat feats..: 100%|██████████| 3680/3680 [00:03<00:00, 1204.30it/s]


In [24]:
# Store the combined fusion-model features as the "fusion-feat" dataset of each PDB ID group.
write_feats("fusion-feat", fusion_ml_fusion_feat_dict)

writing fusion-feat feats..: 100%|██████████| 3680/3680 [00:03<00:00, 1011.94it/s]


In [25]:
# Store the fingerprint vectors as the "ecfp-fingerprints" dataset of each PDB ID group.
write_feats("ecfp-fingerprints", fp_feat_dict)

writing ecfp-fingerprints feats..: 100%|██████████| 3465/3465 [00:02<00:00, 1185.81it/s]
