In [1]:
# Importing the libraries
from config import BASE_DIR
import numpy as np 
from numpy.linalg import norm
import import_ipynb 
import glob
import pickle
import pandas as pd  
import time
import csv 
 
from mol2vect import extract_vectorized_molecules, mass_featurize_sdf2vec,molvec_extractor
from sdf_batch_splitting import split_sdf_batch_in_compounds 
from bTree import Node
import cProfile

importing Jupyter notebook from mol2vect.ipynb
importing Jupyter notebook from sdf_batch_splitting.ipynb
importing Jupyter notebook from bTree.ipynb


In [2]:
def prepare_sdf_for_experiment():
    """Run the preparation routines for the TEST compounds, in order.

    1. (disabled) split the SDF batch into one SDF per compound;
    2. call ``mass_featurize_sdf2vec`` with the per-compound SDF directory
       and the CSV directory;
    3. call ``extract_vectorized_molecules`` with the CSV directory and the
       ``vectors_list`` output name.

    All paths live under ``{BASE_DIR}/TEST``. Returns nothing.
    """
    sdf_per_compound_dir = f'{BASE_DIR}/TEST/SDF_PER_COMPOUND/'
    csv_dir = f'{BASE_DIR}/TEST/CSVS/'
    vectors_list_name = f'{BASE_DIR}/TEST/vectors_list'

    # --- Batch splitting routine (currently switched off) ---
    # split_sdf_batch_in_compounds(25, f'{BASE_DIR}/TEST/BATCH/*.sdf', sdf_per_compound_dir)

    # --- mol2vec featurization routine ---
    mass_featurize_sdf2vec(
        SDF_FILES_DIR=sdf_per_compound_dir,
        CSV_FILES_DIR=csv_dir,
    )

    # --- mol2vec vectorization ---
    extract_vectorized_molecules(
        INPUT_DIR=csv_dir,
        OUTPUT_CSV_FILENAME=vectors_list_name,
    )
    
def create_dict_cid_vect():
    """Build the lookup table ``compound id -> vector`` for the test compounds.

    Scans ``{BASE_DIR}/TEST/CSVS/`` for ``*.csv`` files. The compound id is
    the file name without its extension, parsed as an integer; the vector is
    whatever ``molvec_extractor`` returns for that file, stored as a list.

    Returns:
        dict: ``int`` compound id mapped to ``list(molvec_extractor(path))``.

    Raises:
        ValueError: if a CSV file name (minus extension) is not an integer.
    """
    import os  # local import: only needed here for portable path handling

    cid2vect_dict = {}

    csvarray = glob.glob(f'{BASE_DIR}/TEST/CSVS/' + '*.csv')

    print('POPULATING  LOOKUP-TABLE:')
    for csvfile in csvarray:
        # Take the id from the file name via os.path rather than splitting on
        # '\\': a backslash split only isolates the file name on Windows, and
        # on POSIX the full path would be handed to int() and fail.
        filename = os.path.basename(csvfile)
        compid = int(os.path.splitext(filename)[0])

        molvec = molvec_extractor(csvfile)
        cid2vect_dict[compid] = list(molvec)

    return cid2vect_dict
    
def get_Btree():
    """Load and return the pickled medoid cluster tree.

    The tree is read from ``{BASE_DIR}/medoid_cluster_tree.pkl``. In this
    tree every node consists of
    ``cluster_name [medoid_cid, [cluster items], parent_cluster, child_position_of_its_parent]``.

    Example: ``C581 [683, [516, 61, 683], 'C520', 'r']`` -- cluster C581 has
    3 items, its parent cluster is C520 and it is the right child of its parent.

    Returns:
        The object stored in the pickle file (the tree).
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling -- only point this at a trusted, locally produced file.
    # The context manager closes the file even when unpickling raises.
    with open(BASE_DIR + '/medoid_cluster_tree.pkl', 'rb') as infile:
        return pickle.load(infile)

def vect2cid(vect,lookup_table):
    """Reverse lookup: find the key whose stored vector equals ``vect``.

    Entries are visited in the table's iteration order and compared with
    ``np.array_equal`` (same shape and same elements).

    Returns:
        The first matching key, or an empty list when nothing matches.
    """
    wanted = np.asarray(vect)
    matching_keys = (
        cid
        for cid in lookup_table
        if np.array_equal(np.asarray(lookup_table[cid]), wanted)
    )
    # An empty list is the "not found" marker callers receive.
    return next(matching_keys, [])

 
def get_most_similar(dataset):
    """Run the tree search for every row of ``dataset`` and report timings.

    For each row the vector is looked up in the module-level ``lookup_table``
    (printing its compound id) and then passed to
    ``tree.search_relative_compound(vector, 1)`` on the module-level ``tree``.
    Total elapsed time is printed at the end; the average per row is printed
    only when there is at least one row.

    Args:
        dataset: pandas DataFrame with one query vector per row.

    Returns:
        None. Results are printed.
    """
    print('Tree Search Result : ')
    n_queries = len(dataset)
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    startTime = time.perf_counter()
    for index in range(n_queries):

        # index is a position produced by range(), so use positional .iloc;
        # label-based .loc only matched it for a default RangeIndex.
        new_compound = dataset.iloc[index].tolist()  # read vector from dataset
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        tree.search_relative_compound(new_compound,1)

    executionTime = (time.perf_counter() - startTime)
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset has no meaningful average; skip it instead of
    # dividing by zero.
    if n_queries:
        print('Avg Execution time in seconds: ' + str(executionTime/n_queries))
    

In [3]:
def naive_search(dataset,all_compound):
    """Run the brute-force search for every row of ``dataset`` and report timings.

    For each row the vector is looked up in the module-level ``lookup_table``
    (printing its compound id) and then compared against every entry of
    ``all_compound`` via ``get_relative_compound``. Total elapsed time is
    printed at the end; the average per row is printed only when there is at
    least one row.

    Args:
        dataset: pandas DataFrame with one query vector per row.
        all_compound: mapping of compound id -> vector to search through.

    Returns:
        None. Results are printed.
    """
    print('naive Search Result : ')
    n_queries = len(dataset)
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    startTime = time.perf_counter()
    for index in range(n_queries):

        # index is a position produced by range(), so use positional .iloc;
        # label-based .loc only matched it for a default RangeIndex.
        new_compound = dataset.iloc[index].tolist()  # read vector from dataset
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        get_relative_compound(new_compound,all_compound)

    executionTime = (time.perf_counter() - startTime)
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset has no meaningful average; skip it instead of
    # dividing by zero.
    if n_queries:
        print('Avg Execution time in seconds: ' + str(executionTime/n_queries))

def get_relative_compound(new_compound,all_compound):
    """Brute-force nearest-compound search by Euclidean distance.

    Computes ``norm(new_compound - vector)`` for every entry of
    ``all_compound`` and prints the id of the closest one, the number of
    comparisons made and the winning distance. On ties the first id in
    iteration order wins.

    Args:
        new_compound: query vector (sequence of numbers).
        all_compound: mapping of compound id -> vector.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: if ``all_compound`` has no entries.
    """
    query = np.asarray(new_compound)  # convert once, not once per candidate
    dict_cid_dist = {
        cid: norm(query - np.asarray(all_compound[cid]))
        for cid in all_compound
    }
    if not dict_cid_dist:
        raise ValueError('all_compound is empty: nothing to compare against')

    # Single pass; min() returns the first key among equal minima, which is
    # the same winner the earlier all-pairs comparison selected.
    closest_cid = min(dict_cid_dist, key=dict_cid_dist.get)
    print('relative close compound id is '+ str(closest_cid))
    print('# of search : '+ str(len(dict_cid_dist)) + ', distance : '+ str(dict_cid_dist[closest_cid]))
        

In [4]:
# Entry-point guard for the preparation step. The recorded cell output shows
# the banner below being printed when this cell is run in the notebook.
if __name__ == '__main__':
    print("Prepare sfd for Experiment")
    # The preparation pipeline itself is switched off here; uncomment the
    # call to run prepare_sdf_for_experiment() before the experiment.
    #prepare_sdf_for_experiment()
    

Prepare sfd for Experiment


In [5]:
# Module-level state used by the search routines defined earlier:
# get_most_similar reads lookup_table and tree; naive_search reads lookup_table.
lookup_table = create_dict_cid_vect()
tree = get_Btree()
# read test dataset from CSV (no header row: every line is a vector)
dataset = pd.read_csv(f'{BASE_DIR}/TEST/vectors_list.csv',header=None)

# load all_compound
# NOTE(security): pickle.load can execute arbitrary code while unpickling --
# only load a trusted, locally produced file. The context manager closes the
# file even when unpickling raises.
with open(BASE_DIR+'/cid_vect_dict.pkl', 'rb') as infile:
    all_compound = pickle.load(infile)

POPULATING  LOOKUP-TABLE:


In [6]:
# Run the tree-based search over every row of the test dataset.
# To profile it instead, uncomment the cProfile line and comment out the
# direct call below it.
#cProfile.run('get_most_similar(dataset)')
get_most_similar(dataset)

Tree Search Result : 
cid 500002
relative close compound id is 282
# of search :13, distance : 29.22275093979305
cid 500003
relative close compound id is 237
# of search :10, distance : 65.54933902868672
cid 500004
relative close compound id is 298
# of search :9, distance : 54.53642942201103
cid 500006
relative close compound id is 662
# of search :10, distance : 65.0827508996173
cid 500007
relative close compound id is 298
# of search :9, distance : 60.37146860546234
cid 500008
relative close compound id is 662
# of search :10, distance : 71.27435404822633
cid 500009
relative close compound id is 698
# of search :11, distance : 32.68611952661175
cid 500010
relative close compound id is 328
# of search :12, distance : 35.76936585009621
cid 500012
relative close compound id is 91
# of search :13, distance : 39.479067516591414
cid 500013
relative close compound id is 556
# of search :12, distance : 40.766853999056174
cid 500014
relative close compound id is 514
# of search :8, distance 

In [7]:
# Brute-force baseline: compare every test vector against all entries of
# all_compound, for comparison with the tree search results.
naive_search(dataset,all_compound)

naive Search Result : 
cid 500002
relative close compound id is 282
# of search : 800, distance : 29.22275093979305
cid 500003
relative close compound id is 237
# of search : 800, distance : 65.54933902868672
cid 500004
relative close compound id is 254
# of search : 800, distance : 51.50468774812905
cid 500006
relative close compound id is 425
# of search : 800, distance : 60.3736623219511
cid 500007
relative close compound id is 425
# of search : 800, distance : 51.970978518571435
cid 500008
relative close compound id is 237
# of search : 800, distance : 66.26176074150915
cid 500009
relative close compound id is 698
# of search : 800, distance : 32.68611952661175
cid 500010
relative close compound id is 568
# of search : 800, distance : 23.917588602224583
cid 500012
relative close compound id is 254
# of search : 800, distance : 31.30383974043729
cid 500013
relative close compound id is 89
# of search : 800, distance : 37.22411968336167
cid 500014
relative close compound id is 251
# 