In [1]:
# Importing the libraries
from config import BASE_DIR
import numpy as np 
from numpy.linalg import norm
import import_ipynb 
import glob
import pickle
import pandas as pd  
import time
import csv 
 
from mol2vect import extract_vectorized_molecules, mass_featurize_sdf2vec,molvec_extractor
from sdf_batch_splitting import split_sdf_batch_in_compounds 
from bTree import Node
import cProfile

importing Jupyter notebook from mol2vect.ipynb
importing Jupyter notebook from sdf_batch_splitting.ipynb
importing Jupyter notebook from bTree.ipynb


In [2]:
def prepare_sdf_for_experiment():
    """Run the three preparation stages that produce the test query vectors.

    The stages are imported helpers, called in this order:

    1. ``split_sdf_batch_in_compounds(25, <BATCH glob>, <SDF_PER_COMPOUND dir>)``
       -- per the notes that accompanied this cell, it reads the batch SDF
       files under ``TEST/BATCH`` and stores 25 single-compound SDF files in
       ``TEST/SDF_PER_COMPOUND/``; these act as the "new" compounds.
    2. ``mass_featurize_sdf2vec`` -- SDF to vector representation for every
       object in the per-compound directory, written under ``TEST/CSVS/``.
    3. ``extract_vectorized_molecules`` -- reads the CSV files from
       ``TEST/CSVS/`` and writes a vectors-only CSV (``TEST/vectors_list``).

    Returns:
        The value returned by ``extract_vectorized_molecules``; according to
        the original notes, a dict mapping compound id to its vector.
    """
    batch_glob = f'{BASE_DIR}/TEST/BATCH/*.sdf'
    per_compound_dir = f'{BASE_DIR}/TEST/SDF_PER_COMPOUND/'
    csv_dir = f'{BASE_DIR}/TEST/CSVS/'
    vectors_csv_stem = f'{BASE_DIR}/TEST/vectors_list'

    # Stage 1: batch splitting.
    split_sdf_batch_in_compounds(25, batch_glob, per_compound_dir)

    # Stage 2: mol2vec featurization.
    mass_featurize_sdf2vec(SDF_FILES_DIR=per_compound_dir, CSV_FILES_DIR=csv_dir)

    # Stage 3: mol2vec vectorization; its result is the cid -> vector lookup.
    return extract_vectorized_molecules(
        INPUT_DIR=csv_dir,
        OUTPUT_CSV_FILENAME=vectors_csv_stem,
    )

In [3]:
def get_Btree():
    """Load the pickled medoid cluster tree stored under ``BASE_DIR``.

    Reads ``BASE_DIR + '/medoid_cluster_tree.pkl'`` and returns the unpickled
    object unchanged.

    Per the original notes, every node of the tree carries
    ``cluster_name [medoid_cid, [cluster items], parent_cluster, child_position]``.
    Example: ``C581 [683, [516, 61, 683], 'C520', 'r']`` is a cluster with
    three items whose parent is ``C520`` and which is the right child of that
    parent.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: if the file cannot be opened.
        Exception: whatever ``pickle.load`` raises for a corrupt/empty file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load tree
    # files produced by this project.
    # The context manager closes the handle even when unpickling fails.
    with open(BASE_DIR+'/medoid_cluster_tree.pkl', 'rb') as infile:
        return pickle.load(infile)

In [4]:
def vect2cid(vect,lookup_table):
    """Return the compound id whose stored vector equals ``vect`` exactly.

    Args:
        vect: query vector; anything ``np.asarray`` accepts.
        lookup_table: mapping of compound id -> vector.

    Returns:
        The first key (in iteration order) whose value is array-equal to
        ``vect`` -- same shape and same elements -- or an empty list when no
        entry matches.
    """
    target = np.asarray(vect)
    matching_ids = (
        cid
        for cid in lookup_table
        if np.array_equal(np.asarray(lookup_table[cid]), target)
    )
    # The empty list is the "not found" sentinel callers already receive.
    return next(matching_ids, [])

In [5]:
def get_most_similar(dataset):
    """Run the tree search for every row of ``dataset`` and report timing.

    For each row the vector is converted to a list, its compound id is looked
    up with ``vect2cid`` against the notebook-level ``lookup_table`` and
    printed, and ``tree.search_relative_compound(vector, 1)`` is called on the
    notebook-level ``tree``. Total and average wall-clock time are printed at
    the end; the timed span includes the id lookups and the printing.

    Both ``lookup_table`` and ``tree`` are read as globals, so the cells that
    define them must have run first.

    Args:
        dataset: pandas DataFrame with one query vector per row.

    Returns:
        None. All results are printed.
    """
    print('Tree Search Result : ')
    n_queries = len(dataset)
    # perf_counter is the clock intended for measuring elapsed intervals.
    startTime = time.perf_counter()
    for position in range(n_queries):
        # Rows are read by position so a filtered or re-indexed frame works too.
        new_compound = dataset.iloc[position].tolist()
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        tree.search_relative_compound(new_compound,1)

    executionTime = time.perf_counter() - startTime
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    averageTime = executionTime / n_queries if n_queries else 0.0
    print('Avg Execution time in seconds: ' + str(averageTime))
    

In [6]:
def naive_search(dataset,all_compound):
    """Run the brute-force search for every row of ``dataset`` and report timing.

    For each row the vector is converted to a list, its compound id is looked
    up with ``vect2cid`` against the notebook-level ``lookup_table`` and
    printed, and ``get_relative_compound(vector, all_compound)`` is called.
    Total and average wall-clock time are printed at the end; the timed span
    includes the id lookups and the printing.

    ``lookup_table`` is read as a global, so the cell that defines it must
    have run first.

    Args:
        dataset: pandas DataFrame with one query vector per row.
        all_compound: mapping of compound id -> vector to search through.

    Returns:
        None. All results are printed.
    """
    print('naive Search Result : ')
    n_queries = len(dataset)
    # perf_counter is the clock intended for measuring elapsed intervals.
    startTime = time.perf_counter()
    for position in range(n_queries):
        # Rows are read by position so a filtered or re-indexed frame works too.
        new_compound = dataset.iloc[position].tolist()
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        get_relative_compound(new_compound,all_compound)

    executionTime = time.perf_counter() - startTime
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    averageTime = executionTime / n_queries if n_queries else 0.0
    print('Avg Execution time in seconds: ' + str(averageTime))

def get_relative_compound(new_compound,all_compound):
    """Compare ``new_compound`` with every stored compound and print the closest.

    The distance is ``numpy.linalg.norm`` of the element-wise difference
    (the Euclidean distance for 1-D vectors). When several compounds share the
    smallest distance, the first one in iteration order is reported.

    Args:
        new_compound: query vector.
        all_compound: mapping of compound id -> vector.

    Returns:
        None. The closest compound id, the number of compounds compared and
        the distance are printed. For an empty ``all_compound`` a short notice
        is printed instead of raising IndexError.
    """
    # Convert the query once instead of on every iteration.
    query = np.asarray(new_compound)
    dict_cid_dist = {}
    count = 0
    for cid in all_compound:
        dict_cid_dist[cid] = norm(query - np.asarray(all_compound[cid]))
        count += 1

    if not dict_cid_dist:
        print('no compounds available to search')
        return

    # Single pass for the minimum; min() keeps the first key on ties.
    nearest_cid = min(dict_cid_dist, key=dict_cid_dist.get)
    print('relative close compound id is '+ str(nearest_cid))
    print('# of search : '+ str(count) + ', distance : '+ str(dict_cid_dist[nearest_cid]))
        

In [7]:
# Build the test-compound lookup only when this notebook is the running
# program; the guard is skipped when the file is imported as a module.
if __name__ == '__main__':
    print("Prepare sfd for Experiment")
    # lookup_table is bound at notebook level on purpose: get_most_similar()
    # and naive_search() read it as a global when resolving compound ids.
    lookup_table = prepare_sdf_for_experiment()
    

Prepare sfd for Experiment
[92mBatch Splitting Completed![0m


In [8]:
# Load the pickled cluster tree; get_most_similar() reads `tree` as a global.
tree = get_Btree()
# Read the test query vectors from CSV (no header row, one vector per line).
dataset = pd.read_csv(f'{BASE_DIR}/TEST/vectors_list.csv',header=None)

# Load the compound-id -> vector dictionary used by the brute-force baseline.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
# The context manager closes the file even if unpickling raises.
with open(BASE_DIR+'/cid_vect_dict.pkl', 'rb') as infile:
    all_compound = pickle.load(infile)

In [9]:
# Tree-based search over the test vectors. To profile this run instead, use:
#     cProfile.run('get_most_similar(dataset)')
get_most_similar(dataset)

Tree Search Result : 
cid 500002
relative close compound id is 3
# of search :4, distance : 30.674774528064418
cid 500003
relative close compound id is 20
# of search :5, distance : 82.000047818361
cid 500004
relative close compound id is 20
# of search :5, distance : 60.492125924996415
cid 500006
relative close compound id is 20
# of search :5, distance : 75.71829689316458
cid 500007
relative close compound id is 20
# of search :5, distance : 63.80531952915894
cid 500008
relative close compound id is 20
# of search :5, distance : 83.76140646109003
cid 500009
relative close compound id is 17
# of search :4, distance : 63.06450570235814
cid 500010
relative close compound id is 20
# of search :5, distance : 36.020917976051706
cid 500012
relative close compound id is 20
# of search :5, distance : 39.34248781888018
cid 500013
relative close compound id is 17
# of search :4, distance : 52.56984712578358
cid 500014
relative close compound id is 14
# of search :4, distance : 92.40267368874382

In [10]:
# Brute-force baseline: each test vector is compared with every entry of
# all_compound, for comparison against the tree search above.
naive_search(dataset,all_compound)

naive Search Result : 
cid 500002
relative close compound id is 3
# of search : 20, distance : 30.674774528064418
cid 500003
relative close compound id is 20
# of search : 20, distance : 82.000047818361
cid 500004
relative close compound id is 20
# of search : 20, distance : 60.492125924996415
cid 500006
relative close compound id is 20
# of search : 20, distance : 75.71829689316458
cid 500007
relative close compound id is 20
# of search : 20, distance : 63.80531952915894
cid 500008
relative close compound id is 20
# of search : 20, distance : 83.76140646109003
cid 500009
relative close compound id is 15
# of search : 20, distance : 45.3878850867504
cid 500010
relative close compound id is 20
# of search : 20, distance : 36.020917976051706
cid 500012
relative close compound id is 20
# of search : 20, distance : 39.34248781888018
cid 500013
relative close compound id is 20
# of search : 20, distance : 46.49805127890937
cid 500014
relative close compound id is 14
# of search : 20, distan

In [None]:
# Importing the libraries
from config import BASE_DIR
import numpy as np 
from numpy.linalg import norm
import import_ipynb 
import glob
import pickle
import pandas as pd  
import time
import csv 
 
from mol2vect import extract_vectorized_molecules, mass_featurize_sdf2vec,molvec_extractor
from sdf_batch_splitting import split_sdf_batch_in_compounds 
from bTree import Node
import cProfile

In [None]:
def prepare_sdf_for_experiment():
    """Run the three preparation stages that produce the test query vectors.

    The stages are imported helpers, called in this order:

    1. ``split_sdf_batch_in_compounds(25, <BATCH glob>, <SDF_PER_COMPOUND dir>)``
       -- per the notes that accompanied this cell, it reads the batch SDF
       files under ``TEST/BATCH`` and stores 25 single-compound SDF files in
       ``TEST/SDF_PER_COMPOUND/``; these act as the "new" compounds.
    2. ``mass_featurize_sdf2vec`` -- SDF to vector representation for every
       object in the per-compound directory, written under ``TEST/CSVS/``.
    3. ``extract_vectorized_molecules`` -- reads the CSV files from
       ``TEST/CSVS/`` and writes a vectors-only CSV (``TEST/vectors_list``).

    Returns:
        The value returned by ``extract_vectorized_molecules``; according to
        the original notes, a dict mapping compound id to its vector.
    """
    batch_glob = f'{BASE_DIR}/TEST/BATCH/*.sdf'
    per_compound_dir = f'{BASE_DIR}/TEST/SDF_PER_COMPOUND/'
    csv_dir = f'{BASE_DIR}/TEST/CSVS/'
    vectors_csv_stem = f'{BASE_DIR}/TEST/vectors_list'

    # Stage 1: batch splitting.
    split_sdf_batch_in_compounds(25, batch_glob, per_compound_dir)

    # Stage 2: mol2vec featurization.
    mass_featurize_sdf2vec(SDF_FILES_DIR=per_compound_dir, CSV_FILES_DIR=csv_dir)

    # Stage 3: mol2vec vectorization; its result is the cid -> vector lookup.
    return extract_vectorized_molecules(
        INPUT_DIR=csv_dir,
        OUTPUT_CSV_FILENAME=vectors_csv_stem,
    )

In [None]:
def get_Btree():
    """Load the pickled medoid cluster tree stored under ``BASE_DIR``.

    Reads ``BASE_DIR + '/medoid_cluster_tree.pkl'`` and returns the unpickled
    object unchanged.

    Per the original notes, every node of the tree carries
    ``cluster_name [medoid_cid, [cluster items], parent_cluster, child_position]``.
    Example: ``C581 [683, [516, 61, 683], 'C520', 'r']`` is a cluster with
    three items whose parent is ``C520`` and which is the right child of that
    parent.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: if the file cannot be opened.
        Exception: whatever ``pickle.load`` raises for a corrupt/empty file.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load tree
    # files produced by this project.
    # The context manager closes the handle even when unpickling fails.
    with open(BASE_DIR+'/medoid_cluster_tree.pkl', 'rb') as infile:
        return pickle.load(infile)

In [None]:
def vect2cid(vect,lookup_table):
    """Return the compound id whose stored vector equals ``vect`` exactly.

    Args:
        vect: query vector; anything ``np.asarray`` accepts.
        lookup_table: mapping of compound id -> vector.

    Returns:
        The first key (in iteration order) whose value is array-equal to
        ``vect`` -- same shape and same elements -- or an empty list when no
        entry matches.
    """
    target = np.asarray(vect)
    matching_ids = (
        cid
        for cid in lookup_table
        if np.array_equal(np.asarray(lookup_table[cid]), target)
    )
    # The empty list is the "not found" sentinel callers already receive.
    return next(matching_ids, [])

In [None]:
def get_most_similar(dataset):
    """Run the tree search for every row of ``dataset`` and report timing.

    For each row the vector is converted to a list, its compound id is looked
    up with ``vect2cid`` against the notebook-level ``lookup_table`` and
    printed, and ``tree.search_relative_compound(vector, 1)`` is called on the
    notebook-level ``tree``. Total and average wall-clock time are printed at
    the end; the timed span includes the id lookups and the printing.

    Both ``lookup_table`` and ``tree`` are read as globals, so the cells that
    define them must have run first.

    Args:
        dataset: pandas DataFrame with one query vector per row.

    Returns:
        None. All results are printed.
    """
    print('Tree Search Result : ')
    n_queries = len(dataset)
    # perf_counter is the clock intended for measuring elapsed intervals.
    startTime = time.perf_counter()
    for position in range(n_queries):
        # Rows are read by position so a filtered or re-indexed frame works too.
        new_compound = dataset.iloc[position].tolist()
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        tree.search_relative_compound(new_compound,1)

    executionTime = time.perf_counter() - startTime
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    averageTime = executionTime / n_queries if n_queries else 0.0
    print('Avg Execution time in seconds: ' + str(averageTime))
    

In [None]:
def naive_search(dataset,all_compound):
    """Run the brute-force search for every row of ``dataset`` and report timing.

    For each row the vector is converted to a list, its compound id is looked
    up with ``vect2cid`` against the notebook-level ``lookup_table`` and
    printed, and ``get_relative_compound(vector, all_compound)`` is called.
    Total and average wall-clock time are printed at the end; the timed span
    includes the id lookups and the printing.

    ``lookup_table`` is read as a global, so the cell that defines it must
    have run first.

    Args:
        dataset: pandas DataFrame with one query vector per row.
        all_compound: mapping of compound id -> vector to search through.

    Returns:
        None. All results are printed.
    """
    print('naive Search Result : ')
    n_queries = len(dataset)
    # perf_counter is the clock intended for measuring elapsed intervals.
    startTime = time.perf_counter()
    for position in range(n_queries):
        # Rows are read by position so a filtered or re-indexed frame works too.
        new_compound = dataset.iloc[position].tolist()
        cid = vect2cid([new_compound],lookup_table)
        print("cid "+ str(cid))
        get_relative_compound(new_compound,all_compound)

    executionTime = time.perf_counter() - startTime
    print('Execution time in seconds: ' + str(executionTime))
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    averageTime = executionTime / n_queries if n_queries else 0.0
    print('Avg Execution time in seconds: ' + str(averageTime))

def get_relative_compound(new_compound,all_compound):
    """Compare ``new_compound`` with every stored compound and print the closest.

    The distance is ``numpy.linalg.norm`` of the element-wise difference
    (the Euclidean distance for 1-D vectors). When several compounds share the
    smallest distance, the first one in iteration order is reported.

    Args:
        new_compound: query vector.
        all_compound: mapping of compound id -> vector.

    Returns:
        None. The closest compound id, the number of compounds compared and
        the distance are printed. For an empty ``all_compound`` a short notice
        is printed instead of raising IndexError.
    """
    # Convert the query once instead of on every iteration.
    query = np.asarray(new_compound)
    dict_cid_dist = {}
    count = 0
    for cid in all_compound:
        dict_cid_dist[cid] = norm(query - np.asarray(all_compound[cid]))
        count += 1

    if not dict_cid_dist:
        print('no compounds available to search')
        return

    # Single pass for the minimum; min() keeps the first key on ties.
    nearest_cid = min(dict_cid_dist, key=dict_cid_dist.get)
    print('relative close compound id is '+ str(nearest_cid))
    print('# of search : '+ str(count) + ', distance : '+ str(dict_cid_dist[nearest_cid]))
        

In [None]:
# Build the test-compound lookup only when this notebook is the running
# program; the guard is skipped when the file is imported as a module.
if __name__ == '__main__':
    print("Prepare sfd for Experiment")
    # lookup_table is bound at notebook level on purpose: get_most_similar()
    # and naive_search() read it as a global when resolving compound ids.
    lookup_table = prepare_sdf_for_experiment()
    

In [None]:
# Load the pickled cluster tree; get_most_similar() reads `tree` as a global.
tree = get_Btree()
# Read the test query vectors from CSV (no header row, one vector per line).
dataset = pd.read_csv(f'{BASE_DIR}/TEST/vectors_list.csv',header=None)

# Load the compound-id -> vector dictionary used by the brute-force baseline.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project.
# The context manager closes the file even if unpickling raises.
with open(BASE_DIR+'/cid_vect_dict.pkl', 'rb') as infile:
    all_compound = pickle.load(infile)

In [None]:
# Tree-based search over the test vectors. To profile this run instead, use:
#     cProfile.run('get_most_similar(dataset)')
get_most_similar(dataset)

In [None]:
# Brute-force baseline: each test vector is compared with every entry of
# all_compound, for comparison against the tree search.
naive_search(dataset,all_compound)