In [None]:
import import_ipynb
import operator
from config import BASE_DIR
import pickle
import numpy as np
from math import pow
from extract_info import cid2vect

In [None]:
# load medoid dictionary
def map_medoids(path=None):
    """Load the pickled medoid/cluster mapping from disk.

    Args:
        path: Optional path of the pickle file to read. When omitted, the
            file ``medoid_cluster_dict.pkl`` inside ``BASE_DIR`` is used.

    Returns:
        The unpickled object, or ``None`` when the file cannot be opened
        (a message is printed in that case).
    """
    if path is None:
        path = BASE_DIR + '/medoid_cluster_dict.pkl'
    try:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by this project.
        # The context manager closes the handle even if pickle.load raises.
        with open(path, 'rb') as infile:
            return pickle.load(infile)
    except IOError:
        print("File Not Found or wrong path")
        return None
    

# computing variance
# returns dictionary [key -> clusterID: value -> [Bounding Box, MAD value]]
# input medoids
def compute_variance(medoids):
    """Build the per-cluster ``[bounding box, MAD value]`` dictionary.

    Args:
        medoids: Mapping of cluster id -> sequence in which index 0 is the
            cluster's medoid compound (or ``None``), index 1 is the iterable
            of member compounds (read only when a medoid exists) and index 2
            is the parent cluster id (read only when the medoid is ``None``).

    Returns:
        dict keyed by ``str(cluster id)``. Clusters with a medoid map to
        ``[bounding_box, mad_value]``. Clusters without a medoid reuse the
        entry already stored for their parent, or ``None`` when the parent
        has not been stored before this cluster is visited.

    Side effects:
        Prints the number of entries in the resulting dictionary.
    """
    mad_dict = dict()

    for cluster_id, cluster_info in medoids.items():
        # Normalise every key the same way so that stored keys and parent
        # lookups always agree, whatever type the ids have.
        key = str(cluster_id)
        medoid = cluster_info[0]

        if medoid is not None:
            medoid_vec = cid2vect(medoid)[0]  # medoid vector

            # vectors of every compound in the cluster, as returned by cid2vect
            cluster_vec = [cid2vect(x) for x in cluster_info[1]]
            cluster_bbox = get_bbox(cluster_vec)  # bounding box of this cluster

            # flatten one level: each cid2vect result contributes its rows
            cluster = [x for vec in cluster_vec for x in vec]
            mad_value = MAD(cluster, medoid_vec)  # spread of this cluster
            mad_dict[key] = [cluster_bbox, mad_value]
        else:
            # No medoid: fall back to whatever was stored for the parent.
            parent_cluster = cluster_info[2]
            mad_dict[key] = mad_dict.get(str(parent_cluster))

    print(len(mad_dict))
    return mad_dict

# where each vec is one compound of the sample space (cluster) and medoid is
# the reference point the deviations are measured from
# input: a cluster and its medoid
def get_median_absolute_deviation(cluster, medoid):
    """Sum of the squared per-coordinate median absolute deviations.

    For every vector in ``cluster`` the element-wise absolute difference to
    ``medoid`` is taken; ``get_absolute_deviations`` then reduces those
    differences to one squared median per coordinate, and the squares are
    summed. Despite the name, the result is that sum, not a single median.

    Args:
        cluster: Iterable of vectors (one per compound).
        medoid: Vector the deviations are measured against.

    Returns:
        The sum of the values returned by ``get_absolute_deviations``
        (``0`` for an empty cluster).
    """
    # No sorting is needed: a per-coordinate median does not depend on the
    # order in which the compounds are visited.
    deviations = [list(np.abs(np.subtract(vec, medoid))) for vec in cluster]
    return sum(get_absolute_deviations(deviations))

# squared median of the supplied values, taken position by position
def get_absolute_deviations(cluster):
    """Return the squared median of each position across ``cluster``.

    ``cluster`` is a sequence of equally long sequences; the i-th result is
    ``pow(median of the i-th elements, 2)``. An empty input yields ``[]``.
    """
    return [pow(np.median(list(column)), 2) for column in zip(*cluster)]


# lower bound and upper bound of a stack of vectors
def LB_N_UB(vec):
    """Return ``[minimum, maximum]`` of ``vec``, both taken along axis 0."""
    lower = np.min(vec, axis=0)
    upper = np.max(vec, axis=0)
    return [lower, upper]


# persist the dictionary that holds a bounding box and MAD value per cluster
def store_mad_dict(mad_dict, filename='variance_dict.pkl'):
    """Pickle ``mad_dict`` to ``filename``.

    Args:
        mad_dict: Object to serialise (the per-cluster dictionary).
        filename: Destination path. Defaults to ``'variance_dict.pkl'``,
            which is resolved against the current working directory.

    Returns:
        None.
    """
    # NOTE(review): map_variance() reads BASE_DIR + '/variance_dict.pkl' while
    # this default is relative to the working directory - confirm that both
    # resolve to the same file.
    # The context manager guarantees the data is flushed and the handle closed.
    with open(filename, 'wb') as outfile:
        pickle.dump(mad_dict, outfile)
    
# load variance dictionary
def map_variance(path=None):
    """Load the pickled variance dictionary from disk.

    Args:
        path: Optional path of the pickle file to read. When omitted, the
            file ``variance_dict.pkl`` inside ``BASE_DIR`` is used.

    Returns:
        The unpickled object, or ``None`` when the file cannot be opened
        (a message is printed in that case).
    """
    if path is None:
        path = BASE_DIR + '/variance_dict.pkl'
    try:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by this project.
        # The context manager closes the handle even if pickle.load raises.
        with open(path, 'rb') as infile:
            return pickle.load(infile)
    except IOError:
        print("File Not Found or wrong path")
        return None
        

# creating a bounding box over every dimension of the compounds in a cluster
# (300 dimensions in our case, but any dimensionality is accepted)
# input cluster
def get_bbox(vec):
    """Return ``[lower, upper]`` per dimension for a cluster of vectors.

    Args:
        vec: Array-like whose axis 0 enumerates the compounds; after the
            reduction along axis 0 the first row holds one value per
            dimension (e.g. shape ``(n_compounds, 1, n_dims)``).

    Returns:
        list with one ``[lower_bound, upper_bound]`` pair per dimension.
        The number of dimensions is taken from the data instead of being
        fixed at 300, so shorter vectors no longer raise ``IndexError`` and
        longer ones are no longer truncated.
    """
    lower = np.min(vec, axis=0)[0]
    upper = np.max(vec, axis=0)[0]
    return [[low, high] for low, high in zip(lower, upper)]

# spread of a cluster around its medoid
def MAD(X, median):
    """Square root of ``get_median_absolute_deviation(X, median)``."""
    summed_deviation = get_median_absolute_deviation(X, median)
    return np.sqrt(summed_deviation)


In [None]:
# Load the computed medoids (required for the computation of the variance).
medoid_list = map_medoids()

# Pick one cluster ('C0') from the mapping: index 0 is its medoid, index 1 its members.
cluster_info = medoid_list.get('C0')

# Prepare the inputs exactly as compute_variance() does, so the value printed
# below is the same statistic that gets stored for this cluster:
# the medoid is the first row of its cid2vect result and the member vectors
# are flattened by one level.
medoid = cid2vect(cluster_info[0])[0]  # medoid vector
cluster_vec = [cid2vect(comp) for comp in cluster_info[1]]  # raw member vectors
cluster = [x for vec in cluster_vec for x in vec]  # flattened member vectors

# Bounding box for this one cluster (computed from the raw member vectors).
bounding_box = get_bbox(cluster_vec)

median_absolute_deviation = MAD(cluster, medoid)
print("\n\nVariance : ", median_absolute_deviation)




In [None]:
# Compute [bounding box, MAD value] for every cluster in medoid_list and
# pickle the resulting dictionary through store_mad_dict(), which writes
# 'variance_dict.pkl'. mad_dict stays available as a notebook-level variable.
mad_dict = compute_variance(medoid_list)
store_mad_dict(mad_dict)

In [None]:
# Reload the pickled variance dictionary from BASE_DIR and print it in full.
# map_variance() returns None (after printing a message) when the file cannot
# be opened, in which case this prints "None".
var = map_variance()
print(var)