# Hierarchy-Based Semantic Encoding 
Input Required
    1. A CSV file containing the hierarchy as two columns: parent node and child node (or subject and object)
    2. Root node of the hierarchy
<br>
Output:
A DataFrame in which each row represents the embedding of a unique value.
<br>

Steps for calculating semantic embeddings
1. Load the hierarchy from CSV into the networkx Graph
2. Convert the graph into tree by specifying the root node of the hierarchy
3. Create an attribute "node-level" for each node to store the level of the node
4. Create unique pairs for all nodes
5. For each pair (i, j), find its lowest common ancestor and replace it with its level
6. Create an adjacency matrix whose rows and columns represent all nodes and whose element (i, j) is the level of the lowest common ancestor of nodes i and j
7. Use any similarity function in range (0,1) to calculate similarity (here we calculate similarity using our proposed measure: hierarchy-based semantic similarity)



In [192]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import timeit
import csv
from sklearn import tree, linear_model
import matplotlib.pyplot as plt
import math
import itertools
from multiprocessing import Pool, cpu_count
from anytree import Node, RenderTree
import os
import networkx as nx
import glob
%matplotlib inline

In [193]:
def create_embeddings(filename, rootnode, target_filename, lambda_factor=0.6):
    """Build hierarchy-based semantic embeddings and save them as a CSV file.

    The hierarchy is read from ``filename``, turned into a tree rooted at
    ``rootnode``, and every ordered pair of tree nodes (i, j) is scored as
    ``lambda_factor ** (depth - level(LCA(i, j)))`` where ``LCA`` is the
    lowest common ancestor, ``level`` is the distance from the root and
    ``depth`` is the largest level found. The diagonal is forced to 1.

    Parameters
    ----------
    filename : str
        Path of the input CSV. It must have at least the two columns
        'subject' and 'object'; edges are created from 'object' to 'subject'.
    rootnode : hashable
        Identifier (URL) of the root node of the hierarchy.
    target_filename : str
        Suffix of the output file. The matrix is written to
        ``<filepath>/embeddings/all_nodes<target_filename>``, where
        ``filepath`` is the module-level variable of that name.
    lambda_factor : float, optional
        Tunable factor used to create the embeddings (0 < lambda_factor < 1),
        see http://ceur-ws.org/Vol-2600/paper16.pdf

    Returns
    -------
    pandas.DataFrame
        The node-by-node similarity matrix that was written to disk.
    """
    df = pd.read_csv(filename)

    # Directed graph with edges pointing from 'object' to 'subject'.
    G = nx.from_pandas_edgelist(df,
                                source='object',
                                target='subject',
                                create_using=nx.DiGraph())

    # Keep only what is reachable from the root, as a tree.
    tree = nx.bfs_tree(G, rootnode)

    # Level of a node = length of the shortest path from the root to it.
    node_levels = nx.shortest_path_length(tree, rootnode)
    nx.set_node_attributes(tree, node_levels, 'node_level')

    # Every ordered pair of nodes, including (n, n).
    all_nodes = list(tree.nodes())
    pairs = list(itertools.product(all_nodes, repeat=2))
    lca_by_pair = nx.algorithms.all_pairs_lowest_common_ancestor(tree, pairs=pairs)

    # Replace each ancestor by its level in the hierarchy.
    # NOTE: ``tree.nodes[...]`` is used because ``Graph.node`` was removed
    # in networkx 2.4.
    rows = [
        (node1, node2, tree.nodes[ancestor]['node_level'])
        for (node1, node2), ancestor in lca_by_pair
    ]
    df_nodes = pd.DataFrame(rows, columns=['node1', 'node2', 'weight'])
    depth = df_nodes['weight'].max()  # maximum level in the hierarchy

    # Square matrix (same node order on both axes) of LCA levels.
    vals = np.unique(df_nodes[['node1', 'node2']])
    level_matrix = df_nodes.pivot(
        index='node1', columns='node2', values='weight'
    ).reindex(columns=vals, index=vals, fill_value=0)

    # Pairs whose common ancestor is deeper get a value closer to 1.
    df_adjacency = level_matrix.apply(
        lambda column: np.power(lambda_factor, depth - column)
    )

    # Set the diagonal to 1. ``set_diag`` is the module-level helper; it is
    # still registered on DataFrame so ``DataFrame.set_diag`` stays available.
    pd.DataFrame.set_diag = set_diag
    df_adjacency.set_diag(1)
    df_adjacency.fillna(0, inplace=True)

    # Make sure the output directory exists before writing into it.
    output_dir = os.path.join(filepath, 'embeddings')
    os.makedirs(output_dir, exist_ok=True)
    df_adjacency.to_csv(os.path.join(output_dir, 'all_nodes' + target_filename))

    return df_adjacency









In [194]:
def set_diag(self, values):
    """Set the main diagonal of a DataFrame in place.

    Parameters
    ----------
    self : pandas.DataFrame
        Frame to modify; it does not need to be square, only the first
        ``min(n_rows, n_columns)`` diagonal cells are written.
    values : scalar or array-like
        A single value for every diagonal cell, or one value per cell.

    Raises
    ------
    ValueError
        If ``values`` cannot be broadcast to the length of the diagonal.
    """
    n = min(len(self.index), len(self.columns))
    diagonal = np.broadcast_to(values, (n,))
    # Write through the positional accessor instead of ``self.values``:
    # ``.values`` is a copy for mixed-dtype frames (the write would be lost)
    # and may be a read-only array when pandas Copy-on-Write is active.
    for position in range(n):
        self.iat[position, position] = diagonal[position]

    


### Store root nodes for all hierarchies

In [195]:
# Root node URL of each hierarchy. A key is looked up as a substring of the
# input CSV file names, so the spelling must match the file names exactly.
root_dict = {
    'Inorganic_': 'http://id.nlm.nih.gov/mesh/D007287',
    'Organic_chemicals': 'http://id.nlm.nih.gov/mesh/D009930',
    'Hetrocyclic_compounds': 'http://id.nlm.nih.gov/mesh/D006571',
    'Polycyclic_compounds': 'http://id.nlm.nih.gov/mesh/D011083',
    'Macromolecular_substances': 'http://id.nlm.nih.gov/mesh/D046911',
    'Hormones_Hormone_Substitutes_Hormone_Antagonists': 'http://id.nlm.nih.gov/mesh/D006730',
    'Enzymes_and_Coenzymes': 'http://id.nlm.nih.gov/mesh/D045762',
    'Carbohydrates': 'http://id.nlm.nih.gov/mesh/D002241',
    'Lipids': 'http://id.nlm.nih.gov/mesh/D008055',
    'Amino_Acids_Peptides_Proteins': 'http://id.nlm.nih.gov/mesh/D000602',
    'Nucleic_Acids_Nucleotides_Nucleosides': 'http://id.nlm.nih.gov/mesh/D009706',
    'Complex_Mixtures': 'http://id.nlm.nih.gov/mesh/D045424',
    'Biological_Factors': 'http://id.nlm.nih.gov/mesh/D001685',
    'biomedical_Dental_Materials': 'http://id.nlm.nih.gov/mesh/D001697',
    'Pharmaceutical_Preparations': 'http://id.nlm.nih.gov/mesh/D004364',
    'Chemical_Actions_Uses': 'http://id.nlm.nih.gov/mesh/D020164',
    'taxonomy': 'https://www.ncbi.nlm.nih.gov/taxonomy/taxon/1',
}

# Collect every CSV file holding hierarchy pairs (subject/object columns)
# from the "data" folder of the current working directory.
filepath = os.getcwd()
csv_pattern = filepath + '/data/*.csv'
file_list = glob.glob(csv_pattern)


### Create embeddings for all hierarchies

In [196]:
# Create the embeddings of every hierarchy whose CSV file is available.
# lambda_factor must satisfy 0 < lambda_factor < 1.
lambda_factor = 0.7
for key, root_node in root_dict.items():
    for csv_path in file_list:
        base_name = os.path.basename(csv_path)
        # Match the key against the file name only: matching the full path
        # would select every file whenever a parent directory happens to
        # contain the key (e.g. a working directory named "taxonomy").
        if key not in base_name:
            continue
        print ('Processing file ..', base_name)
        create_embeddings(csv_path, root_node, base_name, lambda_factor)

        
   

Processing file .. Inorganic_chemicals_hierarchy.csv
Processing file .. Organic_chemicals_hierarchy.csv
Processing file .. Hetrocyclic_compounds_hierarchy.csv
Processing file .. Polycyclic_compounds_hierarchy.csv
Processing file .. Macromolecular_substances_hierarchy.csv
Processing file .. Hormones_Hormone_Substitutes_Hormone_Antagonists_hierarchy.csv
Processing file .. Carbohydrates_hierarchy.csv
Processing file .. Lipids_hierarchy.csv
Processing file .. Amino_Acids_Peptides_Proteins_hierarchy.csv
Processing file .. Biological_Factors_hierarchy.csv
Processing file .. biomedical_Dental_Materials_hierarchy.csv
Processing file .. Chemical_Actions_Uses_hierarchy.csv
Processing file .. taxonomy_hierarchy_only0.csv
