# Workflow for Extracting Metadata from Graph Databases

In [148]:
import datetime
import pandas as pd
from py2neo import Graph

In [149]:
# Remove pandas' display truncation so result frames are shown in full.
pd.set_option("display.max_rows", None)  # display all rows
pd.set_option("display.max_columns", None)  # display all columns

In [3]:
# Create a py2neo Graph handle for the Neo4j server at neo4j.het.io (Bolt, port 7687).
graph = Graph("bolt://neo4j.het.io:7687")

In [139]:
def _read_credentials(path):
    """Read ``(user, password)`` from a two-line ``key: value`` credentials file.

    The first non-blank line carries the user name and the second the
    password. Only the first ':' on each line is treated as the separator,
    so values that themselves contain ':' are preserved intact.

    Raises:
        ValueError: if the file has fewer than two non-blank lines or one of
            the first two lines has no ':' separator.
    """
    with open(path) as handle:
        entries = [line for line in handle if line.strip()]

    if len(entries) < 2:
        raise ValueError(
            f"Credentials file {path!r} must contain a user line and a "
            "password line, each formatted as 'key: value'."
        )

    values = []
    for entry in entries[:2]:
        _, separator, value = entry.partition(':')
        if not separator:
            raise ValueError(
                f"Malformed line in credentials file {path!r}: expected "
                "'key: value'."
            )
        values.append(value.strip())

    user, password = values
    return user, password


def self_describe(url, type, credentials=None):
    """Collect summary statistics and schema metadata from a knowledge graph.

    Connects to the graph at ``url`` with py2neo, runs aggregate Cypher
    queries over all nodes and relationships, and reads the schema
    statistics exposed by ``db.stats.retrieve``.

    Args:
        url: URL pointer to the knowledge graph.
        type: Type of knowledge graph (NEO4J, GraphDB, etc.). Currently
            unused: the body always talks to the server through py2neo.
        credentials: Optional path to a local credentials file. The first
            line must hold the user and the second the password, each as
            ``key: value``.

    Returns:
        A 5-tuple ``(dataframe_global, dataframe_local, relationships,
        constraints, properties)``:

        * dataframe_global: one-row DataFrame with node count, min/avg/max
          property count and degree per node, plus ``NumRelationships``.
        * dataframe_local: the same node statistics grouped by label set.
        * relationships: DataFrame built from the 'GRAPH COUNTS'
          relationship entries, de-duplicated on ``relationshipType`` with
          the ``count`` column removed.
        * constraints: DataFrame of the constraints listed in 'GRAPH COUNTS'.
        * properties: the ``propertyKeys`` entry from 'TOKENS'.

    Raises:
        ValueError: if ``credentials`` is given but the file is malformed.
    """
    user, password = None, None
    if credentials:
        user, password = _read_credentials(credentials)

    graph = Graph(url, user=user, password=password)

    # Node-level statistics over the whole graph.
    global_summary_stats_query = """MATCH (n)
            WITH labels(n) as labels, size(keys(n)) as props, size((n)--()) as degree
            RETURN
            count(*) AS NumofNodes,
            avg(props) AS AvgNumOfPropPerNode,
            min(props) AS MinNumPropPerNode,
            max(props) AS MaxNumPropPerNode,
            avg(degree) AS AvgNumOfRelationships,
            min(degree) AS MinNumOfRelationships,
            max(degree) AS MaxNumOfRelationships
            """

    relationships_query = 'MATCH ()-[r]->() RETURN count(r) as count_relationships;'
    num_relationships = graph.run(relationships_query).to_series()

    dataframe_global = graph.run(global_summary_stats_query).to_data_frame()
    dataframe_global['NumRelationships'] = num_relationships.values

    # Same statistics, one row per distinct label combination.
    label_wise_stats = '''MATCH (n)
    WITH labels(n) as labels, size(keys(n)) as props, size((n)--()) as degree
    RETURN
    DISTINCT labels,
    count(*) AS NumofNodes,
    avg(props) AS AvgNumOfPropPerNode,
    min(props) AS MinNumPropPerNode,
    max(props) AS MaxNumPropPerNode,
    avg(degree) AS AvgNumOfRelationships,
    min(degree) AS MinNumOfRelationships,
    max(degree) AS MaxNumOfRelationships'''

    dataframe_local = graph.run(label_wise_stats).to_data_frame()

    # Schema-level metadata: the code reads the first returned row of each
    # call, whose ``data`` column holds a nested mapping.
    graph_counts = graph.run("CALL db.stats.retrieve('GRAPH COUNTS');").to_data_frame()
    graph_tokens = graph.run("CALL db.stats.retrieve('TOKENS');").to_data_frame()

    counts_payload = graph_counts['data'].iloc[0]
    properties = graph_tokens['data'].iloc[0]['propertyKeys']

    # List of all constraints.
    constraints = pd.DataFrame(counts_payload['constraints'])

    # List of all relationships based off the schema. Back-fill missing
    # cells from the rows below before de-duplicating, so the first row kept
    # for each relationship type is as complete as possible.
    # ``DataFrame.bfill()`` replaces the deprecated ``fillna(method='bfill')``.
    relationships = (
        pd.DataFrame(counts_payload['relationships'])
        .bfill()
        .drop_duplicates(subset=['relationshipType'], keep='first')
        .drop('count', axis=1)
        .reset_index(drop=True)
    )

    return dataframe_global, dataframe_local, relationships, constraints, properties

In [141]:
# Run the metadata extraction against the Neo4j server at neo4j.het.io; no credentials file is passed.
dataframe_global, dataframe_local, relationships, constraints, properties = self_describe("bolt://neo4j.het.io:7687", type='NEO-4J')

In [147]:
### For each node, get a list of its properties.
### Figure out whether each one is a CURIE, a URL, etc., and then use that to help match it across the KGs.
### Entity recognition on the properties themselves --> NEMO, BERN etc.
### https://covidgraph.org/

properties

['identifier',
 'name',
 'source',
 'url',
 'description',
 'license',
 'chromosome',
 'inchi',
 'inchikey',
 'class_type',
 'mesh_id',
 'bto_id',
 'sources',
 'unbiased',
 'subtypes',
 'method',
 'log2_fold_change',
 'z_score',
 'actions',
 'pubmed_ids',
 'similarity',
 'affinity_nM',
 'urls']

In [132]:
! code credentials.cred

In [135]:
# self_describe returns five values (the last is the list of property keys),
# so all five must be unpacked; four names raise "too many values to unpack".
# NOTE(review): the URL points at the "/browser/" path — confirm this is the
# endpoint py2neo should connect to and not just the web UI address.
dataframe_global, dataframe_local, relationships, constraints, properties = self_describe("http://132.249.238.185:7474/browser/", type='NEO-4J', credentials='credentials.cred')

ClientError: Forbidden: Permission denied.