# Getting statistics on the SPOKE graph

In [24]:
from neo4j import GraphDatabase
import pandas as pd

In [6]:
driver = GraphDatabase.driver(uri='bolt://localhost:7687')

In [54]:
# transactions
def get_node_sources(tx):
    result = tx.run('MATCH (n) RETURN LABELS(n), n.source, COUNT(ID(n));')
    return list(result)

def get_node_types(tx):
    return list(tx.run('MATCH (n) RETURN LABELS(n), COUNT(ID(n));'))

def get_edge_types(tx):
    return list(tx.run('MATCH ()-[e]-() RETURN TYPE(e), COUNT(e);'))

def get_edge_sources(tx):
    return list(tx.run('MATCH ()-[e]-() RETURN TYPE(e), e.source, e.sources[0], COUNT(e);'))

In [56]:
with driver.session() as session:
    node_sources = session.execute_read(get_node_sources)
    edge_types = session.execute_read(get_edge_types)

In [55]:
with driver.session() as session:
    edge_sources = session.execute_read(get_edge_sources)

In [58]:
node_sources_list = [x.data() for x in node_sources]
node_sources_list = [(x['LABELS(n)'][0], x['n.source'], x['COUNT(ID(n))']) for x in node_sources_list]
node_sources_table = pd.DataFrame(node_sources_list, columns=['label', 'source', 'count'])
node_sources_table = node_sources_table.sort_values('label')
node_sources_table = node_sources_table.reset_index(drop=True)

## Node statistics

In [59]:
node_sources_table

Unnamed: 0,label,source,count
0,Anatomy,Uberon,14939
1,AnatomyCellType,,102
2,BiologicalProcess,Gene Ontology,13103
3,CellType,,54
4,CellularComponent,Gene Ontology,1704
5,Compound,ChEMBL,1961462
6,Compound,kegg,5771
7,Compound,DrugBank,125
8,Compound,metacyc,42
9,DatabaseTimestamp,,56


## Edge statistics

In [64]:
edge_types_list = [x.data() for x in edge_types]
edge_types_list = [(x['TYPE(e)'], x['COUNT(e)']) for x in edge_types_list]
edge_types_table = pd.DataFrame(edge_types_list, columns=['edge_type', 'count'])
edge_types_table = edge_types_table.sort_values('edge_type')
edge_types_table = edge_types_table.reset_index(drop=True)
edge_types_table

Unnamed: 0,edge_type,count
0,AFFECTS_CamG,29900
1,ASSOCIATES_DaG,84620
2,BINDS_CbP,1697038
3,CATALYZES_ECcR,31350
4,CAUSES_CcSE,238330
5,CAUSES_OcD,1142
6,CONSUMES_RcC,69248
7,CONTAINS_AcA,37370
8,CONTAINS_DcD,21914
9,CONTAINS_FcC,63926


In [65]:
edge_types_table = edge_types_table.sort_values('count', ascending=False)
edge_types_table = edge_types_table.reset_index(drop=True)
edge_types_table

Unnamed: 0,edge_type,count
0,INCLUDES_OiEC,8011858
1,INCLUDES_OiPW,4111376
2,INTERACTS_PiP,3296347
3,EXPRESSES_AeG,3172464
4,BINDS_CbP,1697038
5,EXPRESSES_ACTeG,1419394
6,PARTICIPATES_GpBP,1273416
7,DOWNREGULATES_AdG,989778
8,UPREGULATES_CuG,956286
9,DOWNREGULATES_CdG,944240


## Edge sources

In [74]:
edge_sources_list = [x.data() for x in edge_sources]
edge_sources_list = [(x['TYPE(e)'], x['e.source'], x['e.sources[0]'], x['COUNT(e)']) for x in edge_sources_list]
edge_sources_table = pd.DataFrame(edge_sources_list, columns=['edge_type', 'source', 'sources', 'count'])
edge_sources_table = edge_sources_table.sort_values('edge_type')
edge_sources_table = edge_sources_table.reset_index(drop=True)

In [81]:
edge_sources_table['source'] = edge_sources_table['source'].fillna(edge_sources_table['sources'])
del edge_sources_table['sources']

In [82]:
pd.set_option('display.max_rows', None)
edge_sources_table

Unnamed: 0,edge_type,source,count
0,AFFECTS_CamG,CancerRX,24924
1,AFFECTS_CamG,CIVIC,4976
2,ASSOCIATES_DaG,OMIM,6148
3,ASSOCIATES_DaG,GWAS,10124
4,ASSOCIATES_DaG,DISEASES,68348
5,BINDS_CbP,DrugCentral,34594
6,BINDS_CbP,BindingDB,1662444
7,CATALYZES_ECcR,,31350
8,CAUSES_CcSE,SIDER 4.1,238330
9,CAUSES_OcD,,2
