# Getting statistics on the CKG graph

The only difference between this and the SPOKE notebook is that I'm currently running the CKG neo4j.

In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [2]:
driver = GraphDatabase.driver(uri='bolt://localhost:7687')

In [3]:
# transactions
def get_node_sources(tx):
    result = tx.run('MATCH (n) RETURN LABELS(n), n.source, COUNT(ID(n));')
    return list(result)

def get_node_types(tx):
    return list(tx.run('MATCH (n) RETURN LABELS(n), COUNT(ID(n));'))

def get_edge_types(tx):
    return list(tx.run('MATCH ()-[e]-() RETURN TYPE(e), COUNT(e);'))

def get_edge_sources(tx):
    return list(tx.run('MATCH ()-[e]-() RETURN TYPE(e), e.source, e.sources[0], COUNT(e);'))

In [4]:
with driver.session() as session:
    node_sources = session.execute_read(get_node_sources)
    edge_types = session.execute_read(get_edge_types)

In [9]:
with driver.session() as session:
    edge_sources = session.execute_read(get_edge_sources)

In [5]:
node_sources_list = [x.data() for x in node_sources]
node_sources_list = [(x['LABELS(n)'][0], x['n.source'], x['COUNT(ID(n))']) for x in node_sources_list]
node_sources_table = pd.DataFrame(node_sources_list, columns=['label', 'source', 'count'])
node_sources_table = node_sources_table.sort_values('label')
node_sources_table = node_sources_table.reset_index(drop=True)

## Node statistics

In [6]:
node_sources_table

Unnamed: 0,label,source,count
0,Amino_acid_sequence,UniProt,20614
1,Analytical_sample,,172
2,Biological_process,,28642
3,Biological_sample,,170
4,Cellular_component,,4176
5,Chromosome,,25
6,Clinically_relevant_variant,CGI,6856
7,Clinically_relevant_variant,,183478
8,Complex,CORUM,2700
9,Disease,,10791


## Edge statistics

In [7]:
edge_types_list = [x.data() for x in edge_types]
edge_types_list = [(x['TYPE(e)'], x['COUNT(e)']) for x in edge_types_list]
edge_types_table = pd.DataFrame(edge_types_list, columns=['edge_type', 'count'])
edge_types_table = edge_types_table.sort_values('edge_type')
edge_types_table = edge_types_table.reset_index(drop=True)
edge_types_table

Unnamed: 0,edge_type,count
0,ACTS_ON,1977410
1,ANNOTATED_IN_PATHWAY,2407618
2,ASSOCIATED_WITH,33415258
3,BELONGS_TO_PROTEIN,7258116
4,BELONGS_TO_SUBJECT,340
5,COMPILED_INTERACTS_WITH,3913224
6,CURATED_AFFECTS_INTERACTION_WITH,21746
7,CURATED_INTERACTS_WITH,595961
8,DETECTED_IN_PATHOLOGY_SAMPLE,3394496
9,FOUND_IN_PROTEIN,408488


In [8]:
edge_types_table = edge_types_table.sort_values('count', ascending=False)
edge_types_table = edge_types_table.reset_index(drop=True)
edge_types_table

Unnamed: 0,edge_type,count
0,MENTIONED_IN_PUBLICATION,222218476
1,VARIANT_FOUND_IN_PROTEIN,53614586
2,ASSOCIATED_WITH,33415258
3,VARIANT_FOUND_IN_GENE,21277870
4,VARIANT_FOUND_IN_CHROMOSOME,21260216
5,BELONGS_TO_PROTEIN,7258116
6,COMPILED_INTERACTS_WITH,3913224
7,DETECTED_IN_PATHOLOGY_SAMPLE,3394496
8,ANNOTATED_IN_PATHWAY,2407618
9,ACTS_ON,1977410


## Edge sources

In [10]:
edge_sources_list = [x.data() for x in edge_sources]
edge_sources_list = [(x['TYPE(e)'], x['e.source'], x['e.sources[0]'], x['COUNT(e)']) for x in edge_sources_list]
edge_sources_table = pd.DataFrame(edge_sources_list, columns=['edge_type', 'source', 'sources', 'count'])
edge_sources_table = edge_sources_table.sort_values('edge_type')
edge_sources_table = edge_sources_table.reset_index(drop=True)

In [11]:
edge_sources_table['source'] = edge_sources_table['source'].fillna(edge_sources_table['sources'])
del edge_sources_table['sources']

In [12]:
pd.set_option('display.max_rows', None)
edge_sources_table

Unnamed: 0,edge_type,source,count
0,ACTS_ON,STRING,1977410
1,ANNOTATED_IN_PATHWAY,Reactome,117270
2,ANNOTATED_IN_PATHWAY,SMPDB,2290348
3,ASSOCIATED_WITH,DisGeNet: ORPHANET;UNIPROT,52
4,ASSOCIATED_WITH,DisGeNet: CLINGEN;GENOMICS_ENGLAND;ORPHANET;UN...,12
5,ASSOCIATED_WITH,DisGeNet: CGI;UNIPROT,106
6,ASSOCIATED_WITH,DisGeNet: CGI;GENOMICS_ENGLAND,128
7,ASSOCIATED_WITH,DisGeNet: CGI;CLINGEN;CTD_human;GENOMICS_ENGLAND,10
8,ASSOCIATED_WITH,DisGeNet: CLINGEN;GENOMICS_ENGLAND;UNIPROT,22
9,ASSOCIATED_WITH,DisGeNet: CGI;GENOMICS_ENGLAND;UNIPROT,10
