Skip to content

Commit

Permalink
complete run of notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Justin Reese committed Sep 29, 2020
1 parent 5205d0e commit a3f5d0c
Showing 1 changed file with 179 additions and 11 deletions.
190 changes: 179 additions & 11 deletions Run-KG-COVID-19-pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -129,7 +129,47 @@
"Loading country codes: 264it [00:00, 238538.62it/s]\n",
"Unzipping files: 100%|███████████████████████████| 2/2 [03:30<00:00, 105.07s/it]\n",
"100%|█████████████████████████████████████| 54137/54137 [11:09<00:00, 80.84it/s]\n",
"100%|█████████████████████████████████████| 75785/75785 [17:06<00:00, 73.86it/s]\n"
"100%|█████████████████████████████████████| 75785/75785 [17:06<00:00, 73.86it/s]\n",
"WARNING:root:Found >1 DB_Object_Name in rec, using the first one\n",
"Parsing data/raw/go-plus.json\n",
"WARNING:ToolkitGenerator:Range of slot 'treated by' (named thing) does not line with the domain of its inverse (treats)\n",
"WARNING:ToolkitGenerator:Range of slot 'enabled by' (named thing) does not line with the domain of its inverse (enables)\n",
"WARNING:ToolkitGenerator:Range of slot 'superclass of' (iri type) does not line with the domain of its inverse (subclass of)\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: SEMMEDDB\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: UBERON_CORE\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: WD\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: chembio\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: PHAROS\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: GTEx\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: ExO\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: HANCESTRO\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: ORPHA\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: medgen\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: VMC\n",
"WARNING:ToolkitGenerator:Unrecognized prefix: ECTO\n",
"[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/go-plus.json\n",
"[KGX][json_transformer.py][ load_nodes] INFO: Loading 80507 nodes into networkx.MultiDiGraph\n",
"[KGX][json_transformer.py][ load_edges] INFO: Loading 170564 edges into networkx.MultiDiGraph\n",
"Parsing data/raw/hp.json\n",
"[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/hp.json\n",
"[KGX][json_transformer.py][ load_nodes] INFO: Loading 15536 nodes into networkx.MultiDiGraph\n",
"[KGX][json_transformer.py][ load_edges] INFO: Loading 19395 edges into networkx.MultiDiGraph\n",
"Parsing data/raw/mondo.json\n",
"[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/mondo.json\n",
"[KGX][json_transformer.py][ load_nodes] INFO: Loading 24279 nodes into networkx.MultiDiGraph\n",
"[KGX][json_transformer.py][ load_edges] INFO: Loading 47822 edges into networkx.MultiDiGraph\n",
"Parsing data/raw/chebi.json.gz\n",
"[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/chebi.json.gz\n",
"[KGX][json_transformer.py][ load_nodes] INFO: Loading 144674 nodes into networkx.MultiDiGraph\n",
"[KGX][json_transformer.py][ load_edges] INFO: Loading 276297 edges into networkx.MultiDiGraph\n",
"Decompressing\n",
"Parsing data/raw/lifted-go-cams-20200619.xml\n",
"[KGX][rdf_transformer.py][ parse] INFO: Parsing data/raw/lifted-go-cams-20200619.xml with 'None' format\n",
"[KGX][rdf_transformer.py][ parse] INFO: data/raw/lifted-go-cams-20200619.xml parsed with 36281 triples\n",
"[KGX][rdf_transformer.py][ dereify] INFO: Dereifying 4587 nodes\n",
"[KGX][rdf_transformer.py][ parse] INFO: Done parsing data/raw/lifted-go-cams-20200619.xml\n",
"[KGX][transformer.py][ report] INFO: Total nodes in graph: 3681\n",
"[KGX][transformer.py][ report] INFO: Total edges in graph: 3724\n"
]
}
],
Expand All @@ -154,9 +194,82 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'drug-central'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'pharmgkb'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'STRING'\n",
"[KGX][cli_utils.py][ apply_filters] INFO: with node filters: {'category': ['biolink:Gene', 'biolink:Protein']}\n",
"[KGX][cli_utils.py][ apply_filters] INFO: with edge filters: {'subject_category': ['biolink:Gene', 'biolink:Protein'], 'object_category': ['biolink:Gene', 'biolink:Protein'], 'edge_label': ['biolink:interacts_with', 'biolink:has_gene_product']}\n",
"[KGX][cli_utils.py][ apply_operations] INFO: Applying operation kgx.utils.graph_utils.remap_node_identifier with args: {'category': 'biolink:Protein', 'alternative_property': 'xrefs', 'prefix': 'UniProtKB'}\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'ttd'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'zhou-host-proteins'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'SciBite-CORD-19'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'sars-cov-2-gene-annot'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'intact'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'chembl'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'gene-ontology'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'mondo-ontology'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'hp-ontology'\n",
"[KGX][cli_utils.py][ parse_target] INFO: Processing target 'go-cams'\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 3753 nodes from drug-central to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 13900 edges from drug-central to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and drug-central: 1448\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and drug-central: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2432 nodes from pharmgkb to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 5715 edges from pharmgkb to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and pharmgkb: 971\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and pharmgkb: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 29089 nodes from ttd to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 82668 edges from ttd to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and ttd: 206\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and ttd: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 125 nodes from zhou-host-proteins to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 127 edges from zhou-host-proteins to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and zhou-host-proteins: 0\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and zhou-host-proteins: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 188684 nodes from SciBite-CORD-19 to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 9257840 edges from SciBite-CORD-19 to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and SciBite-CORD-19: 114\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and SciBite-CORD-19: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2528 nodes from sars-cov-2-gene-annot to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 46150 edges from sars-cov-2-gene-annot to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and sars-cov-2-gene-annot: 84\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and sars-cov-2-gene-annot: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2461 nodes from intact to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 1093 edges from intact to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and intact: 1909\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and intact: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 6974 nodes from chembl to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 7357 edges from chembl to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and chembl: 2774\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and chembl: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 80614 nodes from gene-ontology to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 170521 edges from gene-ontology to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and gene-ontology: 10335\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and gene-ontology: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 30024 nodes from mondo-ontology to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 47809 edges from mondo-ontology to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and mondo-ontology: 1558\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and mondo-ontology: 167\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 15536 nodes from hp-ontology to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 19395 edges from hp-ontology to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and hp-ontology: 5617\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and hp-ontology: 0\n",
"[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 3681 nodes from go-cams to STRING\n",
"[KGX][graph_merge.py][ add_all_edges] INFO: Adding 3724 edges from go-cams to STRING\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and go-cams: 237\n",
"[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and go-cams: 0\n",
"[KGX][cli_utils.py][ apply_operations] INFO: Applying operation kgx.operations.summarize_graph.generate_graph_stats with args: {'graph_name': 'KG-COVID-19 Graph', 'filename': 'merged_graph_stats.yaml', 'node_facet_properties': ['provided_by'], 'edge_facet_properties': ['provided_by']}\n",
"[KGX][cli_utils.py][ merge] INFO: Writing merged graph to merged-kg-tsv\n"
]
}
],
"source": [
"!python run.py merge"
]
Expand All @@ -173,6 +286,13 @@
"https://kg-hub.berkeleybop.io/kg-covid-19/index.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Other tooling/functionality"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -191,9 +311,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"merged-kg_nodes.tsv\n",
"merged-kg_edges.tsv\n"
]
}
],
"source": [
"!tar -xvzf data/merged/merged-kg.tar.gz"
]
Expand All @@ -207,11 +336,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:root:Loading graph from nodes merged-kg_nodes.tsv and edges merged-kg_edges.tsv files\n",
"INFO:root:Making positive edges\n",
"INFO:root:Making negative edges\n",
"INFO:root:Writing out positive edges\n",
"INFO:root:Writing out negative edges\n"
]
}
],
"source": [
"!python run.py holdouts -e merged-kg_edges.tsv -n merged-kg_nodes.tsv # this might take 10 minutes or so"
]
Expand All @@ -225,9 +366,36 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'degrees_max': '72381',\n",
" 'edges_number': '24761540',\n",
" 'degrees_min': '0',\n",
" 'connected_components_number': '24907',\n",
" 'degrees_mean': '65.5801068391348',\n",
" 'degrees_median': '5',\n",
" 'selfloops_rate': '0.0000155886911718738',\n",
" 'is_directed': 'false',\n",
" 'density': '0.00017368670983437763',\n",
" 'strongly_connected_components_number': '24907',\n",
" 'singleton_nodes': '23320',\n",
" 'unique_node_types_number': '37',\n",
" 'unique_edge_types_number': '33',\n",
" 'degrees_mode': '1',\n",
" 'traps_rate': '0.06176223657691014',\n",
" 'bidirectional_rate': '1',\n",
" 'nodes_number': '377577'}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from ensmallen_graph import EnsmallenGraph\n",
"\n",
Expand Down Expand Up @@ -293,7 +461,7 @@
"source": [
"## Use prebuilt SPARQL queries to query our Blazegraph endpoint on the commandline\n",
"\n",
"KG-COVID-19 has tooling to query our Blazegraph endpoint using predetermined SPARQL queries, and emit the results as a TSV file. Different SPARQL queries on our endpoint or other endpoints can be used by creating a new YAML file and specific this filewith the `-y` flag. "
"KG-COVID-19 has tooling to query our Blazegraph endpoint using predetermined SPARQL queries, and emit the results as a TSV file. Different SPARQL queries on our endpoint or other endpoints can be used by creating a new YAML file and specifying this file with the `-y` flag. "
]
},
{
Expand Down

0 comments on commit a3f5d0c

Please sign in to comment.