diff --git a/example-KG-COVID-19-usage.ipynb b/example-KG-COVID-19-usage.ipynb new file mode 100644 index 00000000..8a887775 --- /dev/null +++ b/example-KG-COVID-19-usage.ipynb @@ -0,0 +1,577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Running KG-COVID-19 pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The KG-COVID-19 pipeline can be run on the command line or via this notebook. The goal here is to run the pipeline end-to-end. \n", + "\n", + "We will also demonstrate some ways that you can use the KG downstream, and show some other features of the framework.\n", + "\n", + "**Note:** This notebook assumes that you have already installed the required dependencies for KG-COVID-19. For more information refer to [Installation instructions](https://github.com/Knowledge-Graph-Hub/kg-covid-19/wiki#installation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Downloading all required datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we start with downloading all required datasets as listed in [download.yaml](../download.yaml)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Downloading files: 0%| | 0/24 [00:00\n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial-BoldMT'\n", + "Sep 28, 2020 4:12:20 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:20 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: Your current java version is: 1.8.0_161\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: To get higher 
rendering speed on old java 1.8 or 9 versions,\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: update to the latest 1.8 or 9 version (>= 1.8.0_191 or >= 9.0.4),\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: or\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.rendering.PDFRenderer suggestKCMS\n", + "INFO: or call System.setProperty(\"sun.java2d.cmm\", \"sun.java2d.cmm.kcms.KcmsServiceProvider\")\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial-BoldMT'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial-BoldMT'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:21 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 
4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'ArialMT'\n", + "Sep 28, 2020 4:12:22 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont \n", + "WARNING: Using fallback font 'LiberationSans' for 'Arial'\n", + "\n", + "5782864it [00:21, 270243.44it/s]\n", + "5782864it [00:21, 271635.01it/s]\n", + "Loading gene info: 28496648it [01:33, 304282.49it/s]\n", + "Loading country codes: 264it [00:00, 238538.62it/s]\n", + "Unzipping files: 100%|███████████████████████████| 2/2 [03:30<00:00, 105.07s/it]\n", + "100%|█████████████████████████████████████| 54137/54137 [11:09<00:00, 80.84it/s]\n", + "100%|█████████████████████████████████████| 75785/75785 [17:06<00:00, 73.86it/s]\n", + "WARNING:root:Found >1 DB_Object_Name in rec, using the first one\n", + "Parsing data/raw/go-plus.json\n", + "WARNING:ToolkitGenerator:Range of slot 'treated by' (named thing) does not line with the domain of its inverse (treats)\n", + "WARNING:ToolkitGenerator:Range of slot 'enabled by' (named thing) does not line with the domain of its inverse (enables)\n", + "WARNING:ToolkitGenerator:Range of slot 'superclass of' (iri type) does not line with the domain of its inverse (subclass of)\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: SEMMEDDB\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: UBERON_CORE\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: WD\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: chembio\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: PHAROS\n", + 
"WARNING:ToolkitGenerator:Unrecognized prefix: GTEx\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: ExO\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: HANCESTRO\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: ORPHA\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: medgen\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: VMC\n", + "WARNING:ToolkitGenerator:Unrecognized prefix: ECTO\n", + "[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/go-plus.json\n", + "[KGX][json_transformer.py][ load_nodes] INFO: Loading 80507 nodes into networkx.MultiDiGraph\n", + "[KGX][json_transformer.py][ load_edges] INFO: Loading 170564 edges into networkx.MultiDiGraph\n", + "Parsing data/raw/hp.json\n", + "[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/hp.json\n", + "[KGX][json_transformer.py][ load_nodes] INFO: Loading 15536 nodes into networkx.MultiDiGraph\n", + "[KGX][json_transformer.py][ load_edges] INFO: Loading 19395 edges into networkx.MultiDiGraph\n", + "Parsing data/raw/mondo.json\n", + "[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/mondo.json\n", + "[KGX][json_transformer.py][ load_nodes] INFO: Loading 24279 nodes into networkx.MultiDiGraph\n", + "[KGX][json_transformer.py][ load_edges] INFO: Loading 47822 edges into networkx.MultiDiGraph\n", + "Parsing data/raw/chebi.json.gz\n", + "[KGX][json_transformer.py][ parse] INFO: Parsing data/raw/chebi.json.gz\n", + "[KGX][json_transformer.py][ load_nodes] INFO: Loading 144674 nodes into networkx.MultiDiGraph\n", + "[KGX][json_transformer.py][ load_edges] INFO: Loading 276297 edges into networkx.MultiDiGraph\n", + "Decompressing\n", + "Parsing data/raw/lifted-go-cams-20200619.xml\n", + "[KGX][rdf_transformer.py][ parse] INFO: Parsing data/raw/lifted-go-cams-20200619.xml with 'None' format\n", + "[KGX][rdf_transformer.py][ parse] INFO: data/raw/lifted-go-cams-20200619.xml parsed with 36281 triples\n", + "[KGX][rdf_transformer.py][ dereify] INFO: Dereifying 4587 nodes\n", + 
"[KGX][rdf_transformer.py][ parse] INFO: Done parsing data/raw/lifted-go-cams-20200619.xml\n", + "[KGX][transformer.py][ report] INFO: Total nodes in graph: 3681\n", + "[KGX][transformer.py][ report] INFO: Total edges in graph: 3724\n" + ] + } + ], + "source": [ + "!python run.py transform" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Merge all datasets into a single graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we create a merged graph by reading in the individual nodes.tsv and edges.tsv and merging them. \n", + "The merge process is driven by the [merge.yaml](../merge.yaml)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'drug-central'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'pharmgkb'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'STRING'\n", + "[KGX][cli_utils.py][ apply_filters] INFO: with node filters: {'category': ['biolink:Gene', 'biolink:Protein']}\n", + "[KGX][cli_utils.py][ apply_filters] INFO: with edge filters: {'subject_category': ['biolink:Gene', 'biolink:Protein'], 'object_category': ['biolink:Gene', 'biolink:Protein'], 'edge_label': ['biolink:interacts_with', 'biolink:has_gene_product']}\n", + "[KGX][cli_utils.py][ apply_operations] INFO: Applying operation kgx.utils.graph_utils.remap_node_identifier with args: {'category': 'biolink:Protein', 'alternative_property': 'xrefs', 'prefix': 'UniProtKB'}\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'ttd'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'zhou-host-proteins'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'SciBite-CORD-19'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'sars-cov-2-gene-annot'\n", + "[KGX][cli_utils.py][ 
parse_target] INFO: Processing target 'intact'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'chembl'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'gene-ontology'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'mondo-ontology'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'hp-ontology'\n", + "[KGX][cli_utils.py][ parse_target] INFO: Processing target 'go-cams'\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 3753 nodes from drug-central to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 13900 edges from drug-central to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and drug-central: 1448\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and drug-central: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2432 nodes from pharmgkb to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 5715 edges from pharmgkb to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and pharmgkb: 971\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and pharmgkb: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 29089 nodes from ttd to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 82668 edges from ttd to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and ttd: 206\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and ttd: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 125 nodes from zhou-host-proteins to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 127 edges from zhou-host-proteins to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and zhou-host-proteins: 0\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: 
Number of edges merged between STRING and zhou-host-proteins: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 188684 nodes from SciBite-CORD-19 to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 9257840 edges from SciBite-CORD-19 to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and SciBite-CORD-19: 114\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and SciBite-CORD-19: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2528 nodes from sars-cov-2-gene-annot to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 46150 edges from sars-cov-2-gene-annot to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and sars-cov-2-gene-annot: 84\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and sars-cov-2-gene-annot: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 2461 nodes from intact to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 1093 edges from intact to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and intact: 1909\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and intact: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 6974 nodes from chembl to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 7357 edges from chembl to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and chembl: 2774\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and chembl: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 80614 nodes from gene-ontology to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 170521 edges from gene-ontology to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between 
STRING and gene-ontology: 10335\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and gene-ontology: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 30024 nodes from mondo-ontology to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 47809 edges from mondo-ontology to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and mondo-ontology: 1558\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and mondo-ontology: 167\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 15536 nodes from hp-ontology to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 19395 edges from hp-ontology to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and hp-ontology: 5617\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and hp-ontology: 0\n", + "[KGX][graph_merge.py][ add_all_nodes] INFO: Adding 3681 nodes from go-cams to STRING\n", + "[KGX][graph_merge.py][ add_all_edges] INFO: Adding 3724 edges from go-cams to STRING\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of nodes merged between STRING and go-cams: 237\n", + "[KGX][graph_merge.py][ merge_graphs] INFO: Number of edges merged between STRING and go-cams: 0\n", + "[KGX][cli_utils.py][ apply_operations] INFO: Applying operation kgx.operations.summarize_graph.generate_graph_stats with args: {'graph_name': 'KG-COVID-19 Graph', 'filename': 'merged_graph_stats.yaml', 'node_facet_properties': ['provided_by'], 'edge_facet_properties': ['provided_by']}\n", + "[KGX][cli_utils.py][ merge] INFO: Writing merged graph to merged-kg-tsv\n" + ] + } + ], + "source": [ + "!python run.py merge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The merged graph should be available in `data/merged/` folder.\n", + "\n", + "This pipeline generates a graph in KGX TSV format 
here:\n", + "`data/merged/merged-kg.tar.gz`\n", + "Prebuilt graphs are also available here:\n", + "https://kg-hub.berkeleybop.io/kg-covid-19/index.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Other tooling/functionality" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make training data for machine learning use case\n", + "\n", + "KG-COVID-19 contains tooling to produce training data for machine learning. Briefly, a training graph is produced with 80% (by default, override with `-t` parameter) of edges. 20% of edges are removed such that they do not create new components. These graphs are emitted as KGX TSV files in `data/holdouts`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### untar and gunzip the graph" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "merged-kg_nodes.tsv\n", + "merged-kg_edges.tsv\n" + ] + } + ], + "source": [ + "!tar -xvzf data/merged/merged-kg.tar.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### create the training/holdout data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:root:Loading graph from nodes merged-kg_nodes.tsv and edges merged-kg_edges.tsv files\n", + "INFO:root:Making positive edges\n", + "INFO:root:Making negative edges\n", + "INFO:root:Writing out positive edges\n", + "INFO:root:Writing out negative edges\n" + ] + } + ], + "source": [ + "!python run.py holdouts -e merged-kg_edges.tsv -n merged-kg_nodes.tsv # this might take 10 minutes or so" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Let's get some stats on our training graph. We're tightly integrated with ensmallen_graph, so we'll use that package to do this." 
from ensmallen_graph import EnsmallenGraph

# Load the training split of the holdout data (the TSV files under
# data/holdouts) as an undirected graph. The keyword arguments are grouped
# by the table they describe so the column mapping is easy to scan.
train_edge_options = dict(
    edge_path="data/holdouts/pos_train_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    edge_types_column="edge_label",
    default_edge_type="biolink:Association",
    ignore_duplicated_edges=True,
)
train_node_options = dict(
    node_path="data/holdouts/pos_train_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_nodes=True,
)

training = EnsmallenGraph.from_csv(
    directed=False,
    **train_edge_options,
    **train_node_options,
)

# Last expression of the cell: the summary report is rendered as the cell output.
training.report()
# Load the full merged KG (the TSVs extracted from merged-kg.tar.gz) with the
# same column mapping as the training graph, so the two reports are comparable.
# `EnsmallenGraph` is imported by the preceding code cell.
merged_edge_options = dict(
    edge_path="merged-kg_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    edge_types_column="edge_label",
    default_edge_type="biolink:Association",
    ignore_duplicated_edges=True,
)
merged_node_options = dict(
    node_path="merged-kg_nodes.tsv",
    nodes_column="id",
    node_types_column="category",
    default_node_type="biolink:NamedThing",
    ignore_duplicated_nodes=True,
)

graph = EnsmallenGraph.from_csv(
    directed=False,
    # NOTE(review): the original cell flags this argument as deprecated and
    # removed in ensmallen_graph 0.4 -- confirm against the installed version
    # before upgrading; it is kept here because the recorded run relied on it.
    force_conversion_to_undirected=True,
    **merged_edge_options,
    **merged_node_options,
)

# Last expression of the cell: the summary report is rendered as the cell output.
graph.report()
There are notebooks to make embeddings using:\n", + "- [SkipGram](https://github.com/monarch-initiative/embiggen/blob/master/notebooks/Graph%20embedding%20using%20SkipGram.ipynb)\n", + "- [CBOW](https://github.com/monarch-initiative/embiggen/blob/master/notebooks/Graph%20embedding%20using%20CBOW.ipynb)\n", + "- [GloVe](https://github.com/monarch-initiative/embiggen/blob/master/notebooks/Graph%20embedding%20using%20GloVe.ipynb)\n", + "\n", + "#### These embeddings can then be used to train MLP, random forest, decision tree, and logistic regression classifiers using [this notebook](https://github.com/monarch-initiative/embiggen/blob/master/notebooks/Classical%20Link%20Prediction.ipynb).\n", + "\n", + "##### Note: consider running the code in these notebooks on a server with GPUs in order to complete in a reasonable amount of time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use prebuilt SPARQL queries to query our Blazegraph endpoint on the command line\n", + "\n", + "KG-COVID-19 has tooling to query our Blazegraph endpoint using predetermined SPARQL queries, and emit the results as a TSV file. Different SPARQL queries on our endpoint or other endpoints can be used by creating a new YAML file and specifying this file with the `-y` flag. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/reeseju/kg-covid-19/kg_covid_19/query.py:17: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. 
import csv

# Print the query result (Biolink category counts for the KG-COVID-19 graph
# loaded on the Blazegraph endpoint) one TSV record per line.
# newline='' lets the csv module do its own newline handling.
with open('data/queries/query-01-bl-cat-counts.tsv', newline='') as counts_file:
    for record in csv.reader(counts_file, delimiter="\t"):
        print(record)
"metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}