Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge branch 'master' of kg-covid-19 into include_yaml_manifest_in_tar
  • Loading branch information
justaddcoffee committed Jul 10, 2020
2 parents 9927015 + 6dfa60d commit e79bcaa
Show file tree
Hide file tree
Showing 15 changed files with 160 additions and 99 deletions.
1 change: 1 addition & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pipeline {
)
sh '/usr/bin/python3.7 -m venv venv'
sh '. venv/bin/activate'
sh './venv/bin/pip install wheel'
sh './venv/bin/pip install bmt'
sh './venv/bin/pip install -r requirements.txt'
sh './venv/bin/python setup.py install'
Expand Down
19 changes: 0 additions & 19 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,6 @@ Documentation

See the `repository's wiki <https://github.com/kg-emerging-viruses/kg-emerging-viruses/wiki>`_.

How do I install this package?
----------------------------------------------

.. code:: shell
git clone https://github.com/Knowledge-Graph-Hub/kg-covid-19
cd kg-covid-19
pip install .
pip install -r requirements.txt
How do I use this package?
----------------------------------------------

.. code:: shell
python run.py download
python run.py transform
python run.py merge

Tests Coverage
----------------------------------------------
Expand Down
131 changes: 78 additions & 53 deletions curated/ORFs/uniprot_sars-cov-2.gpi

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,15 @@
local_name: HUMAN_9606_idmapping.dat.gz

#
# SciBite CORD-19 annotations v1.4
# SciBite CORD-19 annotations v1.5
#
-
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.4/CORD-19_1_4.zip
local_name: CORD-19_1_4.zip
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.5/pdf_json.zip
local_name: pdf_json.zip

-
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.5/pmc_json.zip
local_name: pmc_json.zip

# SciBite CORD-19 entity co-occurrences v1.2
-
Expand Down
4 changes: 2 additions & 2 deletions kg_covid_19/merge_utils/merge_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
# merge all subgraphs into a single graph
merged_graph = merge_all_graphs([x.graph for x in transformers])
merged_graph.name = 'merged_graph'
generate_graph_stats(merged_graph, merged_graph.name, f"merged_graph_stats.yaml")
generate_graph_stats(merged_graph, merged_graph.name, "merged_graph_stats.yaml", ['provided_by'], ['provided_by'])

# write the merged graph
if 'destination' in config:
Expand All @@ -96,7 +96,7 @@ def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
destination_transformer.save()
elif destination['type'] in get_file_types():
destination_transformer = get_transformer(destination['type'])(merged_graph)
destination_transformer.save(destination['filename'], extension=destination['type'])
destination_transformer.save(destination['filename'], output_format=destination['type'])
else:
logging.error("type {} not yet supported for KGX load-and-merge operation.".format(destination['type']))

Expand Down
1 change: 1 addition & 0 deletions kg_covid_19/transform_utils/chembl/chembl_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def parse_chembl_molecules(self, data: List):
remap = {
'pref_name': 'name',
'full_molformula': 'molecular_formula',
'synonyms': 'synonym'
}
self._node_header.update([remap[x] if x in remap else x for x in allowed_properties])

Expand Down
2 changes: 1 addition & 1 deletion kg_covid_19/transform_utils/ontology/ontology_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,4 @@ def parse(self, name: str, data_file: str, source: str) -> None:
transformer = ObographJsonTransformer()
transformer.parse(data_file, provided_by=source)
output_transformer = PandasTransformer(transformer.graph)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), extension='tsv', mode=None)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
32 changes: 20 additions & 12 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,49 +41,59 @@ def run(self, data_file: Optional[str] = None) -> None:
Args:
data_file: data file to parse
Should be:
[pdf_json.zip, pmc_json.zip, cv19_scc_1_2.zip]
Returns:
None.
"""
data_files = list()
if not data_file:
data_files.append(os.path.join(self.input_base_dir, "CORD-19_1_4.zip"))
data_files.append(os.path.join(self.input_base_dir, "pdf_json.zip"))
data_files.append(os.path.join(self.input_base_dir, "pmc_json.zip"))

data_files.append(os.path.join(self.input_base_dir, "cv19_scc_1_2.zip"))
else:
data_files.append(data_file)
data_files.extend(data_files)

self.node_header = ['id', 'name', 'category', 'description', 'provided_by']
self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
node_handle = open(self.output_node_file, 'w')
edge_handle = open(self.output_edge_file, 'w')
node_handle.write("\t".join(self.node_header) + "\n")
edge_handle.write("\t".join(self.edge_header) + "\n")
self.parse_annotations(node_handle, edge_handle, data_files[0])
self.parse_annotations(node_handle, edge_handle, data_files[0], data_files[1])

node_handle = open(os.path.join(self.output_dir, "entity_cooccurrence_nodes.tsv"), 'w')
edge_handle = open(os.path.join(self.output_dir, "entity_cooccurrence_edges.tsv"), 'w')
node_handle.write("\t".join(self.node_header) + "\n")
edge_handle.write("\t".join(self.edge_header) + "\n")
self.parse_cooccurrence(node_handle, edge_handle, data_files[1])
self.parse_cooccurrence(node_handle, edge_handle, data_files[2])

def parse_annotations(self, node_handle: Any, edge_handle: Any, data_file: str) -> None:
"""Parse annotations from CORD-19_1_2.zip.
def parse_annotations(self, node_handle: Any, edge_handle: Any,
data_file1: str,
data_file2: str) -> None:
"""Parse annotations from CORD-19_1_5.zip.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
data_file: Path to CORD-19_1_2.zip.
data_file1: Path to first CORD-19_1_5.zip.
data_file2: Path to second CORD-19_1_5.zip.
Returns:
None.
"""
with ZipFile(data_file, 'r') as ZF:
with ZipFile(data_file1, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)
with ZipFile(data_file2, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)

subsets = ['biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset', 'custom_license']
subsets = ['pmc_json', 'pdf_json']
for subset in subsets:
subset_dir = os.path.join(self.input_base_dir, 'CORD19', subset, subset)
subset_dir = os.path.join(self.input_base_dir, subset)
for data_dir in os.listdir(subset_dir):
if os.path.isdir(os.path.join(subset_dir, data_dir)):
for filename in os.listdir(os.path.join(subset_dir, data_dir)):
Expand Down Expand Up @@ -126,8 +136,6 @@ def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str
terms.update(self.extract_termite_hits(x))

provided_by = f"{self.source_name}"
if subset:
provided_by += f" {subset}"

# add a biolink:Publication for each paper
write_node_edge_item(
Expand Down
1 change: 0 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"name",
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
5 changes: 2 additions & 3 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping
get_item_by_priority, uniprot_make_name_to_id_mapping, collapse_uniprot_curie

"""
Ingest protein-protein interactions from STRING DB.
Expand Down Expand Up @@ -187,7 +187,6 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
Expand All @@ -213,14 +212,14 @@ def run(self, data_file: Optional[str] = None) -> None:
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniProtKB:{string_to_uniprot_id_map[protein]}"
uniprot_curie = collapse_uniprot_curie(uniprot_curie)

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)
Expand Down
14 changes: 14 additions & 0 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
import gzip
import logging
import re
import zipfile
from typing import Any, Dict, List, Union
from tqdm import tqdm # type: ignore
Expand Down Expand Up @@ -176,3 +177,16 @@ def guess_bl_category(identifier: str) -> str:
else:
category = 'biolink:NamedThing'
return category


def collapse_uniprot_curie(uniprot_curie: str) -> str:
    """Collapse a UniProtKB isoform curie to its parent protein curie.

    For example, both UniprotKB:P63151-1 and UniprotKB:P63151-2 collapse
    to UniprotKB:P63151. Curies with any other prefix (e.g. ENSEMBL:...)
    are returned unchanged.

    :param uniprot_curie: curie such as UniprotKB:P63151-2
    :return: curie with any trailing isoform suffix stripped
    """
    # Only touch curies with a uniprotkb: prefix; prefix match is
    # case-insensitive so 'UniprotKB:' and 'uniprotkb:' both qualify.
    if re.match(r'^uniprotkb:', uniprot_curie, re.IGNORECASE):
        # Strip a trailing isoform suffix like '-1' or '-12'
        uniprot_curie = re.sub(r'-\d+$', '', uniprot_curie)
    return uniprot_curie
Binary file modified tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_gpi_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os
import unittest

from kg_covid_19.transform_utils.sars_cov_2_gene_annot.sars_cov_2_gene_annot import \
_gpi12iterator


class TestGpiFile(unittest.TestCase):
    """Sanity checks for the curated SARS-CoV-2 GPI file."""

    def setUp(self) -> None:
        # Path is relative to the repo root, where the test suite runs.
        self.gpi_file = 'curated/ORFs/uniprot_sars-cov-2.gpi'
        # Number of gene records the curated file is expected to contain.
        self.expected_sars_cov2_genes = 32

    def test_gpi_file_exists(self):
        self.assertTrue(os.path.exists(self.gpi_file))

    def test_gpi_parsing(self):
        # The record count, not record contents, is what we pin down here.
        with open(self.gpi_file, 'r') as gpi_fh:
            num_records = sum(1 for _ in _gpi12iterator(gpi_fh))
        self.assertEqual(self.expected_sars_cov2_genes, num_records)
6 changes: 3 additions & 3 deletions tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def test_nodes_file(self):
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((10, 7), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
self.assertEqual((10, 6), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'xrefs',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
Expand All @@ -66,7 +66,7 @@ def test_nodes_file(self):
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))
self.assertEqual('UniProtKB:P84085',
self.assertEqual('UniProtKB:P84085', # isoform (-2) stripped off
node_df.loc[node_df['id'] ==
'ENSEMBL:ENSP00000000233'].xrefs.item())

Expand Down
12 changes: 10 additions & 2 deletions tests/test_transform_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest
from parameterized import parameterized
from kg_covid_19.utils.transform_utils import guess_bl_category
from kg_covid_19.utils.transform_utils import guess_bl_category, collapse_uniprot_curie


class TestTransformUtils(unittest.TestCase):
Expand All @@ -13,5 +13,13 @@ class TestTransformUtils(unittest.TestCase):
def test_guess_bl_category(self, curie, category):
self.assertEqual(category, guess_bl_category(curie))


# Each case is [input_curie, expected_collapsed_curie]: non-UniProtKB
# curies pass through unchanged; UniProtKB isoform suffixes (-1, -2, ...)
# are stripped, with a case-insensitive prefix match.
@parameterized.expand([
    ['foobar', 'foobar'],
    ['ENSEMBL:ENSG00000178607', 'ENSEMBL:ENSG00000178607'],
    ['UniprotKB:P63151-1', 'UniprotKB:P63151'],
    ['uniprotkb:P63151-1', 'uniprotkb:P63151'],
    ['UniprotKB:P63151-2', 'UniprotKB:P63151'],
])
def test_collapse_uniprot_curie(self, curie, collapsed_curie):
    self.assertEqual(collapsed_curie, collapse_uniprot_curie(curie))

0 comments on commit e79bcaa

Please sign in to comment.