Merge 4afc529 into c1fac28

Knowledge-Graph-Hub · Jun 17, 2020 · 4b0bd2f · 4b0bd2f
2 parents c1fac28 + 4afc529
commit 4b0bd2f
Show file tree

Hide file tree

Showing 7 changed files with 111 additions and 8 deletions.
diff --git a/download.yaml b/download.yaml
@@ -54,6 +54,13 @@
   url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   local_name: gene_info.gz
 
+-
+  # UniProt ID mapping file for human proteins, used to make biolink:xrefs
+  # edges from ENSP protein ids to UniprotKB ids (issue #235), to be ID mapped
+  # in the merge step.
+  # NB: this file is also downloaded and used for the TTD transform
+  url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
+  local_name: HUMAN_9606_idmapping.dat.gz
+
 #
 # TTD - Therapeutic Targets Database
 # drug targets, and associated data for each (drugs, ids, etc)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,12 +1,12 @@
 import gzip
+import logging
 import os
 import compress_json  # type: ignore
-from typing import Dict, List, Any, Set, Optional
+from typing import Dict, List, Any, Set, Optional, IO
 
 from kg_covid_19.transform_utils.transform import Transform
-from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority
-
-from encodeproject import download as encode_download  # type: ignore
+from kg_covid_19.utils.transform_utils import write_node_edge_item, \
+    get_item_by_priority, uniprot_make_name_to_id_mapping
 
 """
 Ingest protein-protein interactions from STRING DB.
@@ -24,12 +24,20 @@
 Edge: 
 subject edge_label  object  relation
 protein:1234    interacts_with  protein:4567    RO:0002434
+
+Also write edges that create xrefs between ENSP ids and UniprotKB ids, like so:
+subject edge_label  object  relation
+ENSEMBL:<ENSP id>    biolink:xrefs  UniprotKB:<UniprotKB id>    biolink:xrefs
+
 """
 
 NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
 PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
 GENE_INFO_FILE = 'gene_info.gz'
 
+# UniProt ID mapping file for human proteins, used to build the name to id map
+UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"
+
 
 class StringTransform(Transform):
     """
@@ -43,7 +51,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
         self.protein_gene_map: Dict[str, Any] = {}
         self.gene_info_map: Dict[str, Any] = {}
         self.ensembl2ncbi_map: Dict[str, Any] = {}
+        logging.info("Loading Ensembl Gene to Protein mapping")
         self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
+        logging.info("Load mappings from NCBI gene_info")
         self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])
 
     def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
@@ -141,7 +151,11 @@ def run(self, data_file: Optional[str] = None) -> None:
 
         # Required to align the node edge header of the gene
         # with the default header
-        extra_header = [""]*(len(edge_additional_headers)+1)
+        self.extra_header = [""]*(len(edge_additional_headers)+1)
+
+        # make string ENSP to Uniprot id mapping dict
+        string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
+            os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))
 
         with open(self.output_node_file, 'w') as node, \
                 open(self.output_edge_file, 'w') as edge, \
@@ -155,9 +169,10 @@ def run(self, data_file: Optional[str] = None) -> None:
                 items_dict = parse_stringdb_interactions(line, header_items)
                 proteins = []
                 for protein_name in ('protein1', 'protein2'):
-                    protein = get_item_by_priority(items_dict, [protein_name])
-                    protein = '.'.join(protein.split('.')[1:])
+                    nat_string_id = get_item_by_priority(items_dict, [protein_name])
+                    protein = '.'.join(nat_string_id.split('.')[1:])
                     proteins.append(protein)
+
                     if protein in self.protein_gene_map:
                         gene = self.protein_gene_map[protein]
                         if gene not in seen_genes:
@@ -185,7 +200,7 @@ def run(self, data_file: Optional[str] = None) -> None:
                                     f"ENSEMBL:{protein}",
                                     "RO:0002205",
                                     "NCBI",
-                                ] + extra_header
+                                ] + self.extra_header
                             )
 
                         # write node data
@@ -198,6 +213,26 @@ def run(self, data_file: Optional[str] = None) -> None:
                                       protein_node_type, "", "", self.source_name]
                             )
 
+                    # if we have an equivalent Uniprot ID for this Ensembl protein
+                    # ID make an xref edge, and a node for the Uniprot ID
+                    if nat_string_id in string_to_uniprot_id_map:
+                        uniprot_curie = \
+                            f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
+                        write_node_edge_item(
+                            fh=node,
+                            header=self.node_header,
+                            data=[uniprot_curie, "",
+                                  protein_node_type, "", "", self.source_name])
+                        write_node_edge_item(
+                            fh=edge,
+                            header=self.edge_header,
+                            data=[f"ENSEMBL:{protein}",
+                                  "biolink:xrefs",
+                                  uniprot_curie,
+                                  "biolink:xrefs",
+                                  "uniprot",
+                                  ] + self.extra_header)
+
                 # write edge data
                 write_node_edge_item(
                     fh=edge,
@@ -210,6 +245,7 @@ def run(self, data_file: Optional[str] = None) -> None:
                 )
 
 
+
 def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict:
     """Method processes a line of text from a STRING DB interactions file.
 
@@ -240,3 +276,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
     header = header_string.strip().split(sep)
 
     return [i.replace('"', '') for i in header]
+
diff --git a/tests/resources/string/9606.protein.links.full.v11.0.txt.gz b/tests/resources/string/9606.protein.links.full.v11.0.txt.gz
diff --git a/tests/resources/string/HUMAN_9606_idmapping.dat.gz b/tests/resources/string/HUMAN_9606_idmapping.dat.gz
diff --git a/tests/resources/string/gene2ensembl.gz b/tests/resources/string/gene2ensembl.gz
diff --git a/tests/resources/string/gene_info.gz b/tests/resources/string/gene_info.gz
diff --git a/tests/test_string.py b/tests/test_string.py
@@ -0,0 +1,59 @@
+import os
+import tempfile
+from unittest import TestCase, skip
+
+from parameterized import parameterized
+
+from kg_covid_19.transform_utils.string_ppi import StringTransform
+
+
+class TestString(TestCase):
+    """Tests the string ingest.
+
+    One StringTransform is built in setUpClass from the fixture files under
+    tests/resources/string/ and stored on the class for all test methods.
+    """
+    @classmethod
+    def setUpClass(cls) -> None:
+        """Build the shared StringTransform and record the expected paths."""
+        cls.input_dir = "tests/resources/string/"
+        # NOTE(review): output is written under the system temp directory and
+        # nothing in this class removes it afterwards.
+        cls.output_dir = tempfile.gettempdir()
+        cls.string_output_dir = os.path.join(cls.output_dir, "STRING")
+        cls.string = StringTransform(cls.input_dir, cls.output_dir)
+
+    def setUp(self) -> None:
+        """No per-test setup; the transform is created in setUpClass."""
+        pass
+
+    # Each row is: attribute name on the transform, expected type of that
+    # attribute, a key expected to be present in it, and the value listed for
+    # that key.
+    @parameterized.expand([
+    ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
+    ['gene_info_map', dict, '1',
+     {'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
+      'description': 'alpha-1-B glycoprotein'}],
+    ['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
+    ])
+    def test_instance_vars(self, variable, type, key, val):
+        """Check a mapping attribute's type and that it contains `key`."""
+        this_var = getattr(self.string, variable)
+        self.assertTrue(isinstance(this_var, type))
+        self.assertTrue(key in this_var)
+        # NOTE(review): the second argument of assertTrue is the failure
+        # message, so this only checks that this_var[key] is truthy; `val` is
+        # never compared. Switching to assertEqual needs the expected values
+        # in the table above confirmed against the fixture data first (e.g.
+        # the int 1 vs. the string key '1' used for gene_info_map).
+        self.assertTrue(this_var[key], val)
+
+    def test_output_dir(self):
+        """The transform's output_dir is the STRING dir under output_dir."""
+        self.assertEqual(self.string.output_dir, self.string_output_dir)
+
+    def test_input_dir(self):
+        """The transform's input_base_dir is the input dir it was given."""
+        self.assertEqual(self.string.input_base_dir, self.input_dir)
+
+    def test_output_edge_file(self):
+        """The edge file is edges.tsv inside the STRING output dir."""
+        self.assertEqual(self.string.output_edge_file,
+                         os.path.join(self.string_output_dir, "edges.tsv"))
+
+    def test_output_node_file(self):
+        """The node file is nodes.tsv inside the STRING output dir."""
+        self.assertEqual(self.string.output_node_file,
+                         os.path.join(self.string_output_dir, "nodes.tsv"))
+
+    def test_source_name(self):
+        """The transform reports 'STRING' as its source name."""
+        self.assertEqual(self.string.source_name, 'STRING')
+
+    def test_run(self):
+        """Run the transform and check the output dir and TSV files exist."""
+        # NOTE(review): every Python value is an instance of object, so this
+        # assertion cannot fail.
+        self.assertTrue(isinstance(self.string.run, object))
+        self.string.run()
+        # NOTE(review): the output dir lives under the shared temp directory,
+        # so these paths may already exist from an earlier run; only
+        # existence is checked, not file contents.
+        self.assertTrue(os.path.isdir(self.string_output_dir))
+        self.assertTrue(
+            os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv")))
+        self.assertTrue(
+            os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv")))