Commit
Merge pull request #237 from Knowledge-Graph-Hub/issue_235_string_id_norm

Make xrefs between Ensembl IDs and Uniprot IDs in STRING ingest
deepakunni3 committed Jun 18, 2020
2 parents d24e6d8 + cf1672d commit e6d1f87
Showing 8 changed files with 142 additions and 20 deletions.
7 changes: 7 additions & 0 deletions download.yaml
@@ -54,6 +54,13 @@
url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
local_name: gene_info.gz

-
# this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
# to be ID mapped in merge step
# nb: we are also downloading and using this file for the TTD transform
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz

#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
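For context on the new download: HUMAN_9606_idmapping.dat.gz is UniProt's tab-separated mapping file of (UniProtKB accession, ID type, external ID) triples; rows whose ID type is Ensembl_PRO (or STRING) tie a UniProtKB accession to an ENSP identifier. The sketch below is a hypothetical illustration of how an ENSP-to-UniProtKB lookup could be built from that file; the transform itself calls uniprot_make_name_to_id_mapping from kg_covid_19.utils.transform_utils, whose exact behavior may differ.

import gzip
from typing import Dict

def ensp_to_uniprot_map(idmapping_path: str) -> Dict[str, str]:
    """Hypothetical sketch: build an ENSP -> UniProtKB accession lookup from the
    three-column UniProt idmapping file (accession, id_type, external_id)."""
    mapping: Dict[str, str] = {}
    with gzip.open(idmapping_path, "rt") as fh:
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 3:
                continue
            accession, id_type, external_id = fields
            if id_type == "Ensembl_PRO":
                # e.g. "P84085\tEnsembl_PRO\tENSP00000000233"
                mapping[external_id] = accession
    return mapping

# Usage (the local path is an assumption; use wherever download.yaml places the file):
# ensp_map = ensp_to_uniprot_map("data/raw/HUMAN_9606_idmapping.dat.gz")
# ensp_map.get("ENSP00000000233")  # -> "P84085", per the test fixture added below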
3 changes: 2 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
@@ -4,5 +4,6 @@
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
]
67 changes: 48 additions & 19 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,12 +1,12 @@
import gzip
import logging
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set, Optional
from typing import Dict, List, Any, Set, Optional, IO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority

from encodeproject import download as encode_download # type: ignore
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping

"""
Ingest protein-protein interactions from STRING DB.
@@ -18,18 +18,25 @@
Write node and edge headers that look something like:
Node:
id name category
protein:1234 TBX4 Protein
id name category xrefs provided_by
protein:1234 TBX4 biolink:Protein UniprotKB:123456 STRING
xrefs contains the UniprotKB id for the protein, if available
Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434
"""

NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
GENE_INFO_FILE = 'gene_info.gz'

# make name to id map for uniprot names of human proteins
UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"


class StringTransform(Transform):
"""
@@ -43,7 +50,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.protein_gene_map: Dict[str, Any] = {}
self.gene_info_map: Dict[str, Any] = {}
self.ensembl2ncbi_map: Dict[str, Any] = {}
logging.info("Loading Ensembl Gene to Protein mapping")
self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
logging.info("Load mappings from NCBI gene_info")
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
@@ -141,7 +150,11 @@ def run(self, data_file: Optional[str] = None) -> None:

# Required to align the node edge header of the gene
# with the default header
extra_header = [""]*(len(edge_additional_headers)+1)
self.extra_header = [""]*(len(edge_additional_headers)+1)

# make string ENSP to Uniprot id mapping dict
string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
@@ -155,9 +168,10 @@ def run(self, data_file: Optional[str] = None) -> None:
items_dict = parse_stringdb_interactions(line, header_items)
proteins = []
for protein_name in ('protein1', 'protein2'):
protein = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(protein.split('.')[1:])
nat_string_id = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(nat_string_id.split('.')[1:])
proteins.append(protein)

if protein in self.protein_gene_map:
gene = self.protein_gene_map[protein]
if gene not in seen_genes:
@@ -173,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
@@ -185,18 +200,31 @@ def run(self, data_file: Optional[str] = None) -> None:
f"ENSEMBL:{protein}",
"RO:0002205",
"NCBI",
] + extra_header
] + self.extra_header
)

# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)
write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type, "", "", self.source_name]
)
# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)

# if we have an equivalent Uniprot ID for this Ensembl protein
# ID make an xref edge, and a node for the Uniprot ID
uniprot_curie = ''
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniprotKB:{string_to_uniprot_id_map[protein]}"

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)


# write edge data
write_node_edge_item(
@@ -240,3 +268,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
header = header_string.strip().split(sep)

return [i.replace('"', '') for i in header]
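To make the new xrefs column concrete, here is a hypothetical node row as the transform would now write it, following the column order in node_header.json and the values asserted in tests/test_string.py below (name, description and alias are left empty by this ingest):

# Illustration only: one STRING protein node row after this change.
# Columns: id, name, category, description, alias, xrefs, provided_by
node_row = [
    "ENSEMBL:ENSP00000000233",  # id: ENSP identifier with the "9606." taxon prefix stripped
    "",                         # name
    "biolink:Protein",          # category (protein_node_type per the module docstring)
    "",                         # description
    "",                         # alias
    "UniprotKB:P84085",         # xrefs: the equivalent UniProtKB accession, when one is found
    "STRING",                   # provided_by (self.source_name)
]
print("\t".join(node_row))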

Binary file not shown.
Binary file added tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
Binary file added tests/resources/string/gene2ensembl.gz
Binary file not shown.
Binary file added tests/resources/string/gene_info.gz
Binary file not shown.
85 changes: 85 additions & 0 deletions tests/test_string.py
@@ -0,0 +1,85 @@
import os
import tempfile
import pandas as pd
from unittest import TestCase, skip

from parameterized import parameterized

from kg_covid_19.transform_utils.string_ppi import StringTransform


class TestString(TestCase):
"""Tests the string ingest"""
def setUp(self) -> None:
self.input_dir = "tests/resources/string/"
self.output_dir = tempfile.gettempdir()
self.string_output_dir = os.path.join(self.output_dir, "STRING")
self.string = StringTransform(self.input_dir, self.output_dir)

@parameterized.expand([
['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
['gene_info_map', dict, '1',
{'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
'description': 'alpha-1-B glycoprotein'}],
['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
])
def test_instance_vars(self, variable, type, key, val):
this_var = getattr(self.string, variable)
self.assertTrue(isinstance(this_var, type))
self.assertTrue(key in this_var)
self.assertTrue(this_var[key], val)

def test_output_dir(self):
self.assertEqual(self.string.output_dir, self.string_output_dir)

def test_input_dir(self):
self.assertEqual(self.string.input_base_dir, self.input_dir)

def test_output_edge_file(self):
self.assertEqual(self.string.output_edge_file,
os.path.join(self.string_output_dir, "edges.tsv"))

def test_output_node_file(self):
self.assertEqual(self.string.output_node_file,
os.path.join(self.string_output_dir, "nodes.tsv"))

def test_source_name(self):
self.assertEqual(self.string.source_name, 'STRING')

def test_run(self):
self.assertTrue(isinstance(self.string.run, object))
self.string.run()
self.assertTrue(os.path.isdir(self.string_output_dir))

def test_nodes_file(self):
self.string.run()
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((10, 7), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))
self.assertCountEqual('UniprotKB:P84085',
node_df.loc[node_df['id'] ==
'ENSEMBL:ENSP00000000233'].xrefs.item())

def test_edges_file(self):
self.string.run()
edge_file = os.path.join(self.string_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
self.assertEqual((9, 19), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
'combined_score', 'neighborhood', 'neighborhood_transferred',
'fusion', 'cooccurence', 'homology', 'coexpression',
'coexpression_transferred', 'experiments',
'experiments_transferred', 'database', 'database_transferred',
'textmining', 'textmining_transferred', ],
list(edge_df.columns))
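One way to exercise these tests locally (assuming the repository root is the working directory and that pandas and parameterized are installed; pytest would work just as well):

# Discover and run the STRING ingest tests with the standard library runner.
import unittest

suite = unittest.defaultTestLoader.discover("tests", pattern="test_string.py")
unittest.TextTestRunner(verbosity=2).run(suite)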
