Commit
Merge pull request #237 from Knowledge-Graph-Hub/issue_235_string_id_norm

Make xrefs between Ensembl IDs and Uniprot IDs in STRING ingest
deepakunni3 committed Jun 18, 2020
2 parents d24e6d8 + cf1672d commit e6d1f87
Showing 8 changed files with 142 additions and 20 deletions.
7 changes: 7 additions & 0 deletions download.yaml
@@ -54,6 +54,13 @@
url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
local_name: gene_info.gz

-
# this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
# to be ID mapped in merge step
# nb: we are also downloading and using this file for the TTD transform
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz

#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
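For context on the new download: HUMAN_9606_idmapping.dat.gz is UniProt's tab-separated mapping file of (UniProtKB accession, ID type, external ID) triples; rows whose ID type is Ensembl_PRO (or STRING) tie a UniProtKB accession to an ENSP identifier. The sketch below is a hypothetical illustration of how an ENSP-to-UniProtKB lookup could be built from that file; the transform itself calls uniprot_make_name_to_id_mapping from kg_covid_19.utils.transform_utils, whose exact behavior may differ.

import gzip
from typing import Dict

def ensp_to_uniprot_map(idmapping_path: str) -> Dict[str, str]:
    """Hypothetical sketch: build an ENSP -> UniProtKB accession lookup from the
    three-column UniProt idmapping file (accession, id_type, external_id)."""
    mapping: Dict[str, str] = {}
    with gzip.open(idmapping_path, "rt") as fh:
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 3:
                continue
            accession, id_type, external_id = fields
            if id_type == "Ensembl_PRO":
                # e.g. "P84085\tEnsembl_PRO\tENSP00000000233"
                mapping[external_id] = accession
    return mapping

# Usage (the local path is an assumption; use wherever download.yaml places the file):
# ensp_map = ensp_to_uniprot_map("data/raw/HUMAN_9606_idmapping.dat.gz")
# ensp_map.get("ENSP00000000233")  # -> "P84085", per the test fixture added below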
3 changes: 2 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
@@ -4,5 +4,6 @@
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
]
67 changes: 48 additions & 19 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,12 +1,12 @@
import gzip
import logging
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set, Optional
from typing import Dict, List, Any, Set, Optional, IO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority

from encodeproject import download as encode_download # type: ignore
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping

"""
Ingest protein-protein interactions from STRING DB.
@@ -18,18 +18,25 @@
Write node and edge headers that look something like:
Node:
id name category
protein:1234 TBX4 Protein
id name category xrefs provided_by
protein:1234 TBX4 biolink:Protein UniprotKB:123456 STRING
xrefs contains the UniprotKB id for the protein, if available
Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434
"""

NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
GENE_INFO_FILE = 'gene_info.gz'

# make name to id map for uniprot names of human proteins
UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"


class StringTransform(Transform):
"""
@@ -43,7 +50,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.protein_gene_map: Dict[str, Any] = {}
self.gene_info_map: Dict[str, Any] = {}
self.ensembl2ncbi_map: Dict[str, Any] = {}
logging.info("Loading Ensembl Gene to Protein mapping")
self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
logging.info("Load mappings from NCBI gene_info")
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
@@ -141,7 +150,11 @@ def run(self, data_file: Optional[str] = None) -> None:

# Required to align the node edge header of the gene
# with the default header
extra_header = [""]*(len(edge_additional_headers)+1)
self.extra_header = [""]*(len(edge_additional_headers)+1)

# make string ENSP to Uniprot id mapping dict
string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
@@ -155,9 +168,10 @@ def run(self, data_file: Optional[str] = None) -> None:
items_dict = parse_stringdb_interactions(line, header_items)
proteins = []
for protein_name in ('protein1', 'protein2'):
protein = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(protein.split('.')[1:])
nat_string_id = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(nat_string_id.split('.')[1:])
proteins.append(protein)

if protein in self.protein_gene_map:
gene = self.protein_gene_map[protein]
if gene not in seen_genes:
@@ -173,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
@@ -185,18 +200,31 @@ def run(self, data_file: Optional[str] = None) -> None:
f"ENSEMBL:{protein}",
"RO:0002205",
"NCBI",
] + extra_header
] + self.extra_header
)

# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)
write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type, "", "", self.source_name]
)
# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)

# if we have an equivalent Uniprot ID for this Ensembl protein
# ID make an xref edge, and a node for the Uniprot ID
uniprot_curie = ''
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniprotKB:{string_to_uniprot_id_map[protein]}"

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)


# write edge data
write_node_edge_item(
@@ -240,3 +268,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
header = header_string.strip().split(sep)

return [i.replace('"', '') for i in header]
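To make the new xrefs column concrete, here is a hypothetical node row as the transform would now write it, following the column order in node_header.json and the values asserted in tests/test_string.py below (name, description and alias are left empty by this ingest):

# Illustration only: one STRING protein node row after this change.
# Columns: id, name, category, description, alias, xrefs, provided_by
node_row = [
    "ENSEMBL:ENSP00000000233",  # id: ENSP identifier with the "9606." taxon prefix stripped
    "",                         # name
    "biolink:Protein",          # category (protein_node_type per the module docstring)
    "",                         # description
    "",                         # alias
    "UniprotKB:P84085",         # xrefs: the equivalent UniProtKB accession, when one is found
    "STRING",                   # provided_by (self.source_name)
]
print("\t".join(node_row))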

Binary file not shown.
Binary file added tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
Binary file added tests/resources/string/gene2ensembl.gz
Binary file not shown.
Binary file added tests/resources/string/gene_info.gz
Binary file not shown.
85 changes: 85 additions & 0 deletions tests/test_string.py
@@ -0,0 +1,85 @@
import os
import tempfile
import pandas as pd
from unittest import TestCase, skip

from parameterized import parameterized

from kg_covid_19.transform_utils.string_ppi import StringTransform


class TestString(TestCase):
"""Tests the string ingest"""
def setUp(self) -> None:
self.input_dir = "tests/resources/string/"
self.output_dir = tempfile.gettempdir()
self.string_output_dir = os.path.join(self.output_dir, "STRING")
self.string = StringTransform(self.input_dir, self.output_dir)

@parameterized.expand([
['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
['gene_info_map', dict, '1',
{'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
'description': 'alpha-1-B glycoprotein'}],
['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
])
def test_instance_vars(self, variable, type, key, val):
this_var = getattr(self.string, variable)
self.assertTrue(isinstance(this_var, type))
self.assertTrue(key in this_var)
self.assertTrue(this_var[key], val)

def test_output_dir(self):
self.assertEqual(self.string.output_dir, self.string_output_dir)

def test_input_dir(self):
self.assertEqual(self.string.input_base_dir, self.input_dir)

def test_output_edge_file(self):
self.assertEqual(self.string.output_edge_file,
os.path.join(self.string_output_dir, "edges.tsv"))

def test_output_node_file(self):
self.assertEqual(self.string.output_node_file,
os.path.join(self.string_output_dir, "nodes.tsv"))

def test_source_name(self):
self.assertEqual(self.string.source_name, 'STRING')

def test_run(self):
self.assertTrue(isinstance(self.string.run, object))
self.string.run()
self.assertTrue(os.path.isdir(self.string_output_dir))

def test_nodes_file(self):
self.string.run()
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((10, 7), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))
self.assertCountEqual('UniprotKB:P84085',
node_df.loc[node_df['id'] ==
'ENSEMBL:ENSP00000000233'].xrefs.item())

def test_edges_file(self):
self.string.run()
edge_file = os.path.join(self.string_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
self.assertEqual((9, 19), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
'combined_score', 'neighborhood', 'neighborhood_transferred',
'fusion', 'cooccurence', 'homology', 'coexpression',
'coexpression_transferred', 'experiments',
'experiments_transferred', 'database', 'database_transferred',
'textmining', 'textmining_transferred', ],
list(edge_df.columns))
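One way to exercise these tests locally (assuming the repository root is the working directory and that pandas and parameterized are installed; pytest would work just as well):

# Discover and run the STRING ingest tests with the standard library runner.
import unittest

suite = unittest.defaultTestLoader.discover("tests", pattern="test_string.py")
unittest.TextTestRunner(verbosity=2).run(suite)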
