Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make xrefs between Ensembl IDs and Uniprot IDs in STRING ingest #237

Merged
merged 10 commits into from
Jun 18, 2020
Merged
7 changes: 7 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@
url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
local_name: gene_info.gz

-
# this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
# to be ID mapped in merge step
# nb: we are also downloading and using this file for the TTD transform
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz

#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
Expand Down
3 changes: 2 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
]
67 changes: 48 additions & 19 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import gzip
import logging
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set, Optional
from typing import Dict, List, Any, Set, Optional, IO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority

from encodeproject import download as encode_download # type: ignore
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping

"""
Ingest protein-protein interactions from STRING DB.
Expand All @@ -18,18 +18,25 @@
Write node and edge headers that look something like:

Node:
id name category
protein:1234 TBX4 Protein
id name category xrefs provided_by
protein:1234 TBX4 biolink:Protein UniprotKB:123456 STRING

xrefs contains the UniprotKB id for the protein, if available

Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434


"""

NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
GENE_INFO_FILE = 'gene_info.gz'

# make name to id map for uniprot names of human proteins
UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"


class StringTransform(Transform):
"""
Expand All @@ -43,7 +50,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.protein_gene_map: Dict[str, Any] = {}
self.gene_info_map: Dict[str, Any] = {}
self.ensembl2ncbi_map: Dict[str, Any] = {}
logging.info("Loading Ensembl Gene to Protein mapping")
self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
logging.info("Load mappings from NCBI gene_info")
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
Expand Down Expand Up @@ -141,7 +150,11 @@ def run(self, data_file: Optional[str] = None) -> None:

# Required to align the node edge header of the gene
# with the default header
extra_header = [""]*(len(edge_additional_headers)+1)
self.extra_header = [""]*(len(edge_additional_headers)+1)

# make string ENSP to Uniprot id mapping dict
string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
Expand All @@ -155,9 +168,10 @@ def run(self, data_file: Optional[str] = None) -> None:
items_dict = parse_stringdb_interactions(line, header_items)
proteins = []
for protein_name in ('protein1', 'protein2'):
protein = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(protein.split('.')[1:])
nat_string_id = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(nat_string_id.split('.')[1:])
proteins.append(protein)

if protein in self.protein_gene_map:
gene = self.protein_gene_map[protein]
if gene not in seen_genes:
Expand All @@ -173,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
Expand All @@ -185,18 +200,31 @@ def run(self, data_file: Optional[str] = None) -> None:
f"ENSEMBL:{protein}",
"RO:0002205",
"NCBI",
] + extra_header
] + self.extra_header
)

# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)
write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type, "", "", self.source_name]
)
# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)

# if we have an equivalent Uniprot ID for this Ensembl protein
# ID, record it as a CURIE in the xrefs column of the protein node
uniprot_curie = ''
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniprotKB:{string_to_uniprot_id_map[protein]}"

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)


# write edge data
write_node_edge_item(
Expand Down Expand Up @@ -240,3 +268,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
header = header_string.strip().split(sep)

return [i.replace('"', '') for i in header]

Binary file not shown.
Binary file added tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
Binary file added tests/resources/string/gene2ensembl.gz
Binary file not shown.
Binary file added tests/resources/string/gene_info.gz
Binary file not shown.
85 changes: 85 additions & 0 deletions tests/test_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import tempfile
import pandas as pd
from unittest import TestCase, skip

from parameterized import parameterized

from kg_covid_19.transform_utils.string_ppi import StringTransform


class TestString(TestCase):
    """Tests the STRING ingest.

    Each test constructs a ``StringTransform`` over the fixture files in
    ``tests/resources/string/`` and writes its output below the system
    temporary directory.
    """

    def setUp(self) -> None:
        """Build a fresh transform pointing at the test fixtures."""
        self.input_dir = "tests/resources/string/"
        self.output_dir = tempfile.gettempdir()
        self.string_output_dir = os.path.join(self.output_dir, "STRING")
        self.string = StringTransform(self.input_dir, self.output_dir)

    @parameterized.expand([
        ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
        ['gene_info_map', dict, '1',
         {'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
          'description': 'alpha-1-B glycoprotein'}],
        ['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
    ])
    def test_instance_vars(self, variable, type, key, val):
        """Check that each mapping attribute exists, has the right type and
        contains the expected key."""
        this_var = getattr(self.string, variable)
        self.assertIsInstance(this_var, type)
        self.assertIn(key, this_var)
        # NOTE(review): `val` is passed as assertTrue's *msg* argument, so the
        # expected value is never compared -- only the truthiness of the
        # stored value is checked. Switching to assertEqual needs the expected
        # values confirmed first (e.g. whether the NCBI gene id is stored as
        # the int 1 or the str '1').
        self.assertTrue(this_var[key], val)

    def test_output_dir(self):
        """The transform writes into a STRING subdirectory of output_dir."""
        self.assertEqual(self.string.output_dir, self.string_output_dir)

    def test_input_dir(self):
        """The transform keeps the input directory it was given."""
        self.assertEqual(self.string.input_base_dir, self.input_dir)

    def test_output_edge_file(self):
        """Edges are written to edges.tsv in the STRING output directory."""
        self.assertEqual(self.string.output_edge_file,
                         os.path.join(self.string_output_dir, "edges.tsv"))

    def test_output_node_file(self):
        """Nodes are written to nodes.tsv in the STRING output directory."""
        self.assertEqual(self.string.output_node_file,
                         os.path.join(self.string_output_dir, "nodes.tsv"))

    def test_source_name(self):
        """The transform identifies its source as STRING."""
        self.assertEqual(self.string.source_name, 'STRING')

    def test_run(self):
        """run() is callable and creates the output directory."""
        # isinstance(x, object) is true for every value, so check
        # callability instead.
        self.assertTrue(callable(self.string.run))
        self.string.run()
        self.assertTrue(os.path.isdir(self.string_output_dir))

    def test_nodes_file(self):
        """run() writes a nodes file with the expected shape, columns, ids
        and UniprotKB xref."""
        self.string.run()
        node_file = os.path.join(self.string_output_dir, "nodes.tsv")
        self.assertTrue(os.path.isfile(node_file))
        node_df = pd.read_csv(node_file, sep="\t", header=0)
        self.assertEqual((10, 7), node_df.shape)
        self.assertEqual(['id', 'name', 'category', 'description', 'alias',
                          'xrefs', 'provided_by'],
                         list(node_df.columns))
        self.assertCountEqual(['ENSEMBL:ENSP00000000233',
                               'ENSEMBL:ENSP00000272298',
                               'ENSEMBL:ENSP00000253401',
                               'ENSEMBL:ENSP00000401445',
                               'ENSEMBL:ENSP00000418915',
                               'ENSEMBL:ENSP00000327801',
                               'ENSEMBL:ENSP00000466298',
                               'ENSEMBL:ENSP00000232564',
                               'ENSEMBL:ENSP00000393379',
                               'ENSEMBL:ENSP00000371253'],
                              list(node_df.id.unique()))
        # assertCountEqual on two strings only compares their characters as
        # unordered multisets; the xref must match exactly.
        self.assertEqual('UniprotKB:P84085',
                         node_df.loc[node_df['id'] ==
                                     'ENSEMBL:ENSP00000000233'].xrefs.item())

    def test_edges_file(self):
        """run() writes an edges file with the expected shape and columns."""
        self.string.run()
        edge_file = os.path.join(self.string_output_dir, "edges.tsv")
        self.assertTrue(os.path.isfile(edge_file))
        edge_df = pd.read_csv(edge_file, sep="\t", header=0)
        self.assertEqual((9, 19), edge_df.shape)
        self.assertEqual(['subject', 'edge_label', 'object', 'relation',
                          'provided_by', 'combined_score', 'neighborhood',
                          'neighborhood_transferred', 'fusion', 'cooccurence',
                          'homology', 'coexpression',
                          'coexpression_transferred', 'experiments',
                          'experiments_transferred', 'database',
                          'database_transferred', 'textmining',
                          'textmining_transferred', ],
                         list(edge_df.columns))