Skip to content

Commit

Permalink
Merge 4afc529 into c1fac28
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee authored Jun 17, 2020
2 parents c1fac28 + 4afc529 commit 4b0bd2f
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 8 deletions.
7 changes: 7 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@
url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
local_name: gene_info.gz

-
# this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
# to be ID mapped in merge step
# nb: we are also downloading and using this file for the TTD transform
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz

#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
Expand Down
53 changes: 45 additions & 8 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import gzip
import logging
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set, Optional
from typing import Dict, List, Any, Set, Optional, IO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority

from encodeproject import download as encode_download # type: ignore
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping

"""
Ingest protein-protein interactions from STRING DB.
Expand All @@ -24,12 +24,20 @@
Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434
Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so:
subject edge_label object relation
ENSEMBL:1234 biolink:xrefs UniprotKB:4567 biolink:xrefs
"""

NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
GENE_INFO_FILE = 'gene_info.gz'

# make name to id map for uniprot names of human proteins
UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"


class StringTransform(Transform):
"""
Expand All @@ -43,7 +51,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.protein_gene_map: Dict[str, Any] = {}
self.gene_info_map: Dict[str, Any] = {}
self.ensembl2ncbi_map: Dict[str, Any] = {}
logging.info("Loading Ensembl Gene to Protein mapping")
self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
logging.info("Load mappings from NCBI gene_info")
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
Expand Down Expand Up @@ -141,7 +151,11 @@ def run(self, data_file: Optional[str] = None) -> None:

# Required to align the node edge header of the gene
# with the default header
extra_header = [""]*(len(edge_additional_headers)+1)
self.extra_header = [""]*(len(edge_additional_headers)+1)

# make string ENSP to Uniprot id mapping dict
string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
Expand All @@ -155,9 +169,10 @@ def run(self, data_file: Optional[str] = None) -> None:
items_dict = parse_stringdb_interactions(line, header_items)
proteins = []
for protein_name in ('protein1', 'protein2'):
protein = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(protein.split('.')[1:])
nat_string_id = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(nat_string_id.split('.')[1:])
proteins.append(protein)

if protein in self.protein_gene_map:
gene = self.protein_gene_map[protein]
if gene not in seen_genes:
Expand Down Expand Up @@ -185,7 +200,7 @@ def run(self, data_file: Optional[str] = None) -> None:
f"ENSEMBL:{protein}",
"RO:0002205",
"NCBI",
] + extra_header
] + self.extra_header
)

# write node data
Expand All @@ -198,6 +213,26 @@ def run(self, data_file: Optional[str] = None) -> None:
protein_node_type, "", "", self.source_name]
)

# if we have an equivalent Uniprot ID for this Ensembl protein
# ID make an xref edge, and a node for the Uniprot ID
if nat_string_id in string_to_uniprot_id_map:
uniprot_curie = \
f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
write_node_edge_item(
fh=node,
header=self.node_header,
data=[uniprot_curie, "",
protein_node_type, "", "", self.source_name])
write_node_edge_item(
fh=edge,
header=self.edge_header,
data=[f"ENSEMBL:{protein}",
"biolink:xrefs",
uniprot_curie,
"biolink:xrefs",
"uniprot",
] + self.extra_header)

# write edge data
write_node_edge_item(
fh=edge,
Expand All @@ -210,6 +245,7 @@ def run(self, data_file: Optional[str] = None) -> None:
)



def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict:
"""Methods processes a line of text from Drug Central.
Expand Down Expand Up @@ -240,3 +276,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
header = header_string.strip().split(sep)

return [i.replace('"', '') for i in header]

Binary file not shown.
Binary file added tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
Binary file added tests/resources/string/gene2ensembl.gz
Binary file not shown.
Binary file added tests/resources/string/gene_info.gz
Binary file not shown.
59 changes: 59 additions & 0 deletions tests/test_string.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import tempfile
from unittest import TestCase, skip

from parameterized import parameterized

from kg_covid_19.transform_utils.string_ppi import StringTransform


class TestString(TestCase):
    """Tests the STRING ingest (``StringTransform``).

    A single transform instance is built once for the whole class from the
    fixture files under ``tests/resources/string/`` and writes its output
    below the system temporary directory.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Create the shared ``StringTransform`` used by every test."""
        cls.input_dir = "tests/resources/string/"
        cls.output_dir = tempfile.gettempdir()
        # The tests below expect the transform to write into a "STRING"
        # sub-directory of the output directory it is given.
        cls.string_output_dir = os.path.join(cls.output_dir, "STRING")
        cls.string = StringTransform(cls.input_dir, cls.output_dir)

    def setUp(self) -> None:
        pass

    @parameterized.expand([
        ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
        ['gene_info_map', dict, '1',
         {'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
          'description': 'alpha-1-B glycoprotein'}],
        ['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
    ])
    def test_instance_vars(self, variable, type, key, val):
        """Check that a mapping attribute has the expected type, key and value.

        Args:
            variable: name of the attribute on the transform instance.
            type: expected type of that attribute (the name shadows the
                builtin; kept so the signature is unchanged).
            key: a key that must be present in the mapping.
            val: the value expected under ``key``.
        """
        this_var = getattr(self.string, variable)
        self.assertIsInstance(this_var, type)
        self.assertIn(key, this_var)
        # assertTrue(x, val) would treat ``val`` as the failure message and
        # never compare it; assertEqual actually checks the stored value.
        # NOTE(review): the expected values were not being compared before
        # this change -- confirm they match what the transform stores
        # (e.g. whether gene IDs are kept as int or str).
        self.assertEqual(this_var[key], val)

    def test_output_dir(self):
        """The transform's output directory is the STRING sub-directory."""
        self.assertEqual(self.string.output_dir, self.string_output_dir)

    def test_input_dir(self):
        """The transform keeps the input directory it was constructed with."""
        self.assertEqual(self.string.input_base_dir, self.input_dir)

    def test_output_edge_file(self):
        """Edges are written to ``edges.tsv`` in the output directory."""
        self.assertEqual(self.string.output_edge_file,
                         os.path.join(self.string_output_dir, "edges.tsv"))

    def test_output_node_file(self):
        """Nodes are written to ``nodes.tsv`` in the output directory."""
        self.assertEqual(self.string.output_node_file,
                         os.path.join(self.string_output_dir, "nodes.tsv"))

    def test_source_name(self):
        """The transform identifies its source as ``STRING``."""
        self.assertEqual(self.string.source_name, 'STRING')

    def test_run(self):
        """Running the transform produces the output directory and both files."""
        # isinstance(x, object) is true for every value; callable() is the
        # meaningful check that ``run`` can be invoked.
        self.assertTrue(callable(self.string.run))
        self.string.run()
        self.assertTrue(os.path.isdir(self.string_output_dir))
        self.assertTrue(
            os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv")))
        self.assertTrue(
            os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv")))

0 comments on commit 4b0bd2f

Please sign in to comment.