Merge e388719 into c1fac28
justaddcoffee authored Jun 17, 2020
2 parents c1fac28 + e388719 commit ce16430
Showing 7 changed files with 139 additions and 14 deletions.
7 changes: 7 additions & 0 deletions download.yaml
@@ -54,6 +54,13 @@
url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
local_name: gene_info.gz

-
# this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
# to be ID mapped in merge step
# nb: we are also downloading and using this file for the TTD transform
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz

#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
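For context on the new download entry above: HUMAN_9606_idmapping.dat.gz is UniProt's standard three-column mapping file (UniProtKB accession, id type, mapped id), and the transform changed below builds an ENSP-to-UniProtKB lookup from it via uniprot_make_name_to_id_mapping. A minimal sketch of that kind of lookup, assuming the standard file layout and the Ensembl_PRO id type; the helper name here is hypothetical, not the repository's function:

import gzip
from typing import Dict


def make_ensp_to_uniprot_map(idmapping_path: str) -> Dict[str, str]:
    """Map Ensembl protein (ENSP) ids to UniProtKB accessions.

    Assumes the standard UniProt idmapping layout: three tab-separated
    columns (UniProtKB accession, id type, mapped id).
    """
    ensp_to_uniprot: Dict[str, str] = {}
    with gzip.open(idmapping_path, 'rt') as fh:
        for line in fh:
            parts = line.rstrip('\n').split('\t')
            if len(parts) != 3:
                continue
            uniprot_ac, id_type, other_id = parts
            if id_type == 'Ensembl_PRO':
                # drop any version suffix, e.g. ENSP00000000233.2 -> ENSP00000000233
                ensp_to_uniprot[other_id.split('.')[0]] = uniprot_ac
    return ensp_to_uniprot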
64 changes: 50 additions & 14 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,12 +1,12 @@
import gzip
import logging
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set, Optional
from typing import Dict, List, Any, Set, Optional, IO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority

from encodeproject import download as encode_download # type: ignore
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping

"""
Ingest protein-protein interactions from STRING DB.
@@ -24,12 +24,20 @@
Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434
Also write edges that create xrefs between ENSP ids and UniprotKB ids, like so:
subject edge_label object relation
protein:1234 biolink:xrefs uniprot:4567 biolink:xrefs
"""

NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
GENE_INFO_FILE = 'gene_info.gz'

# make name to id map for uniprot names of human proteins
UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"


class StringTransform(Transform):
"""
Expand All @@ -43,7 +51,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.protein_gene_map: Dict[str, Any] = {}
self.gene_info_map: Dict[str, Any] = {}
self.ensembl2ncbi_map: Dict[str, Any] = {}
logging.info("Loading Ensembl Gene to Protein mapping")
self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
logging.info("Load mappings from NCBI gene_info")
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
@@ -141,7 +151,11 @@ def run(self, data_file: Optional[str] = None) -> None:

# Required to align the node edge header of the gene
# with the default header
extra_header = [""]*(len(edge_additional_headers)+1)
self.extra_header = [""]*(len(edge_additional_headers)+1)

# make string ENSP to Uniprot id mapping dict
string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))
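        # the resulting dict is expected to map bare ENSP ids to UniProtKB
        # accessions, e.g. {'ENSP00000000233': 'P84085', ...} (ids as in the
        # test fixtures added below)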

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
@@ -155,9 +169,10 @@ def run(self, data_file: Optional[str] = None) -> None:
items_dict = parse_stringdb_interactions(line, header_items)
proteins = []
for protein_name in ('protein1', 'protein2'):
protein = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(protein.split('.')[1:])
nat_string_id = get_item_by_priority(items_dict, [protein_name])
protein = '.'.join(nat_string_id.split('.')[1:])
proteins.append(protein)

if protein in self.protein_gene_map:
gene = self.protein_gene_map[protein]
if gene not in seen_genes:
@@ -185,18 +200,38 @@ def run(self, data_file: Optional[str] = None) -> None:
f"ENSEMBL:{protein}",
"RO:0002205",
"NCBI",
] + extra_header
] + self.extra_header
)

# write node data
if protein not in seen_proteins:
    seen_proteins.add(protein)
    write_node_edge_item(
        fh=node,
        header=self.node_header,
        data=[f"ENSEMBL:{protein}", "",
              protein_node_type, "", "", self.source_name]
    )

# if we have an equivalent Uniprot ID for this Ensembl protein
# ID make an xref edge, and a node for the Uniprot ID
if protein in string_to_uniprot_id_map:
    uniprot_curie = \
        f"UniprotKB:{string_to_uniprot_id_map[protein]}"
    write_node_edge_item(
        fh=node,
        header=self.node_header,
        data=[uniprot_curie, "",
              protein_node_type, "", "", self.source_name])
    write_node_edge_item(
        fh=edge,
        header=self.edge_header,
        data=[f"ENSEMBL:{protein}",
              "biolink:xrefs",
              uniprot_curie,
              "biolink:xrefs",
              "uniprot",
              ] + self.extra_header)
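    # e.g., with ids from the test fixtures below, a mapped protein yields a
    # node row for "UniprotKB:P84085" plus an edge row
    # "ENSEMBL:ENSP00000000233  biolink:xrefs  UniprotKB:P84085  biolink:xrefs  uniprot"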

# write edge data
write_node_edge_item(
@@ -240,3 +275,4 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
header = header_string.strip().split(sep)

return [i.replace('"', '') for i in header]
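For context, a minimal end-to-end sketch of running the updated transform; the import path and constructor signature are the ones exercised in the new tests below, while the directory names are placeholders. The input directory is assumed to hold the STRING protein-links dump alongside gene2ensembl.gz, gene_info.gz and HUMAN_9606_idmapping.dat.gz; output is written to <output_dir>/STRING/nodes.tsv and edges.tsv:

from kg_covid_19.transform_utils.string_ppi import StringTransform

# placeholder directories; see tests/resources/string/ for the expected input files
transform = StringTransform(input_dir="data/raw", output_dir="data/transformed")
transform.run()  # writes nodes.tsv and edges.tsv under data/transformed/STRING/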

Binary file added tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file added tests/resources/string/gene2ensembl.gz
Binary file added tests/resources/string/gene_info.gz
82 changes: 82 additions & 0 deletions tests/test_string.py
@@ -0,0 +1,82 @@
import os
import tempfile
import pandas as pd
from unittest import TestCase, skip

from parameterized import parameterized

from kg_covid_19.transform_utils.string_ppi import StringTransform


class TestString(TestCase):
"""Tests the string ingest"""
def setUp(self) -> None:
self.input_dir = "tests/resources/string/"
self.output_dir = tempfile.gettempdir()
self.string_output_dir = os.path.join(self.output_dir, "STRING")
self.string = StringTransform(self.input_dir, self.output_dir)

@parameterized.expand([
['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
['gene_info_map', dict, '1',
{'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
'description': 'alpha-1-B glycoprotein'}],
['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
])
def test_instance_vars(self, variable, type, key, val):
this_var = getattr(self.string, variable)
self.assertTrue(isinstance(this_var, type))
self.assertTrue(key in this_var)
self.assertEqual(this_var[key], val)

def test_output_dir(self):
self.assertEqual(self.string.output_dir, self.string_output_dir)

def test_input_dir(self):
self.assertEqual(self.string.input_base_dir, self.input_dir)

def test_output_edge_file(self):
self.assertEqual(self.string.output_edge_file,
os.path.join(self.string_output_dir, "edges.tsv"))

def test_output_node_file(self):
self.assertEqual(self.string.output_node_file,
os.path.join(self.string_output_dir, "nodes.tsv"))

def test_source_name(self):
self.assertEqual(self.string.source_name, 'STRING')

def test_run(self):
self.assertTrue(isinstance(self.string.run, object))
self.string.run()
self.assertTrue(os.path.isdir(self.string_output_dir))

def test_nodes_file(self):
self.string.run()
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((11, 6), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))

def test_edges_file(self):
self.string.run()
edge_file = os.path.join(self.string_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
self.assertEqual((10, 19), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
'combined_score', 'neighborhood', 'neighborhood_transferred',
'fusion', 'cooccurence', 'homology', 'coexpression',
'coexpression_transferred', 'experiments',
'experiments_transferred', 'database', 'database_transferred',
'textmining', 'textmining_transferred', ],
list(edge_df.columns))
