Skip to content

Commit

Permalink
Merge pull request #231 from Knowledge-Graph-Hub/fix_sars_cov2_names_symbols
Browse files Browse the repository at this point in the history

Fix names and symbols for SARS-CoV-2 gene annotation ingest
  • Loading branch information
deepakunni3 committed Jun 16, 2020
2 parents 0972358 + dd5ed4b commit c1fac28
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def __init__(self, input_dir: Optional[str] = None, output_dir: str = None):
source_name = "sars_cov_2_gene_annot"
super().__init__(source_name, input_dir, output_dir)

self.node_header = ['id', 'name', 'category', 'synonym', 'in_taxon', 'provided_by']
self.node_header = ['id', 'name', 'category', 'full_name', 'synonym',
'in_taxon', 'provided_by']
self.edge_header = ['subject', 'edge_label', 'object', 'relation',
'provided_by', 'DB_References', 'ECO_code', 'With',
'Interacting_taxon_ID',
Expand Down Expand Up @@ -111,26 +112,41 @@ def gpi_to_gene_node_data(self, rec: dict) -> list:
:param rec: record from gpi iterator
:return: list of node items, one for each thing in self.node_header
"""
# ['id', 'name', 'category', 'synonym', 'in_taxon']
# ['id', 'name', 'category', 'full_name', 'synonym', 'in_taxon', 'provided_by']
id: str = self._rec_to_id(rec)

try:
name_list = get_item_by_priority(rec, ['DB_Object_Name'])
if name_list is not None and len(name_list) > 0:
name = name_list[0]
full_name = name_list[0]
if len(name_list) > 1:
logging.warning(
"Found >1 DB_Object_Name in rec, using the first one")
else:
full_name = ''
except (IndexError, ItemInDictNotFound):
full_name = ''

try:
symbol_list = get_item_by_priority(rec, ['DB_Object_Symbol'])
if symbol_list is not None and len(symbol_list) > 0:
name = symbol_list[0]
if len(symbol_list) > 1:
logging.warning(
"Found >1 DB_Object_Symbol in rec, using the first one")
else:
name = ''
except (IndexError, ItemInDictNotFound):
name = ''
full_name = ''

category = self.protein_node_type
try:
synonym = get_item_by_priority(rec, ['DB_Object_Synonym'])[0]
synonym = get_item_by_priority(rec, ['DB_Object_Synonym'])
except (IndexError, ItemInDictNotFound):
synonym = ''
taxon = get_item_by_priority(rec, ['Taxon'])
taxon = ":".join([self.ncbi_taxon_prefix, taxon.split(":")[1]])
return [id, name, category, synonym, taxon, self.source_name]
return [id, name, category, full_name, synonym, taxon, self.source_name]


def _gpi12iterator(handle: TextIO) -> Generator:
Expand Down
3 changes: 3 additions & 0 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Any, Dict, List, Union, TextIO
from tqdm import tqdm # type: ignore


class TransformError(Exception):
    """Base class for other exceptions."""
Expand Down Expand Up @@ -102,6 +103,7 @@ def get_item_by_priority(items_dict: dict, keys_by_priority: list) -> str:
raise ItemInDictNotFound("Can't find item in items_dict {}".format(items_dict))
return value


def data_to_dict(these_keys, these_values) -> dict:
"""Zip up two lists to make a dict
Expand All @@ -111,6 +113,7 @@ def data_to_dict(these_keys, these_values) -> dict:
"""
return dict(zip(these_keys, these_values))


def uniprot_make_name_to_id_mapping(dat_gz_file: str) -> dict:
"""Given a Uniprot dat.gz file, like this:
ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
Expand Down
13 changes: 9 additions & 4 deletions tests/test_sars_cov_2_gene_annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
_gpi12iterator, _gpa11iterator


class TestPharmGKB(TestCase):
"""Tests the ttd transform"""
class TestSarsGeneAnnot(TestCase):
"""Tests the SARS-CoV-2 gene annotation transform"""

def setUp(self) -> None:
self.sc2ga = SARSCoV2GeneAnnot()
Expand Down Expand Up @@ -43,8 +43,13 @@ def test_gpi_to_gene_node(self):
item = next(gpi_iter)
node = self.sc2ga.gpi_to_gene_node_data(item)
self.assertEqual(len(self.sc2ga.node_header), len(node))
self.assertEqual(node,
['UniProtKB:P0DTD2', 'Protein 9b', 'biolink:Protein', '', 'NCBITaxon:2697049', 'sars_cov_2_gene_annot'])
self.assertEqual(node, ['UniProtKB:P0DTD2',
'P0DTD2',
'biolink:Protein',
'Protein 9b',
'',
'NCBITaxon:2697049',
'sars_cov_2_gene_annot'])

def test_gpa_to_edge_data(self):
gpa_iter = _gpa11iterator(self.gpa_fh)
Expand Down

0 comments on commit c1fac28

Please sign in to comment.