Skip to content

Commit

Permalink
Merge 333026a into 1a0a7ba
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Jun 27, 2020
2 parents 1a0a7ba + 333026a commit 688cbca
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 3 deletions.
2 changes: 1 addition & 1 deletion download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
-
# Gene Product information for SARS-CoV-2 genes in GPI format
# http://geneontology.org/docs/gene-product-information-gpi-format/
url: ftp://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_sars-cov-2.gpi
url: https://raw.githubusercontent.com/Knowledge-Graph-Hub/kg-covid-19/master/curated/ORFs/uniprot_sars-cov-2.gpi
local_name: uniprot_sars-cov-2.gpi

#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from typing import Generator, TextIO, List, Optional

from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound
from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound, guess_bl_category

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils import write_node_edge_item
Expand Down Expand Up @@ -54,15 +54,27 @@ def run(self, data_file: str = None):
# write headers
node.write("\t".join(self.node_header) + "\n")
edge.write("\t".join(self.edge_header) + "\n")

seen = set()
with open(gpi_file, 'r') as gpi_fh:
for rec in _gpi12iterator(gpi_fh):
node_data = self.gpi_to_gene_node_data(rec)
seen.add(node_data[0])
write_node_edge_item(node, self.node_header, node_data)

with open(gpa_file, 'r') as gpa_fh:
for rec in _gpa11iterator(gpa_fh):
edge_data = self.gpa_to_edge_data(rec)
subject_node = edge_data[0]
if subject_node not in seen:
subject_node_data = [subject_node, guess_bl_category(subject_node)] + [""] * 4 + [self.source_name]
write_node_edge_item(node, self.node_header, subject_node_data)
seen.add(subject_node)
object_node = edge_data[2]
if object_node not in seen:
object_node_data = [object_node, guess_bl_category(object_node)] + [""] * 4 + [self.source_name]
write_node_edge_item(node, self.node_header, object_node_data)
seen.add(object_node)

write_node_edge_item(edge, self.edge_header, edge_data)

def gpa_to_edge_data(self, rec: dict) -> list:
Expand Down
22 changes: 22 additions & 0 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,25 @@ def parse_header(header_string: str, sep: str = '\t') -> List:
def unzip_to_tempdir(zip_file_name: str, tempdir: str) -> None:
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file_name: Path of the zip archive to read.
        tempdir: Directory that receives the extracted members.

    Returns:
        None
    """
    archive = zipfile.ZipFile(zip_file_name, 'r')
    # The context manager closes the archive even if extraction raises.
    with archive:
        archive.extractall(tempdir)


def guess_bl_category(identifier: str) -> str:
    """Guess the Biolink category of an identifier from its prefix.

    Note: This is a temporary solution and should not be used long term.

    Args:
        identifier: A CURIE; the text before the first ':' is used as the
            prefix.

    Returns:
        'biolink:Protein' for the UniProtKB and ComplexPortal prefixes,
        'biolink:OntologyClass' for the GO prefix, and 'biolink:NamedThing'
        for anything else.
    """
    category_by_prefix = {
        'UniProtKB': 'biolink:Protein',
        'ComplexPortal': 'biolink:Protein',
        'GO': 'biolink:OntologyClass',
    }
    # When no ':' is present the whole string is treated as the prefix.
    curie_prefix, _, _ = identifier.partition(':')
    return category_by_prefix.get(curie_prefix, 'biolink:NamedThing')
17 changes: 17 additions & 0 deletions tests/test_transform_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import unittest
from parameterized import parameterized
from kg_covid_19.utils.transform_utils import guess_bl_category


class TestTransformUtils(unittest.TestCase):
    """Unit tests for helpers imported from kg_covid_19.utils.transform_utils."""

    @parameterized.expand([
        # Bare prefixes with no ':' separator.
        ['', 'biolink:NamedThing'],
        ['UniProtKB', 'biolink:Protein'],
        ['ComplexPortal', 'biolink:Protein'],
        ['GO', 'biolink:OntologyClass'],
        # Full CURIEs (prefix:local_id), the input form the function's
        # docstring describes.
        ['UniProtKB:P0DTC2', 'biolink:Protein'],
        ['ComplexPortal:CPX-1', 'biolink:Protein'],
        ['GO:0008150', 'biolink:OntologyClass'],
        # Unrecognised prefixes fall back to the generic category.
        ['CHEBI:15377', 'biolink:NamedThing'],
    ])
    def test_guess_bl_category(self, curie, category):
        """Each identifier maps to the expected Biolink category."""
        self.assertEqual(category, guess_bl_category(curie))



0 comments on commit 688cbca

Please sign in to comment.