NOTE(review): line structure restored from a whitespace-collapsed copy of this patch.
Leading indentation and the positions of blank context lines were inferred from the
hunk-header line counts; confirm them against the target tree before applying.

diff --git a/download.yaml b/download.yaml
index 0d8ea12f..5a7ceca1 100644
--- a/download.yaml
+++ b/download.yaml
@@ -114,7 +114,7 @@
 -
   # Gene Product information for SARS-CoV-2 genes in GPI format
   # http://geneontology.org/docs/gene-product-information-gpi-format/
-  url: ftp://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_sars-cov-2.gpi
+  url: https://raw.githubusercontent.com/Knowledge-Graph-Hub/kg-covid-19/master/curated/ORFs/uniprot_sars-cov-2.gpi
   local_name: uniprot_sars-cov-2.gpi
 
 #
diff --git a/kg_covid_19/transform_utils/sars_cov_2_gene_annot/sars_cov_2_gene_annot.py b/kg_covid_19/transform_utils/sars_cov_2_gene_annot/sars_cov_2_gene_annot.py
index 4505e94d..8aa13111 100644
--- a/kg_covid_19/transform_utils/sars_cov_2_gene_annot/sars_cov_2_gene_annot.py
+++ b/kg_covid_19/transform_utils/sars_cov_2_gene_annot/sars_cov_2_gene_annot.py
@@ -4,7 +4,7 @@ import os
 
 from typing import Generator, TextIO, List, Optional
 
-from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound
+from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound, guess_bl_category
 from kg_covid_19.transform_utils.transform import Transform
 from kg_covid_19.utils import write_node_edge_item
 
@@ -54,15 +54,27 @@ def run(self, data_file: str = None):
             # write headers
             node.write("\t".join(self.node_header) + "\n")
             edge.write("\t".join(self.edge_header) + "\n")
-
+            seen = set()
             with open(gpi_file, 'r') as gpi_fh:
                 for rec in _gpi12iterator(gpi_fh):
                     node_data = self.gpi_to_gene_node_data(rec)
+                    seen.add(node_data[0])
                     write_node_edge_item(node, self.node_header, node_data)
 
             with open(gpa_file, 'r') as gpa_fh:
                 for rec in _gpa11iterator(gpa_fh):
                     edge_data = self.gpa_to_edge_data(rec)
+                    subject_node = edge_data[0]
+                    if subject_node not in seen:
+                        subject_node_data = [subject_node, guess_bl_category(subject_node)] + [""] * 4 + [self.source_name]
+                        write_node_edge_item(node, self.node_header, subject_node_data)
+                        seen.add(subject_node)
+                    object_node = edge_data[2]
+                    if object_node not in seen:
+                        object_node_data = [object_node, guess_bl_category(object_node)] + [""] * 4 + [self.source_name]
+                        write_node_edge_item(node, self.node_header, object_node_data)
+                        seen.add(object_node)
+
                     write_node_edge_item(edge, self.edge_header, edge_data)
 
     def gpa_to_edge_data(self, rec: dict) -> list:
diff --git a/kg_covid_19/utils/transform_utils.py b/kg_covid_19/utils/transform_utils.py
index 70283cf6..a4deffad 100644
--- a/kg_covid_19/utils/transform_utils.py
+++ b/kg_covid_19/utils/transform_utils.py
@@ -154,3 +154,25 @@ def parse_header(header_string: str, sep: str = '\t') -> List:
 def unzip_to_tempdir(zip_file_name: str, tempdir: str) -> None:
     with zipfile.ZipFile(zip_file_name, 'r') as z:
         z.extractall(tempdir)
+
+
+def guess_bl_category(identifier: str) -> str:
+    """Guess category for a given identifier.
+
+    Note: This is a temporary solution and should not be used long term.
+
+    Args:
+        identifier: A CURIE
+
+    Returns:
+        The category for the given CURIE
+
+    """
+    prefix = identifier.split(':')[0]
+    if prefix in {'UniProtKB', 'ComplexPortal'}:
+        category = 'biolink:Protein'
+    elif prefix in {'GO'}:
+        category = 'biolink:OntologyClass'
+    else:
+        category = 'biolink:NamedThing'
+    return category
diff --git a/tests/test_transform_utils.py b/tests/test_transform_utils.py
new file mode 100644
index 00000000..25c038b7
--- /dev/null
+++ b/tests/test_transform_utils.py
@@ -0,0 +1,17 @@
+import unittest
+from parameterized import parameterized
+from kg_covid_19.utils.transform_utils import guess_bl_category
+
+
+class TestTransformUtils(unittest.TestCase):
+    @parameterized.expand([
+        ['', 'biolink:NamedThing'],
+        ['UniProtKB', 'biolink:Protein'],
+        ['ComplexPortal', 'biolink:Protein'],
+        ['GO', 'biolink:OntologyClass'],
+    ])
+    def test_guess_bl_category(self, curie, category):
+        self.assertEqual(category, guess_bl_category(curie))
+
+
+