From 459b395ee379bb0a7a7fcb76d4786e1560a1137a Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 13 Jul 2020 10:31:40 -0400 Subject: [PATCH 1/7] First pass at go-cam ingest --- download.yaml | 8 ++++ kg_covid_19/transform.py | 1 + .../ontology/ontology_transform.py | 43 +++++++++++++++---- kg_covid_19/utils/transform_utils.py | 13 ++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/download.yaml b/download.yaml index 72d4874a..02376dc2 100644 --- a/download.yaml +++ b/download.yaml @@ -140,3 +140,11 @@ - url: http://current.geneontology.org/ontology/extensions/go-plus.json local_name: go-plus.json + +# +# GO-CAM +# + +- + url: https://github.com/justaddcoffee/go-cam-models-kg-covid-19/raw/master/lifted-go-cams-20200619.xml.gz + local_name: go-cams.rdf.gz diff --git a/kg_covid_19/transform.py b/kg_covid_19/transform.py index c9b69f71..56b24127 100644 --- a/kg_covid_19/transform.py +++ b/kg_covid_19/transform.py @@ -29,6 +29,7 @@ 'GoTransform': OntologyTransform, 'HpTransform': OntologyTransform, 'MondoTransform': OntologyTransform, + 'GoCam': OntologyTransform, 'ChemblTransform': ChemblTransform } diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index 949048f1..fb90d008 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -1,16 +1,22 @@ import os import logging +import tempfile from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer,\ + RdfOwlTransformer # type: ignore + +from kg_covid_19.utils.transform_utils import ungzip_to_tempdir ONTOLOGIES = { 'HpTransform': 'hp.json', 'GoTransform': 'go-plus.json', - 'MondoTransform': 'mondo.json' + 'MondoTransform': 'mondo.json', + 'GoCam': 'go-cams.rdf.gz' } + class OntologyTransform(Transform): """ OntologyTransform parses an Obograph JSON form of an Ontology into nodes nad edges. @@ -19,7 +25,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None): source_name = "ontologies" super().__init__(source_name, input_dir, output_dir) - def run(self, data_file: Optional[str] = None) -> None: + def run(self, data_file: Optional[str] = None, input_format = 'json') -> None: """Method is called and performs needed transformations to process an ontology. @@ -32,28 +38,49 @@ def run(self, data_file: Optional[str] = None) -> None: """ if data_file: k = data_file.split('.')[0] + + if data_file.endswith('.gz'): + input_format = 'rdf/xml' + xml_tempdir = tempfile.mkdtemp() + data_file = ungzip_to_tempdir(os.path.join( + self.input_base_dir, data_file), xml_tempdir) + data_file = os.path.join(self.input_base_dir, data_file) - self.parse(k, data_file, k) + self.parse(k, data_file, k, input_format=input_format) else: # load all ontologies for k in ONTOLOGIES.keys(): + + if data_file.endswith('.gz'): + input_format = 'rdf/xml' + xml_tempdir = tempfile.mkdtemp() + data_file = ungzip_to_tempdir(os.path.join( + self.input_base_dir, data_file), xml_tempdir) + data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) - self.parse(k, data_file, k) + self.parse(k, data_file, k, input_format=input_format) - def parse(self, name: str, data_file: str, source: str) -> None: + def parse(self, name: str, data_file: str, source: str, + input_format: str = 'json') -> None: """Processes the data_file. Args: name: Name of the ontology data_file: data file to parse - source: Source name + source: source name + input_format: format of input file Returns: None. """ logging.info(f"Parsing {data_file}") - transformer = ObographJsonTransformer() + if input_format == 'json': + transformer = ObographJsonTransformer() + elif input_format == 'rdf/xml': + transformer = RdfOwlTransformer() + else: + raise ValueError("No way to parse format {}" % format) transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) diff --git a/kg_covid_19/utils/transform_utils.py b/kg_covid_19/utils/transform_utils.py index 00e670aa..697f86d6 100644 --- a/kg_covid_19/utils/transform_utils.py +++ b/kg_covid_19/utils/transform_utils.py @@ -2,7 +2,10 @@ # -*- coding: utf-8 -*- import gzip import logging +import os import re +import shutil +import tempfile import zipfile from typing import Any, Dict, List, Union from tqdm import tqdm # type: ignore @@ -157,6 +160,16 @@ def unzip_to_tempdir(zip_file_name: str, tempdir: str) -> None: z.extractall(tempdir) +def ungzip_to_tempdir(gzipped_file: str, tempdir: str) -> str: + ungzipped_file = os.path.join(tempdir, os.path.basename(gzipped_file)) + if ungzipped_file.endswith('.gz'): + ungzipped_file = os.path.splitext(ungzipped_file)[0] + + with gzip.open(gzipped_file, 'rb') as f_in, open(ungzipped_file, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + return ungzipped_file + + def guess_bl_category(identifier: str) -> str: """Guess category for a given identifier. From 6dc1e703b4ec12f5ffe2b34e73dfaf39c8e737a4 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 13 Jul 2020 13:34:53 -0400 Subject: [PATCH 2/7] mypy --- .../transform_utils/ontology/ontology_transform.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index fb90d008..e45e33a1 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -4,8 +4,7 @@ from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer,\ - RdfOwlTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer, RdfOwlTransformer # type: ignore from kg_covid_19.utils.transform_utils import ungzip_to_tempdir @@ -51,13 +50,14 @@ def run(self, data_file: Optional[str] = None, input_format = 'json') -> None: # load all ontologies for k in ONTOLOGIES.keys(): + data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) + if data_file.endswith('.gz'): input_format = 'rdf/xml' xml_tempdir = tempfile.mkdtemp() data_file = ungzip_to_tempdir(os.path.join( self.input_base_dir, data_file), xml_tempdir) - data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) self.parse(k, data_file, k, input_format=input_format) def parse(self, name: str, data_file: str, source: str, @@ -80,7 +80,7 @@ def parse(self, name: str, data_file: str, source: str, elif input_format == 'rdf/xml': transformer = RdfOwlTransformer() else: - raise ValueError("No way to parse format {}" % format) + raise ValueError("No way to parse format %s" % format) transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) From 432804ff1a7ad147e53715b56bec4b5806539cd2 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 15 Jul 2020 12:07:49 -0400 Subject: [PATCH 3/7] Change transformer for go-cams back to RdfTransformer() - previously tried RdfOwlTransformer() to see what would happen --- kg_covid_19/transform_utils/ontology/ontology_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index e45e33a1..1d04f16e 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -4,7 +4,7 @@ from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer, RdfOwlTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer, RdfTransformer # type: ignore from kg_covid_19.utils.transform_utils import ungzip_to_tempdir @@ -78,7 +78,7 @@ def parse(self, name: str, data_file: str, source: str, if input_format == 'json': transformer = ObographJsonTransformer() elif input_format == 'rdf/xml': - transformer = RdfOwlTransformer() + transformer = RdfTransformer() else: raise ValueError("No way to parse format %s" % format) transformer.parse(data_file, provided_by=source) From 468aab77da8195c44902eee89f0630f92c427d64 Mon Sep 17 00:00:00 2001 From: Deepak Unni Date: Thu, 6 Aug 2020 17:07:19 -0700 Subject: [PATCH 4/7] Restore OntologyTransform --- .../ontology/ontology_transform.py | 49 ++++--------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index 1d04f16e..7b304575 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -1,18 +1,15 @@ import os -import logging -import tempfile + from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer, RdfTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer # type: ignore -from kg_covid_19.utils.transform_utils import ungzip_to_tempdir ONTOLOGIES = { 'HpTransform': 'hp.json', 'GoTransform': 'go-plus.json', 'MondoTransform': 'mondo.json', - 'GoCam': 'go-cams.rdf.gz' } @@ -24,63 +21,35 @@ def __init__(self, input_dir: str = None, output_dir: str = None): source_name = "ontologies" super().__init__(source_name, input_dir, output_dir) - def run(self, data_file: Optional[str] = None, input_format = 'json') -> None: + def run(self, data_file: Optional[str] = None) -> None: """Method is called and performs needed transformations to process an ontology. - Args: data_file: data file to parse - Returns: None. - """ if data_file: k = data_file.split('.')[0] - - if data_file.endswith('.gz'): - input_format = 'rdf/xml' - xml_tempdir = tempfile.mkdtemp() - data_file = ungzip_to_tempdir(os.path.join( - self.input_base_dir, data_file), xml_tempdir) - data_file = os.path.join(self.input_base_dir, data_file) - self.parse(k, data_file, k, input_format=input_format) + self.parse(k, data_file, k) else: # load all ontologies for k in ONTOLOGIES.keys(): - data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) + self.parse(k, data_file, k) - if data_file.endswith('.gz'): - input_format = 'rdf/xml' - xml_tempdir = tempfile.mkdtemp() - data_file = ungzip_to_tempdir(os.path.join( - self.input_base_dir, data_file), xml_tempdir) - - self.parse(k, data_file, k, input_format=input_format) - - def parse(self, name: str, data_file: str, source: str, - input_format: str = 'json') -> None: + def parse(self, name: str, data_file: str, source: str) -> None: """Processes the data_file. - Args: name: Name of the ontology data_file: data file to parse - source: source name - input_format: format of input file - + source: Source name Returns: None. - """ logging.info(f"Parsing {data_file}") - if input_format == 'json': - transformer = ObographJsonTransformer() - elif input_format == 'rdf/xml': - transformer = RdfTransformer() - else: - raise ValueError("No way to parse format %s" % format) + transformer = ObographJsonTransformer() transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) - output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) + output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) \ No newline at end of file From ba3a5dd741cb7a529a50da254a4a9d4a8657049b Mon Sep 17 00:00:00 2001 From: Deepak Unni Date: Thu, 6 Aug 2020 17:11:15 -0700 Subject: [PATCH 5/7] Add GocamTransform for parsing lifted GO-CAM models --- download.yaml | 2 +- kg_covid_19/transform.py | 5 +- .../gocam_transform/__init__.py | 5 + .../gocam_transform/gocam_transform.py | 101 ++++++++++++++++++ merge.yaml | 5 + 5 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 kg_covid_19/transform_utils/gocam_transform/__init__.py create mode 100644 kg_covid_19/transform_utils/gocam_transform/gocam_transform.py diff --git a/download.yaml b/download.yaml index 02376dc2..3fb46254 100644 --- a/download.yaml +++ b/download.yaml @@ -147,4 +147,4 @@ - url: https://github.com/justaddcoffee/go-cam-models-kg-covid-19/raw/master/lifted-go-cams-20200619.xml.gz - local_name: go-cams.rdf.gz + local_name: lifted-go-cams-20200619.xml.gz diff --git a/kg_covid_19/transform.py b/kg_covid_19/transform.py index 56b24127..9c176d22 100644 --- a/kg_covid_19/transform.py +++ b/kg_covid_19/transform.py @@ -5,6 +5,7 @@ from kg_covid_19.transform_utils.chembl.chembl_transform import ChemblTransform from kg_covid_19.transform_utils.drug_central.drug_central import DrugCentralTransform +from kg_covid_19.transform_utils.gocam_transform.gocam_transform import GocamTransform from kg_covid_19.transform_utils.intact.intact import IntAct from kg_covid_19.transform_utils.ontology import OntologyTransform from kg_covid_19.transform_utils.ontology.ontology_transform import ONTOLOGIES @@ -22,14 +23,14 @@ 'DrugCentralTransform': DrugCentralTransform, 'TTDTransform': TTDTransform, 'StringTransform': StringTransform, - 'ScibiteCordTransform': ScibiteCordTransform, + # 'ScibiteCordTransform': ScibiteCordTransform, 'PharmGKB': PharmGKB, 'SARSCoV2GeneAnnot': SARSCoV2GeneAnnot, 'IntAct': IntAct, 'GoTransform': OntologyTransform, 'HpTransform': OntologyTransform, 'MondoTransform': OntologyTransform, - 'GoCam': OntologyTransform, + 'GocamTransform': GocamTransform, 'ChemblTransform': ChemblTransform } diff --git a/kg_covid_19/transform_utils/gocam_transform/__init__.py b/kg_covid_19/transform_utils/gocam_transform/__init__.py new file mode 100644 index 00000000..10319d53 --- /dev/null +++ b/kg_covid_19/transform_utils/gocam_transform/__init__.py @@ -0,0 +1,5 @@ +from .gocam_transform import GocamTransform + +__all__ = [ + "GocamTransform" +] diff --git a/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py new file mode 100644 index 00000000..fa258856 --- /dev/null +++ b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py @@ -0,0 +1,101 @@ +import gzip +import logging +import os +import shutil +from typing import Optional + +from kgx import RdfTransformer, PandasTransformer + +from kg_covid_19.transform_utils.transform import Transform + + +class GocamTransform(Transform): + """ + GocamTransform parses GO-CAMs that have been subjected to + RDF edge project (REP) pattern. + """ + def __init__(self, input_dir: str = None, output_dir: str = None): + source_name = "GOCAMs" + super().__init__(source_name, input_dir, output_dir) + + def run(self, data_file: Optional[str] = None, **kwargs) -> None: + """Method is called and performs needed transformations to process + an ontology. + + Args: + data_file: data file to parse + + Returns: + None. + + """ + if not data_file: + data_file = os.path.join(self.input_base_dir, 'lifted-go-cams-20200619.xml.gz') + + if data_file.endswith('.gz'): + print("Decompressing") + decompressed_data_file = '.'.join(data_file.split('.')[0:-1]) + self.decompress_file(data_file, decompressed_data_file) + else: + decompressed_data_file = data_file + + if 'input_format' in kwargs: + input_format = kwargs['input_format'] + if input_format not in {'nt', 'ttl', 'rdf/xml'}: + raise ValueError(f"Unsupported input_format: {input_format}") + else: + input_format = None + self.parse(decompressed_data_file, input_format) + + def parse(self, data_file: str, input_format: str) -> None: + """Processes the data_file. + + Args: + data_file: data file to parse + input_format: format of input file + + Returns: + None + + """ + # define prefix to IRI mappings + cmap = { + 'REACT': 'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_', + 'WB': 'http://identifiers.org/wormbase/', + 'FB': 'http://identifiers.org/flybase/', + 'LEGO': 'http://geneontology.org/lego/', + 'GOCAM': 'http://model.geneontology.org/', + 'TAIR.LOCUS': 'http://identifiers.org/tair.locus/', + 'POMBASE': 'http://identifiers.org/PomBase', + 'DICTYBASE.GENE': 'http://identifiers.org/dictybase.gene/', + 'XENBASE': 'http://identifiers.org/xenbase/' + } + + # define predicates that are to be treated as node properties + np = { + 'http://geneontology.org/lego/evidence', + 'https://w3id.org/biolink/vocab/subjectActivity', + 'https://w3id.org/biolink/vocab/objectActivity', + } + + logging.info(f"Parsing {data_file}") + transformer = RdfTransformer(curie_map=cmap) + transformer.parse(data_file, node_property_predicates=np, input_format=input_format) + output_transformer = PandasTransformer(transformer.graph) + output_transformer.save(os.path.join(self.output_dir, self.source_name), output_format='tsv', mode=None) + + def decompress_file(self, input_file: str, output_file: str): + """Decompress a file. + + Args: + input_file: Input file + output_file: Output file + + Returns: + str + + """ + FH = gzip.open(input_file, 'rb') + WH = open(output_file, 'wb') + WH.write(FH.read()) + return output_file diff --git a/merge.yaml b/merge.yaml index c927e395..b0245923 100644 --- a/merge.yaml +++ b/merge.yaml @@ -82,6 +82,11 @@ target: filename: - data/transformed/ontologies/hp_nodes.tsv - data/transformed/ontologies/hp_edges.tsv + go-cams: + type: tsv + filename: + - data/transformed/GOCAMs/GOCAMs_nodes.tsv + - data/transformed/GOCAMs/GOCAMs_edges.tsv destination: merged-kg-tsv: type: tsv From 786fa1f0d6f1d9a7ea25e82f5d35af486e40f10a Mon Sep 17 00:00:00 2001 From: Deepak Unni Date: Thu, 6 Aug 2020 17:46:39 -0700 Subject: [PATCH 6/7] Fix mypy --- kg_covid_19/transform_utils/gocam_transform/gocam_transform.py | 3 +-- kg_covid_19/transform_utils/ontology/ontology_transform.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py index fa258856..48ac6ddc 100644 --- a/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py +++ b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py @@ -1,5 +1,4 @@ import gzip -import logging import os import shutil from typing import Optional @@ -78,7 +77,7 @@ def parse(self, data_file: str, input_format: str) -> None: 'https://w3id.org/biolink/vocab/objectActivity', } - logging.info(f"Parsing {data_file}") + print(f"Parsing {data_file}") transformer = RdfTransformer(curie_map=cmap) transformer.parse(data_file, node_property_predicates=np, input_format=input_format) output_transformer = PandasTransformer(transformer.graph) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index 7b304575..2f1290f1 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -48,7 +48,7 @@ def parse(self, name: str, data_file: str, source: str) -> None: Returns: None. """ - logging.info(f"Parsing {data_file}") + print(f"Parsing {data_file}") transformer = ObographJsonTransformer() transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) From ff4b2cda41ffb1a135d832699c7eeeb776360b4e Mon Sep 17 00:00:00 2001 From: Deepak Unni Date: Thu, 6 Aug 2020 17:56:10 -0700 Subject: [PATCH 7/7] Fix mypy --- kg_covid_19/transform_utils/gocam_transform/gocam_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py index 48ac6ddc..39927216 100644 --- a/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py +++ b/kg_covid_19/transform_utils/gocam_transform/gocam_transform.py @@ -3,7 +3,7 @@ import shutil from typing import Optional -from kgx import RdfTransformer, PandasTransformer +from kgx import RdfTransformer, PandasTransformer # type: ignore from kg_covid_19.transform_utils.transform import Transform