From 459b395ee379bb0a7a7fcb76d4786e1560a1137a Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 13 Jul 2020 10:31:40 -0400 Subject: [PATCH 1/3] First pass at go-cam ingest --- download.yaml | 8 ++++ kg_covid_19/transform.py | 1 + .../ontology/ontology_transform.py | 43 +++++++++++++++---- kg_covid_19/utils/transform_utils.py | 13 ++++++ 4 files changed, 57 insertions(+), 8 deletions(-) diff --git a/download.yaml b/download.yaml index 72d4874a..02376dc2 100644 --- a/download.yaml +++ b/download.yaml @@ -140,3 +140,11 @@ - url: http://current.geneontology.org/ontology/extensions/go-plus.json local_name: go-plus.json + +# +# GO-CAM +# + +- + url: https://github.com/justaddcoffee/go-cam-models-kg-covid-19/raw/master/lifted-go-cams-20200619.xml.gz + local_name: go-cams.rdf.gz diff --git a/kg_covid_19/transform.py b/kg_covid_19/transform.py index c9b69f71..56b24127 100644 --- a/kg_covid_19/transform.py +++ b/kg_covid_19/transform.py @@ -29,6 +29,7 @@ 'GoTransform': OntologyTransform, 'HpTransform': OntologyTransform, 'MondoTransform': OntologyTransform, + 'GoCam': OntologyTransform, 'ChemblTransform': ChemblTransform } diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index 949048f1..fb90d008 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -1,16 +1,22 @@ import os import logging +import tempfile from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer,\ + RdfOwlTransformer # type: ignore + +from kg_covid_19.utils.transform_utils import ungzip_to_tempdir ONTOLOGIES = { 'HpTransform': 'hp.json', 'GoTransform': 'go-plus.json', - 'MondoTransform': 'mondo.json' + 'MondoTransform': 'mondo.json', + 'GoCam': 'go-cams.rdf.gz' } + class OntologyTransform(Transform): """ OntologyTransform parses an Obograph JSON form of an Ontology into nodes nad edges. @@ -19,7 +25,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None): source_name = "ontologies" super().__init__(source_name, input_dir, output_dir) - def run(self, data_file: Optional[str] = None) -> None: + def run(self, data_file: Optional[str] = None, input_format = 'json') -> None: """Method is called and performs needed transformations to process an ontology. @@ -32,28 +38,49 @@ def run(self, data_file: Optional[str] = None) -> None: """ if data_file: k = data_file.split('.')[0] + + if data_file.endswith('.gz'): + input_format = 'rdf/xml' + xml_tempdir = tempfile.mkdtemp() + data_file = ungzip_to_tempdir(os.path.join( + self.input_base_dir, data_file), xml_tempdir) + data_file = os.path.join(self.input_base_dir, data_file) - self.parse(k, data_file, k) + self.parse(k, data_file, k, input_format=input_format) else: # load all ontologies for k in ONTOLOGIES.keys(): + + if data_file.endswith('.gz'): + input_format = 'rdf/xml' + xml_tempdir = tempfile.mkdtemp() + data_file = ungzip_to_tempdir(os.path.join( + self.input_base_dir, data_file), xml_tempdir) + data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) - self.parse(k, data_file, k) + self.parse(k, data_file, k, input_format=input_format) - def parse(self, name: str, data_file: str, source: str) -> None: + def parse(self, name: str, data_file: str, source: str, + input_format: str = 'json') -> None: """Processes the data_file. Args: name: Name of the ontology data_file: data file to parse - source: Source name + source: source name + input_format: format of input file Returns: None. """ logging.info(f"Parsing {data_file}") - transformer = ObographJsonTransformer() + if input_format == 'json': + transformer = ObographJsonTransformer() + elif input_format == 'rdf/xml': + transformer = RdfOwlTransformer() + else: + raise ValueError("No way to parse format {}" % format) transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) diff --git a/kg_covid_19/utils/transform_utils.py b/kg_covid_19/utils/transform_utils.py index 00e670aa..697f86d6 100644 --- a/kg_covid_19/utils/transform_utils.py +++ b/kg_covid_19/utils/transform_utils.py @@ -2,7 +2,10 @@ # -*- coding: utf-8 -*- import gzip import logging +import os import re +import shutil +import tempfile import zipfile from typing import Any, Dict, List, Union from tqdm import tqdm # type: ignore @@ -157,6 +160,16 @@ def unzip_to_tempdir(zip_file_name: str, tempdir: str) -> None: z.extractall(tempdir) +def ungzip_to_tempdir(gzipped_file: str, tempdir: str) -> str: + ungzipped_file = os.path.join(tempdir, os.path.basename(gzipped_file)) + if ungzipped_file.endswith('.gz'): + ungzipped_file = os.path.splitext(ungzipped_file)[0] + + with gzip.open(gzipped_file, 'rb') as f_in, open(ungzipped_file, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + return ungzipped_file + + def guess_bl_category(identifier: str) -> str: """Guess category for a given identifier. From 6dc1e703b4ec12f5ffe2b34e73dfaf39c8e737a4 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 13 Jul 2020 13:34:53 -0400 Subject: [PATCH 2/3] mypy --- .../transform_utils/ontology/ontology_transform.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index fb90d008..e45e33a1 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -4,8 +4,7 @@ from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer,\ - RdfOwlTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer, RdfOwlTransformer # type: ignore from kg_covid_19.utils.transform_utils import ungzip_to_tempdir @@ -51,13 +50,14 @@ def run(self, data_file: Optional[str] = None, input_format = 'json') -> None: # load all ontologies for k in ONTOLOGIES.keys(): + data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) + if data_file.endswith('.gz'): input_format = 'rdf/xml' xml_tempdir = tempfile.mkdtemp() data_file = ungzip_to_tempdir(os.path.join( self.input_base_dir, data_file), xml_tempdir) - data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k]) self.parse(k, data_file, k, input_format=input_format) def parse(self, name: str, data_file: str, source: str, @@ -80,7 +80,7 @@ def parse(self, name: str, data_file: str, source: str, elif input_format == 'rdf/xml': transformer = RdfOwlTransformer() else: - raise ValueError("No way to parse format {}" % format) + raise ValueError("No way to parse format %s" % format) transformer.parse(data_file, provided_by=source) output_transformer = PandasTransformer(transformer.graph) output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None) From 432804ff1a7ad147e53715b56bec4b5806539cd2 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 15 Jul 2020 12:07:49 -0400 Subject: [PATCH 3/3] Change transformer for go-cams back to RdfTransformer() - previously tried RdfOwlTransformer() to see what would happen --- kg_covid_19/transform_utils/ontology/ontology_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kg_covid_19/transform_utils/ontology/ontology_transform.py b/kg_covid_19/transform_utils/ontology/ontology_transform.py index e45e33a1..1d04f16e 100644 --- a/kg_covid_19/transform_utils/ontology/ontology_transform.py +++ b/kg_covid_19/transform_utils/ontology/ontology_transform.py @@ -4,7 +4,7 @@ from typing import Optional from kg_covid_19.transform_utils.transform import Transform -from kgx import PandasTransformer, ObographJsonTransformer, RdfOwlTransformer # type: ignore +from kgx import PandasTransformer, ObographJsonTransformer, RdfTransformer # type: ignore from kg_covid_19.utils.transform_utils import ungzip_to_tempdir @@ -78,7 +78,7 @@ def parse(self, name: str, data_file: str, source: str, if input_format == 'json': transformer = ObographJsonTransformer() elif input_format == 'rdf/xml': - transformer = RdfOwlTransformer() + transformer = RdfTransformer() else: raise ValueError("No way to parse format %s" % format) transformer.parse(data_file, provided_by=source)