Skip to content

Commit

Permalink
Merge ff4b2cd into 1f91110
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Aug 7, 2020
2 parents 1f91110 + ff4b2cd commit 7a43ed2
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 12 deletions.
8 changes: 8 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,11 @@
-
url: http://current.geneontology.org/ontology/extensions/go-plus.json
local_name: go-plus.json

#
# GO-CAM
#

-
url: https://github.com/justaddcoffee/go-cam-models-kg-covid-19/raw/master/lifted-go-cams-20200619.xml.gz
local_name: lifted-go-cams-20200619.xml.gz
4 changes: 3 additions & 1 deletion kg_covid_19/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from kg_covid_19.transform_utils.chembl.chembl_transform import ChemblTransform
from kg_covid_19.transform_utils.drug_central.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.gocam_transform.gocam_transform import GocamTransform
from kg_covid_19.transform_utils.intact.intact import IntAct
from kg_covid_19.transform_utils.ontology import OntologyTransform
from kg_covid_19.transform_utils.ontology.ontology_transform import ONTOLOGIES
Expand All @@ -22,13 +23,14 @@
'DrugCentralTransform': DrugCentralTransform,
'TTDTransform': TTDTransform,
'StringTransform': StringTransform,
'ScibiteCordTransform': ScibiteCordTransform,
# 'ScibiteCordTransform': ScibiteCordTransform,
'PharmGKB': PharmGKB,
'SARSCoV2GeneAnnot': SARSCoV2GeneAnnot,
'IntAct': IntAct,
'GoTransform': OntologyTransform,
'HpTransform': OntologyTransform,
'MondoTransform': OntologyTransform,
'GocamTransform': GocamTransform,
'ChemblTransform': ChemblTransform
}

Expand Down
5 changes: 5 additions & 0 deletions kg_covid_19/transform_utils/gocam_transform/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Re-export the transform so callers can import it from the package root.
from .gocam_transform import GocamTransform

__all__ = ["GocamTransform"]
100 changes: 100 additions & 0 deletions kg_covid_19/transform_utils/gocam_transform/gocam_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import gzip
import os
import shutil
from typing import Optional

from kgx import RdfTransformer, PandasTransformer # type: ignore

from kg_covid_19.transform_utils.transform import Transform


class GocamTransform(Transform):
    """
    GocamTransform parses GO-CAMs that have been subjected to
    RDF edge project (REP) pattern.

    The input is an RDF file (optionally gzip-compressed) that is parsed with
    KGX's RdfTransformer and written out as TSV via PandasTransformer.
    """

    def __init__(self, input_dir: Optional[str] = None,
                 output_dir: Optional[str] = None):
        """Set up the transform under the source name "GOCAMs".

        Args:
            input_dir: directory holding the downloaded input, or None to let
                the base class decide.
            output_dir: directory for transformed output, or None to let the
                base class decide.
        """
        source_name = "GOCAMs"
        super().__init__(source_name, input_dir, output_dir)

    def run(self, data_file: Optional[str] = None, **kwargs) -> None:
        """Decompress the GO-CAM file if needed, then parse it.

        Args:
            data_file: data file to parse. When omitted, the default
                'lifted-go-cams-20200619.xml.gz' under the input base
                directory is used.
            **kwargs: may contain 'input_format', one of 'nt', 'ttl' or
                'rdf/xml'. When absent, None is passed on to parse().

        Returns:
            None.

        Raises:
            ValueError: if 'input_format' is given but not supported.
        """
        if not data_file:
            data_file = os.path.join(self.input_base_dir, 'lifted-go-cams-20200619.xml.gz')

        if data_file.endswith('.gz'):
            print("Decompressing")
            # The decompressed file sits next to the archive, minus '.gz'.
            decompressed_data_file = data_file[:-len('.gz')]
            self.decompress_file(data_file, decompressed_data_file)
        else:
            decompressed_data_file = data_file

        input_format = kwargs.get('input_format') if 'input_format' in kwargs else None
        if 'input_format' in kwargs and input_format not in {'nt', 'ttl', 'rdf/xml'}:
            raise ValueError(f"Unsupported input_format: {input_format}")

        self.parse(decompressed_data_file, input_format)

    def parse(self, data_file: str, input_format: Optional[str]) -> None:
        """Parse an RDF data file and save the resulting graph as TSV.

        Args:
            data_file: (decompressed) data file to parse
            input_format: format of input file, or None when the caller did
                not specify one

        Returns:
            None
        """
        # define prefix to IRI mappings
        # NOTE(review): the POMBASE IRI has no trailing slash, unlike the
        # other identifiers.org entries - confirm this is intentional.
        cmap = {
            'REACT': 'http://purl.obolibrary.org/obo/go/extensions/reacto.owl#REACTO_',
            'WB': 'http://identifiers.org/wormbase/',
            'FB': 'http://identifiers.org/flybase/',
            'LEGO': 'http://geneontology.org/lego/',
            'GOCAM': 'http://model.geneontology.org/',
            'TAIR.LOCUS': 'http://identifiers.org/tair.locus/',
            'POMBASE': 'http://identifiers.org/PomBase',
            'DICTYBASE.GENE': 'http://identifiers.org/dictybase.gene/',
            'XENBASE': 'http://identifiers.org/xenbase/'
        }

        # define predicates that are to be treated as node properties
        node_property_predicates = {
            'http://geneontology.org/lego/evidence',
            'https://w3id.org/biolink/vocab/subjectActivity',
            'https://w3id.org/biolink/vocab/objectActivity',
        }

        print(f"Parsing {data_file}")
        transformer = RdfTransformer(curie_map=cmap)
        transformer.parse(
            data_file,
            node_property_predicates=node_property_predicates,
            input_format=input_format,
        )
        output_transformer = PandasTransformer(transformer.graph)
        output_transformer.save(
            os.path.join(self.output_dir, self.source_name),
            output_format='tsv',
            mode=None,
        )

    def decompress_file(self, input_file: str, output_file: str) -> str:
        """Decompress a gzip file.

        Both files are closed before returning, so the output is complete on
        disk when the caller reads it. Data is copied in chunks rather than
        loaded into memory at once.

        Args:
            input_file: path of the gzip-compressed input file
            output_file: path the decompressed data is written to

        Returns:
            The path of the decompressed file (output_file).
        """
        with gzip.open(input_file, 'rb') as compressed, \
                open(output_file, 'wb') as decompressed:
            shutil.copyfileobj(compressed, decompressed)
        return output_file
18 changes: 7 additions & 11 deletions kg_covid_19/transform_utils/ontology/ontology_transform.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import os
import logging

from typing import Optional

from kg_covid_19.transform_utils.transform import Transform
from kgx import PandasTransformer, ObographJsonTransformer # type: ignore
from kgx import PandasTransformer, ObographJsonTransformer # type: ignore


ONTOLOGIES = {
'HpTransform': 'hp.json',
'GoTransform': 'go-plus.json',
'MondoTransform': 'mondo.json'
'MondoTransform': 'mondo.json',
}


class OntologyTransform(Transform):
"""
OntologyTransform parses an Obograph JSON form of an Ontology into nodes and edges.
Expand All @@ -22,13 +24,10 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
def run(self, data_file: Optional[str] = None) -> None:
"""Method is called and performs needed transformations to process
an ontology.
Args:
data_file: data file to parse
Returns:
None.
"""
if data_file:
k = data_file.split('.')[0]
Expand All @@ -42,18 +41,15 @@ def run(self, data_file: Optional[str] = None) -> None:

def parse(self, name: str, data_file: str, source: str) -> None:
"""Processes the data_file.
Args:
name: Name of the ontology
data_file: data file to parse
source: Source name
Returns:
None.
"""
logging.info(f"Parsing {data_file}")
print(f"Parsing {data_file}")
transformer = ObographJsonTransformer()
transformer.parse(data_file, provided_by=source)
output_transformer = PandasTransformer(transformer.graph)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
13 changes: 13 additions & 0 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
# -*- coding: utf-8 -*-
import gzip
import logging
import os
import re
import shutil
import tempfile
import zipfile
from typing import Any, Dict, List, Union
from tqdm import tqdm # type: ignore
Expand Down Expand Up @@ -157,6 +160,16 @@ def unzip_to_tempdir(zip_file_name: str, tempdir: str) -> None:
z.extractall(tempdir)


def ungzip_to_tempdir(gzipped_file: str, tempdir: str) -> str:
    """Decompress a gzip file into a directory.

    The output keeps the input's base name, with a trailing '.gz'
    extension removed when present.

    Args:
        gzipped_file: path of the gzip-compressed file to read
        tempdir: directory the decompressed file is written into

    Returns:
        Path of the decompressed file inside tempdir.
    """
    target_path = os.path.join(tempdir, os.path.basename(gzipped_file))
    if target_path.endswith('.gz'):
        target_path, _ = os.path.splitext(target_path)

    with gzip.open(gzipped_file, 'rb') as compressed:
        with open(target_path, 'wb') as plain:
            # Stream the data across instead of loading it all into memory.
            shutil.copyfileobj(compressed, plain)

    return target_path


def guess_bl_category(identifier: str) -> str:
"""Guess category for a given identifier.
Expand Down
5 changes: 5 additions & 0 deletions merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ target:
filename:
- data/transformed/ontologies/hp_nodes.tsv
- data/transformed/ontologies/hp_edges.tsv
go-cams:
type: tsv
filename:
- data/transformed/GOCAMs/GOCAMs_nodes.tsv
- data/transformed/GOCAMs/GOCAMs_edges.tsv
destination:
merged-kg-tsv:
type: tsv
Expand Down

0 comments on commit 7a43ed2

Please sign in to comment.