Merge pull request #100 from deepakunni3/ontology-ingest
Ontology ingest
justaddcoffee committed Apr 20, 2020
2 parents 052a3fc + a961c50 commit f6c6abd
Showing 15 changed files with 135 additions and 39 deletions.
20 changes: 13 additions & 7 deletions download.yaml
@@ -52,13 +52,6 @@
local_name: gene_info.gz

#
# Human phenotype ontology (HPO) - no annotations for now
#
-
# HPO obo file
url: http://purl.obolibrary.org/obo/hp.obo
local_name: hp.obo
#
# TTD - Therapeutic Targets Database
# drug targets, and associated data for each (drugs, ids, etc)
#
@@ -107,3 +100,16 @@
# http://geneontology.org/docs/gene-product-information-gpi-format/
url: ftp://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_sars-cov-2.gpi
local_name: uniprot_sars-cov-2.gpi

#
# Ontologies
#
-
url: http://purl.obolibrary.org/obo/hp.json
local_name: hp.json
-
url: http://purl.obolibrary.org/obo/mondo.json
local_name: mondo.json
-
url: http://current.geneontology.org/ontology/extensions/go-plus.json
local_name: go-plus.json
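
The three new entries use the same url/local_name pattern as the rest of download.yaml. As a rough sketch of what each entry implies (kg-covid-19 drives downloads from this file with its own utility, so fetch_entry and RAW_DIR below are hypothetical names, not part of the repo):

# Rough illustration of what each url/local_name entry implies; kg-covid-19
# has its own downloader driven by download.yaml, so fetch_entry and RAW_DIR
# are hypothetical names, not part of the repo.
import os
import urllib.request

import yaml  # PyYAML, assumed available

RAW_DIR = "data/raw"


def fetch_entry(entry: dict, raw_dir: str = RAW_DIR) -> str:
    """Download one entry's url into raw_dir under its local_name."""
    os.makedirs(raw_dir, exist_ok=True)
    target = os.path.join(raw_dir, entry["local_name"])
    urllib.request.urlretrieve(entry["url"], target)
    return target


if __name__ == "__main__":
    with open("download.yaml") as f:
        entries = yaml.safe_load(f)
    # Fetch just the three ontology files added in this PR.
    for entry in entries:
        if entry.get("local_name") in ("hp.json", "mondo.json", "go-plus.json"):
            fetch_entry(entry)
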
15 changes: 11 additions & 4 deletions kg_covid_19/transform.py
@@ -5,6 +5,8 @@

from kg_covid_19.transform_utils.drug_central.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.hpo.hpo import HpoTransform
from kg_covid_19.transform_utils.ontology import OntologyTransform
from kg_covid_19.transform_utils.ontology.ontology_transform import ONTOLOGIES
from kg_covid_19.transform_utils.\
sars_cov_2_gene_annot.sars_cov_2_gene_annot import SARSCoV2GeneAnnot
# from kg_covid_19.transform_utils.scibite_cord import ScibiteCordTransform
@@ -18,12 +20,14 @@
DATA_SOURCES = {
'ZhouTransform': ZhouTransform,
'DrugCentralTransform': DrugCentralTransform,
'HpoTransform': HpoTransform,
'TTDTransform': TTDTransform,
'StringTransform': StringTransform,
'ScibiteCordTransform': ScibiteCordTransform,
'ScibiteCordTransform': ScibiteCordTransform,
'PharmGKB': PharmGKB,
'SARSCoV2GeneAnnot': SARSCoV2GeneAnnot
'SARSCoV2GeneAnnot': SARSCoV2GeneAnnot,
'GoTransform': OntologyTransform,
'HpTransform': OntologyTransform,
'MondoTransform': OntologyTransform
}


@@ -49,4 +53,7 @@ def transform(input_dir: str, output_dir: str, sources: List[str] = None) -> Non
if source in DATA_SOURCES:
logging.info(f"Parsing {source}")
t = DATA_SOURCES[source](input_dir, output_dir)
t.run()
if source in ONTOLOGIES.keys():
t.run(ONTOLOGIES[source])
else:
t.run()
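
DATA_SOURCES now maps three ontology source names to the same OntologyTransform class, and the driver looks up the matching file name in ONTOLOGIES before calling run(). A sketch of the expected behaviour ("data/raw" and "data/transformed" are the repo's usual directories, assumed here):

# Sketch of the new dispatch; "data/raw" and "data/transformed" are the
# repo's usual directories and are assumptions here.
from kg_covid_19.transform import transform

# 'MondoTransform' is in both DATA_SOURCES and ONTOLOGIES, so this ends up
# calling OntologyTransform(...).run("mondo.json").
transform(input_dir="data/raw", output_dir="data/transformed",
          sources=["MondoTransform"])

# Non-ontology sources keep the old behaviour: t.run() with no data_file.
transform(input_dir="data/raw", output_dir="data/transformed",
          sources=["DrugCentralTransform"])
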
4 changes: 2 additions & 2 deletions kg_covid_19/transform_utils/drug_central/drug_central.py
@@ -6,7 +6,7 @@
import logging
import os

from typing import Dict, List
from typing import Dict, List, Optional

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
@@ -28,7 +28,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None) -> None:
source_name = "drug_central"
super().__init__(source_name, input_dir, output_dir) # set some variables

def run(self, species="Homo sapiens") -> None:
def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") -> None:
"""Method is called and performs needed transformations to process the Drug
Central data, additional information
on this data can be found in the comment at the top of this script"""
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@


import os
from typing import Optional

from kg_covid_19.transform_utils.transform import Transform

@@ -25,7 +26,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
source_name = "some_unique_name"
super().__init__(source_name, input_dir, output_dir)

def run(self):
def run(self, data_file: Optional[str] = None):
# replace with downloaded data for this source
input_file = os.path.join(
self.input_base_dir, "example_data.csv") # must exist already
3 changes: 2 additions & 1 deletion kg_covid_19/transform_utils/hpo/hpo.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from typing import Optional

import obonet # type: ignore
from typing.io import TextIO # type: ignore
@@ -24,7 +25,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
source_name = "hpo"
super().__init__(source_name, input_dir, output_dir)

def run(self):
def run(self, data_file: Optional[str] = None):
self.node_header.extend(["comments", "description"])
hpo_node_type = "biolink:PhenotypicFeature"
hpo_edge_label = "rdfs:subClassOf"
5 changes: 5 additions & 0 deletions kg_covid_19/transform_utils/ontology/__init__.py
@@ -0,0 +1,5 @@
from .ontology_transform import OntologyTransform

__all__ = [
"OntologyTransform"
]
58 changes: 58 additions & 0 deletions kg_covid_19/transform_utils/ontology/ontology_transform.py
@@ -0,0 +1,58 @@
import os
import logging
from typing import Optional

from kg_covid_19.transform_utils.transform import Transform
from kgx import PandasTransformer, ObographJsonTransformer # type: ignore

ONTOLOGIES = {
'HpTransform': 'hp.json',
'GoTransform': 'go-plus.json',
'MondoTransform': 'mondo.json'
}

class OntologyTransform(Transform):
"""
OntologyTransform parses an Obograph JSON form of an ontology into nodes and edges.
"""
def __init__(self, input_dir: str = None, output_dir: str = None):
source_name = "ontologies"
super().__init__(source_name, input_dir, output_dir)

def run(self, data_file: Optional[str] = None) -> None:
"""Method is called and performs needed transformations to process
an ontology.
Args:
data_file: data file to parse
Returns:
None.
"""
if data_file:
k = data_file.split('.')[0]
data_file = os.path.join(self.input_base_dir, data_file)
self.parse(k, data_file)
else:
# load all ontologies
for k in ONTOLOGIES.keys():
data_file = os.path.join(self.input_base_dir, ONTOLOGIES[k])
self.parse(k, data_file)

def parse(self, name: str, data_file: str) -> None:
"""Processes the data_file.
Args:
name: Name of the ontology
data_file: data file to parse
Returns:
None.
"""
logging.info(f"Parsing {data_file}")
transformer = ObographJsonTransformer()
transformer.parse(data_file)
output_transformer = PandasTransformer(transformer.graph)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), extension='tsv', mode=None)
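
The new transform reads an Obograph JSON file with kgx's ObographJsonTransformer and re-serializes the graph as TSV with PandasTransformer; the output prefix comes from the file name (hp.json becomes hp). A usage sketch, assuming hp.json has already been downloaded into data/raw (directory names follow the repo's defaults):

# Direct use of the new transform; assumes hp.json is already in data/raw.
from kg_covid_19.transform_utils.ontology import OntologyTransform

t = OntologyTransform(input_dir="data/raw", output_dir="data/transformed")

# Parses data/raw/hp.json and writes TSVs under data/transformed/ontologies/
# (hp_nodes.tsv and hp_edges.tsv, the files merge.yaml now points at).
t.run("hp.json")

# With no argument, run() loops over every entry in ONTOLOGIES.
t.run()
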
9 changes: 5 additions & 4 deletions kg_covid_19/transform_utils/pharmgkb/pharmgkb.py
@@ -5,6 +5,7 @@
import tempfile
from collections import defaultdict
from io import TextIOBase
from typing import Optional, TextIO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import data_to_dict, parse_header, \
@@ -36,7 +37,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.uniprot_id_key = 'UniProtKB' # id in genes.tsv where UniProt id is located
self.key_parsed_ids = 'parsed_ids' # key to put ids in after parsing

def run(self):
def run(self, data_file: Optional[str] = None):
rel_zip_file_name = os.path.join(self.input_base_dir, "relationships.zip")
relationship_file_name = "relationships.tsv"
gene_mapping_zip_file = os.path.join(self.input_base_dir, "pharmgkb_genes.zip")
@@ -112,7 +113,7 @@ def run(self):
line_data=line_data)

def make_pharmgkb_edge(self,
fh: TextIOBase,
fh: TextIO,
line_data: dict
) -> None:

@@ -140,7 +141,7 @@ def make_pharmgkb_edge(self,
evidence])

def make_pharmgkb_gene_node(self,
fh: TextIOBase,
fh: TextIO,
this_id: str,
name: str,
biolink_type: str) -> None:
@@ -166,7 +167,7 @@ def get_uniprot_id(self,
return gene_id

def make_pharmgkb_chemical_node(self,
fh: TextIOBase,
fh: TextIO,
chem_id: str,
name: str,
biolink_type: str
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
import logging
import os
from typing import Generator, TextIO, List
from typing import Generator, TextIO, List, Optional

from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound

@@ -17,7 +17,7 @@

class SARSCoV2GeneAnnot(Transform):

def __init__(self, input_dir: str = None, output_dir: str = None):
def __init__(self, input_dir: Optional[str] = None, output_dir: str = None):
source_name = "sars_cov_2_gene_annot"
super().__init__(source_name, input_dir, output_dir)

@@ -38,7 +38,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
'part_of': 'BFO:0000050'
}

def run(self):
def run(self, data_file: str = None):

# file housekeeping
os.makedirs(self.output_dir, exist_ok=True)
12 changes: 7 additions & 5 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
@@ -3,7 +3,7 @@
import os
import re
import uuid
from typing import List, Dict, Any, Set
from typing import List, Dict, Any, Set, Optional
from zipfile import ZipFile
import pandas as pd # type: ignore
from prefixcommons import contract_uri # type: ignore
@@ -31,21 +31,23 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
self.gene_info_map: Dict = {}
self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])

def run(self, data_files: List = None) -> None:
def run(self, data_file: Optional[str] = None) -> None:
"""Method is called and performs needed transformations to process
annotations from SciBite CORD-19
Args:
data_files: data files to parse
data_file: data file to parse
Returns:
None.
"""
if not data_files:
data_files = list()
data_files = list()
if not data_file:
data_files.append(os.path.join(self.input_base_dir, "CORD-19_1_3.zip"))
data_files.append(os.path.join(self.input_base_dir, "cv19_scc.zip"))
else:
data_files.append(data_file)

self.node_header = ['id', 'name', 'category', 'description']
self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
4 changes: 2 additions & 2 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,7 +1,7 @@
import gzip
import os
import compress_json # type: ignore
from typing import Dict, List, Any, Set
from typing import Dict, List, Any, Set, Optional

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority
@@ -110,7 +110,7 @@ def load_gene_info(self, input_dir: str, output_dir: str, species_id: List = Non
self.gene_info_map[ncbi_gene_identifier]['symbol'] = symbol
self.gene_info_map[ncbi_gene_identifier]['description'] = description

def run(self, data_file: str = None) -> None:
def run(self, data_file: Optional[str] = None) -> None:
"""Method is called and performs needed transformations to process
protein-protein interactions from the STRING DB data.
4 changes: 2 additions & 2 deletions kg_covid_19/transform_utils/transform.py
@@ -1,4 +1,5 @@
import os
from typing import Optional


class Transform:
@@ -26,6 +27,5 @@ def __init__(self, source_name, input_dir: str = None, output_dir: str = None):
self.output_edge_file = os.path.join(self.output_dir, "edges.tsv")
self.output_json_file = os.path.join(self.output_dir, "nodes_edges.json")


def run(self):
def run(self, data_file: Optional[str] = None):
pass
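
With the base signature changed to run(self, data_file: Optional[str] = None), every subclass can accept an optional file name, which is what lets transform.py pass ONTOLOGIES[source] through a single code path. A minimal sketch of a subclass written against the new signature (MyTransform and "my_source" are hypothetical, not part of the repo):

# Minimal sketch of a transform written against the new base signature;
# MyTransform and "my_source" are hypothetical, not part of the repo.
import os
from typing import Optional

from kg_covid_19.transform_utils.transform import Transform


class MyTransform(Transform):
    def __init__(self, input_dir: str = None, output_dir: str = None):
        super().__init__("my_source", input_dir, output_dir)

    def run(self, data_file: Optional[str] = None) -> None:
        # The driver may pass a specific file (as it does for ontologies);
        # otherwise fall back to this source's default input file.
        if not data_file:
            data_file = os.path.join(self.input_base_dir, "my_source.tsv")
        # ... parse data_file and write nodes.tsv / edges.tsv to self.output_dir
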
4 changes: 2 additions & 2 deletions kg_covid_19/transform_utils/ttd/ttd.py
@@ -4,7 +4,7 @@
import logging
import os
import re
from typing import Union, List, Dict, Any
from typing import Union, List, Dict, Any, Optional

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils import write_node_edge_item
@@ -29,7 +29,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
source_name = "ttd"
super().__init__(source_name, input_dir, output_dir)

def run(self) -> None:
def run(self, data_file: Optional[str] = None):
self.node_header.append("TTD_ID") # append ttd id for drug targets and drugs
ttd_file_name = os.path.join(self.input_base_dir,
"P1-01-TTD_target_download.txt")
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@


import os
from typing import Optional

from tabula import io # type: ignore

@@ -34,7 +35,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None) -> None:
source_name = "zhou_host_proteins"
super().__init__(source_name, input_dir, output_dir)

def run(self) -> None:
def run(self, data_file: Optional[str] = None):
"""Method is called and performs needed transformations to process the zhou host protein data, additional
information on this data can be found in the comment at the top of this script."""

24 changes: 19 additions & 5 deletions merge.yaml
@@ -4,11 +4,7 @@ target:
filename:
- data/transformed/drug_central/nodes.tsv
- data/transformed/drug_central/edges.tsv
hpo:
type: tsv
filename:
- data/transformed/hpo/nodes.tsv
- data/transformed/hpo/edges.tsv

pharmgkb:
type: tsv
filename:
@@ -43,6 +39,24 @@ target:
- data/transformed/sars_cov_2_gene_annot/nodes.tsv
- data/transformed/sars_cov_2_gene_annot/edges.tsv

gene-ontology:
type: tsv
filename:
- data/transformed/ontologies/go-plus_nodes.tsv
- data/transformed/ontologies/go-plus_edges.tsv

mondo-ontology:
type: tsv
filename:
- data/transformed/ontologies/mondo_nodes.tsv
- data/transformed/ontologies/mondo_edges.tsv

hp-ontology:
type: tsv
filename:
- data/transformed/ontologies/hp_nodes.tsv
- data/transformed/ontologies/hp_edges.tsv

destination:
type: tsv
filename: merged-kg
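
The hpo source is dropped in favour of three ontology sources whose TSVs are written under data/transformed/ontologies/. A small sanity check, not part of the repo, that reads merge.yaml and confirms each listed file exists before merging (it assumes the target/filename layout shown above):

# Quick sanity check (not part of the repo): confirm every file listed under
# merge.yaml's sources exists before running the merge step.
import os

import yaml  # PyYAML, assumed available

with open("merge.yaml") as f:
    cfg = yaml.safe_load(f)

missing = []
for name, source in cfg.get("target", {}).items():
    files = source.get("filename")
    if isinstance(files, list):  # each source lists its nodes/edges TSVs
        missing += [p for p in files if not os.path.exists(p)]

if missing:
    print("Missing merge inputs:")
    for p in missing:
        print("  " + p)
else:
    print("All merge inputs present, including the new ontology TSVs.")
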
