Commit
Merge master into Jenkins
justaddcoffee committed Apr 9, 2020
2 parents d2c30ad + 835a442 commit 4911c26
Showing 13 changed files with 426 additions and 110 deletions.
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/dataset-request.md
@@ -0,0 +1,20 @@
---
name: Dataset request
about: Suggest a dataset to be ingested for this project
title: ''
labels: ''
assignees: ''

---

**Name of the dataset**
A clear and concise description of the dataset and where you can download it from.

**Mapping or relevant fields**
A clear and concise description of which fields you would want to be ingested.

If possible, highlight which fields map to nodes and which fields map to edges.
Refer to [Data Preparation](https://github.com/NCATS-Tangerine/kgx/blob/master/data-preparation.md) for guidelines on how the final transformed data should be represented.

**Additional context**
Add any other context, requests, or concerns.
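As an illustration of the node/edge mapping this template asks for (a hypothetical sketch; the identifiers, categories, and columns below are illustrative KGX-style TSV, not taken from any actual request):

    id                          name      category
    CHEMBL.COMPOUND:CHEMBL25    aspirin   biolink:Drug
    UniProtKB:P23219            PTGS1     biolink:Protein

    subject                     edge_label               object             relation
    CHEMBL.COMPOUND:CHEMBL25    biolink:interacts_with   UniProtKB:P23219   RO:0002436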
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include kg_covid_19/transform_utils/string_ppi/*.json
14 changes: 13 additions & 1 deletion download.yaml
@@ -56,7 +56,7 @@
#
-
# HPO obo file
url: https://raw.githubusercontent.com/obophenotype/human-phenotype-ontology/master/hp.obo
url: http://purl.obolibrary.org/obo/hp.obo
local_name: hp.obo
#
# TTD - Therapeutic Targets Database
@@ -71,6 +71,18 @@
url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
local_name: HUMAN_9606_idmapping.dat.gz
#
# SciBite CORD-19 annotations v1.2
#
-
url: https://github.com/SciBiteLabs/CORD19/raw/d2fb4998a504bcb06f5d0fd407f58feda2f140b1/annotated-CORD-19/1.2/CORD-19_1_2.zip
local_name: CORD-19_1_2.zip

# SciBite CORD-19 entity co-occurrences v1.0
-
url: https://github.com/SciBiteLabs/CORD19/raw/master/sentence-co-occurrence-CORD-19/1.0/cv19_scc.zip
local_name: cv19_scc.zip

#
# PharmGKB - drug -> drug target info, and gene id mappings
#
-
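Each download.yaml entry pairs a url with a local_name. A minimal sketch of how such entries might be consumed (hypothetical; the project's real download step is not part of this diff, and this assumes only the two keys shown above):

import urllib.request
import yaml  # type: ignore

with open("download.yaml") as f:
    items = yaml.safe_load(f)  # a list of {url, local_name} mappings

for item in items:
    # fetch each remote file and store it under its configured local name
    urllib.request.urlretrieve(item["url"], item["local_name"])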
2 changes: 2 additions & 0 deletions kg_covid_19/transform.py
@@ -5,6 +5,7 @@

from kg_covid_19.transform_utils.drug_central.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.hpo.hpo import HpoTransform
from kg_covid_19.transform_utils.scibite_cord import ScibiteCordTransform
from kg_covid_19.transform_utils.pharmgkb import PharmGKB
from kg_covid_19.transform_utils.string_ppi import StringTransform
from kg_covid_19.transform_utils.ttd.ttd import TTDTransform
@@ -17,6 +18,7 @@
'HpoTransform': HpoTransform,
'TTDTransform': TTDTransform,
'StringTransform': StringTransform,
'ScibiteCordTransform': ScibiteCordTransform,
'PharmGKB': PharmGKB
}

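DATA_SOURCES acts as a registry mapping each source name to its Transform class. A sketch of how such a registry is typically dispatched (illustrative; the module's actual transform entry point is not shown in this diff, and run_transforms is a hypothetical helper):

from typing import Optional

def run_transforms(input_dir: str, output_dir: str, source: Optional[str] = None) -> None:
    # run a single named transform, or all registered transforms
    names = [source] if source else list(DATA_SOURCES.keys())
    for name in names:
        DATA_SOURCES[name](input_dir=input_dir, output_dir=output_dir).run()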
5 changes: 5 additions & 0 deletions kg_covid_19/transform_utils/scibite_cord/__init__.py
@@ -0,0 +1,5 @@
from .scibite_cord import ScibiteCordTransform

__all__ = [
"ScibiteCordTransform"
]
272 changes: 272 additions & 0 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
@@ -0,0 +1,272 @@
import json
import os
import uuid
from typing import Any, Dict, List, Optional, Set
from zipfile import ZipFile
import pandas as pd # type: ignore
from prefixcommons import contract_uri # type: ignore

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils import write_node_edge_item


class ScibiteCordTransform(Transform):
"""
    ScibiteCordTransform parses SciBite annotations on the CORD-19 dataset
    to extract concept-to-publication annotations and term co-occurrences.
"""
    def __init__(self, input_dir: Optional[str] = None, output_dir: Optional[str] = None):
source_name = "SciBite-CORD-19"
super().__init__(source_name, input_dir, output_dir)
self.concept_name_map: Dict = {}
self.seen: Set = set()

    def run(self, data_files: Optional[List] = None) -> None:
        """Perform the transformations needed to process annotations
        from SciBite CORD-19.
Args:
data_files: data files to parse
Returns:
None.
"""
if not data_files:
data_files = list()
data_files.append(os.path.join(self.input_base_dir, "CORD-19_1_2.zip"))
data_files.append(os.path.join(self.input_base_dir, "cv19_scc.zip"))

self.node_header = ['id', 'name', 'category', 'description']
self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
node_handle = open(self.output_node_file, 'w')
edge_handle = open(self.output_edge_file, 'w')
node_handle.write("\t".join(self.node_header) + "\n")
edge_handle.write("\t".join(self.edge_header) + "\n")
        self.parse_annotations(node_handle, edge_handle, data_files[0])
        self.parse_cooccurrence(node_handle, edge_handle, data_files[1])
        node_handle.close()
        edge_handle.close()

def parse_annotations(self, node_handle: Any, edge_handle: Any, data_file: str) -> None:
"""Parse annotations from CORD-19_1_2.zip.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
data_file: Path to CORD-19_1_2.zip.
Returns:
None.
"""
with ZipFile(data_file, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)

subsets = ['biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset', 'custom_license']
for subset in subsets:
data_dir = os.path.join(self.input_base_dir, 'data', subset, subset)
for filename in os.listdir(data_dir):
                file = os.path.join(data_dir, filename)
                with open(file) as json_file:
                    doc = json.load(json_file)
self.parse_annotation_doc(node_handle, edge_handle, doc, subset)

def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str = None) -> None:
"""Parse a JSON document corresponding to a publication.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
doc: JSON document as dict.
subset: The subset name for this dataset.
Returns:
None.
"""
paper_id = doc['paper_id']
metadata = doc['metadata']
abstract = doc['abstract']
body_text = doc['body_text']
terms = set()
        provided_by = self.source_name
if subset:
provided_by += f" {subset}"
# extract hits from metadata
terms.update(self.extract_termite_hits(metadata))
# extract hits from abstract
for x in abstract:
terms.update(self.extract_termite_hits(x))
# extract hits from body text
for x in body_text:
terms.update(self.extract_termite_hits(x))

# add a biolink:Publication for each paper
write_node_edge_item(
fh=node_handle,
header=self.node_header,
data=[
f"CORD:{paper_id}",
f"{metadata['title']}",
"biolink:Publication",
""
]
)
self.seen.add(paper_id)

# TODO: use CURIE for terms
for t in terms:
if t not in self.seen:
# add a biolink:OntologyClass node for each term
write_node_edge_item(
fh=node_handle,
header=self.node_header,
data=[
f"{t}",
f"{self.concept_name_map[t]}",
"biolink:OntologyClass" if len(t) != 2 else "biolink:NamedThing",
""
]
)
self.seen.add(t)

# add has_annotation edge between OntologyClass and Publication
write_node_edge_item(
fh=edge_handle,
header=self.edge_header,
data=[
f"{t}",
f"biolink:related_to",
f"CORD:{paper_id}",
"SIO:000255",
provided_by
]
)

def parse_cooccurrence(self, node_handle: Any, edge_handle: Any, data_file: str) -> None:
"""Parse term co-occurrences from cv19_scc.zip.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
data_file: Path to cv19_scc.zip.
Returns:
None.
"""
with ZipFile(data_file, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)

df = pd.read_csv(os.path.join(self.input_base_dir, 'cv19_scc.tsv'), delimiter='\t', encoding='utf-8')
for index, row in df.iterrows():
self.parse_cooccurrence_record(node_handle, edge_handle, row)

def parse_cooccurrence_record(self, node_handle: Any, edge_handle: Any, record: Dict) -> None:
"""Parse term-cooccurrences.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
            record: A row from the co-occurrence table (a pandas Series, indexed like a dict).
Returns:
None.
"""
terms = set()
paper_id = record['document_id']
if not pd.isna(record['entity_uris']):
terms.update(record['entity_uris'].split('|'))
# add a biolink:Publication for each paper
if paper_id not in self.seen:
write_node_edge_item(
fh=node_handle,
header=self.node_header,
data=[
f"CORD:{paper_id}",
"",
"biolink:Publication",
""
]
)
self.seen.add(paper_id)

for t in terms:
if t not in self.seen:
# add a biolink:OntologyClass node for each term
write_node_edge_item(
fh=node_handle,
header=self.node_header,
data=[
f"{t}",
                        self.concept_name_map.get(t, ""),
"biolink:OntologyClass" if len(t) != 2 else "biolink:NamedThing",
""
]
)
self.seen.add(t)

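        # mint a single InformationContentEntity to group this publication with its co-occurring terms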
information_entity = uuid.uuid1()
write_node_edge_item(
fh=node_handle,
header=self.node_header,
data=[
f"{uuid.uuid1()}",
"",
"biolink:InformationContentEntity",
""
]
)
# add has_annotation edge between co-occurrence entity and publication
write_node_edge_item(
fh=edge_handle,
header=self.edge_header,
data=[
f"{information_entity}",
"biolink:related_to",
f"{record['document_id']}",
"SIO:000255", # 'has annotation'
f"{self.source_name}"
]
)
for t in terms:
# add has_member edges between co-occurrence entity and each term
write_node_edge_item(
fh=edge_handle,
header=self.edge_header,
data=[
f"{information_entity}",
"biolink:related_to",
f"{t}",
f"SIO:000059", # 'has member'
f"{self.source_name}"
]
)

def extract_termite_hits(self, data: Dict) -> Set:
"""Parse term-cooccurrences.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
data: A dictionary.
Returns:
None.
"""
terms = set()
termite_hits = data['termite_hits']
for k, v in termite_hits.items():
for hit in v:
                terms.add(hit['id'])
if hit['id'] not in self.concept_name_map:
self.concept_name_map[hit['id']] = hit['name']
return terms
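Taken together, the new transform can be driven directly. A minimal usage sketch (the directory paths are placeholders, and the termite_hits shape in the comment is inferred from extract_termite_hits rather than from SciBite's documentation):

from kg_covid_19.transform_utils.scibite_cord import ScibiteCordTransform

# extract_termite_hits expects document fragments shaped roughly like:
#   {"termite_hits": {"<vocab>": [{"id": "...", "name": "..."}, ...], ...}}
transform = ScibiteCordTransform(input_dir="data/raw", output_dir="data/transformed")
transform.run()  # writes tab-separated nodes and edges files under output_dir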
@@ -0,0 +1,15 @@
[
"neighborhood",
"neighborhood_transferred",
"fusion",
"cooccurence",
"homology",
"coexpression",
"coexpression_transferred",
"experiments",
"experiments_transferred",
"database",
"database_transferred",
"textmining",
"textmining_transferred"
]
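These thirteen names match the evidence-channel columns of STRING's protein links (full) files; the spelling "cooccurence" is STRING's own. A hedged sketch of how the list might be used to select those columns (the filename and dataframe source are hypothetical; the StringTransform code itself is not part of this diff):

import json
import pandas as pd  # type: ignore

with open("string_channels.json") as f:  # placeholder name; the real filename is elided above
    channels = json.load(f)

df = pd.read_csv("protein.links.full.txt.gz", sep=" ")
evidence_scores = df[channels]  # one column per STRING evidence channel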
8 changes: 8 additions & 0 deletions kg_covid_19/transform_utils/string_ppi/edge_core_header.json
@@ -0,0 +1,8 @@
[
"subject",
"edge_label",
"object",
"relation",
"provided_by",
"combined_score"
]
7 changes: 7 additions & 0 deletions kg_covid_19/transform_utils/string_ppi/node_header.json
@@ -0,0 +1,7 @@
[
"id",
"name",
"category",
"description",
"alias"
]
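Shipping the output headers as JSON (and packaging them via the MANIFEST.in change above) lets StringTransform load them at runtime instead of hard-coding them. A brief sketch, assuming the files are read relative to the module (the __file__-relative loading is illustrative):

import json
import os

here = os.path.dirname(__file__)  # kg_covid_19/transform_utils/string_ppi
with open(os.path.join(here, "node_header.json")) as f:
    node_header = json.load(f)
with open(os.path.join(here, "edge_core_header.json")) as f:
    edge_core_header = json.load(f)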