Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge branch 'master' of kg-covid-19 into include_yaml_manifest_in_tar
  • Loading branch information
justaddcoffee committed Jul 10, 2020
2 parents 9927015 + 6dfa60d commit e79bcaa
Show file tree
Hide file tree
Showing 15 changed files with 160 additions and 99 deletions.
1 change: 1 addition & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pipeline {
)
sh '/usr/bin/python3.7 -m venv venv'
sh '. venv/bin/activate'
sh './venv/bin/pip install wheel'
sh './venv/bin/pip install bmt'
sh './venv/bin/pip install -r requirements.txt'
sh './venv/bin/python setup.py install'
Expand Down
19 changes: 0 additions & 19 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,6 @@ Documentation

See the `repository's wiki <https://github.com/kg-emerging-viruses/kg-emerging-viruses/wiki>`_.

How do I install this package?
----------------------------------------------

.. code:: shell
git clone https://github.com/Knowledge-Graph-Hub/kg-covid-19
cd kg-covid-19
pip install .
pip install -r requirements.txt
How do I use this package?
----------------------------------------------

.. code:: shell
python run.py download
python run.py transform
python run.py merge

Tests Coverage
----------------------------------------------
Expand Down
131 changes: 78 additions & 53 deletions curated/ORFs/uniprot_sars-cov-2.gpi

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,15 @@
local_name: HUMAN_9606_idmapping.dat.gz

#
# SciBite CORD-19 annotations v1.4
# SciBite CORD-19 annotations v1.5
#
-
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.4/CORD-19_1_4.zip
local_name: CORD-19_1_4.zip
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.5/pdf_json.zip
local_name: pdf_json.zip

-
url: https://media.githubusercontent.com/media/SciBiteLabs/CORD19/master/annotated-CORD-19/1.5/pmc_json.zip
local_name: pmc_json.zip

# SciBite CORD-19 entity co-occurrences v1.2
-
Expand Down
4 changes: 2 additions & 2 deletions kg_covid_19/merge_utils/merge_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
# merge all subgraphs into a single graph
merged_graph = merge_all_graphs([x.graph for x in transformers])
merged_graph.name = 'merged_graph'
generate_graph_stats(merged_graph, merged_graph.name, f"merged_graph_stats.yaml")
generate_graph_stats(merged_graph, merged_graph.name, "merged_graph_stats.yaml", ['provided_by'], ['provided_by'])

# write the merged graph
if 'destination' in config:
Expand All @@ -96,7 +96,7 @@ def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
destination_transformer.save()
elif destination['type'] in get_file_types():
destination_transformer = get_transformer(destination['type'])(merged_graph)
destination_transformer.save(destination['filename'], extension=destination['type'])
destination_transformer.save(destination['filename'], output_format=destination['type'])
else:
logging.error("type {} not yet supported for KGX load-and-merge operation.".format(destination['type']))

Expand Down
1 change: 1 addition & 0 deletions kg_covid_19/transform_utils/chembl/chembl_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def parse_chembl_molecules(self, data: List):
remap = {
'pref_name': 'name',
'full_molformula': 'molecular_formula',
'synonyms': 'synonym'
}
self._node_header.update([remap[x] if x in remap else x for x in allowed_properties])

Expand Down
2 changes: 1 addition & 1 deletion kg_covid_19/transform_utils/ontology/ontology_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,4 @@ def parse(self, name: str, data_file: str, source: str) -> None:
transformer = ObographJsonTransformer()
transformer.parse(data_file, provided_by=source)
output_transformer = PandasTransformer(transformer.graph)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), extension='tsv', mode=None)
output_transformer.save(filename=os.path.join(self.output_dir, f'{name}'), output_format='tsv', mode=None)
32 changes: 20 additions & 12 deletions kg_covid_19/transform_utils/scibite_cord/scibite_cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,49 +41,59 @@ def run(self, data_file: Optional[str] = None) -> None:
Args:
data_file: data file to parse
Should be:
[pdf_json.zip, pmc_json.zip, cv19_scc_1_2.zip]
Returns:
None.
"""
data_files = list()
if not data_file:
data_files.append(os.path.join(self.input_base_dir, "CORD-19_1_4.zip"))
data_files.append(os.path.join(self.input_base_dir, "pdf_json.zip"))
data_files.append(os.path.join(self.input_base_dir, "pmc_json.zip"))

data_files.append(os.path.join(self.input_base_dir, "cv19_scc_1_2.zip"))
else:
data_files.append(data_file)
data_files.extend(data_files)

self.node_header = ['id', 'name', 'category', 'description', 'provided_by']
self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by']
node_handle = open(self.output_node_file, 'w')
edge_handle = open(self.output_edge_file, 'w')
node_handle.write("\t".join(self.node_header) + "\n")
edge_handle.write("\t".join(self.edge_header) + "\n")
self.parse_annotations(node_handle, edge_handle, data_files[0])
self.parse_annotations(node_handle, edge_handle, data_files[0], data_files[1])

node_handle = open(os.path.join(self.output_dir, "entity_cooccurrence_nodes.tsv"), 'w')
edge_handle = open(os.path.join(self.output_dir, "entity_cooccurrence_edges.tsv"), 'w')
node_handle.write("\t".join(self.node_header) + "\n")
edge_handle.write("\t".join(self.edge_header) + "\n")
self.parse_cooccurrence(node_handle, edge_handle, data_files[1])
self.parse_cooccurrence(node_handle, edge_handle, data_files[2])

def parse_annotations(self, node_handle: Any, edge_handle: Any, data_file: str) -> None:
"""Parse annotations from CORD-19_1_2.zip.
def parse_annotations(self, node_handle: Any, edge_handle: Any,
data_file1: str,
data_file2: str) -> None:
"""Parse annotations from CORD-19_1_5.zip.
Args:
node_handle: File handle for nodes.csv.
edge_handle: File handle for edges.csv.
data_file: Path to CORD-19_1_2.zip.
data_file1: Path to first CORD-19_1_5.zip.
data_file2: Path to second CORD-19_1_5.zip.
Returns:
None.
"""
with ZipFile(data_file, 'r') as ZF:
with ZipFile(data_file1, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)
with ZipFile(data_file2, 'r') as ZF:
ZF.extractall(path=self.input_base_dir)

subsets = ['biorxiv_medrxiv', 'comm_use_subset', 'noncomm_use_subset', 'custom_license']
subsets = ['pmc_json', 'pdf_json']
for subset in subsets:
subset_dir = os.path.join(self.input_base_dir, 'CORD19', subset, subset)
subset_dir = os.path.join(self.input_base_dir, subset)
for data_dir in os.listdir(subset_dir):
if os.path.isdir(os.path.join(subset_dir, data_dir)):
for filename in os.listdir(os.path.join(subset_dir, data_dir)):
Expand Down Expand Up @@ -126,8 +136,6 @@ def parse_annotation_doc(self, node_handle, edge_handle, doc: Dict, subset: str
terms.update(self.extract_termite_hits(x))

provided_by = f"{self.source_name}"
if subset:
provided_by += f" {subset}"

# add a biolink:Publication for each paper
write_node_edge_item(
Expand Down
1 change: 0 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
"name",
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
5 changes: 2 additions & 3 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, uniprot_make_name_to_id_mapping
get_item_by_priority, uniprot_make_name_to_id_mapping, collapse_uniprot_curie

"""
Ingest protein-protein interactions from STRING DB.
Expand Down Expand Up @@ -187,7 +187,6 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
Expand All @@ -213,14 +212,14 @@ def run(self, data_file: Optional[str] = None) -> None:
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniProtKB:{string_to_uniprot_id_map[protein]}"
uniprot_curie = collapse_uniprot_curie(uniprot_curie)

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)
Expand Down
14 changes: 14 additions & 0 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
import gzip
import logging
import re
import zipfile
from typing import Any, Dict, List, Union
from tqdm import tqdm # type: ignore
Expand Down Expand Up @@ -176,3 +177,16 @@ def guess_bl_category(identifier: str) -> str:
else:
category = 'biolink:NamedThing'
return category


def collapse_uniprot_curie(uniprot_curie: str) -> str:
    """Collapse a UniProtKB isoform curie to its parent protein curie.

    For example, both UniprotKB:P63151-1 and UniprotKB:P63151-2 collapse
    to UniprotKB:P63151. Curies with any other prefix (e.g. ENSEMBL:...)
    are returned unchanged.

    :param uniprot_curie: curie such as UniprotKB:P63151-2
    :return: curie with any trailing isoform suffix stripped
    """
    # Only touch curies with a uniprotkb: prefix; prefix match is
    # case-insensitive so 'UniprotKB:' and 'uniprotkb:' both qualify.
    if re.match(r'^uniprotkb:', uniprot_curie, re.IGNORECASE):
        # Strip a trailing isoform suffix like '-1' or '-12'
        uniprot_curie = re.sub(r'-\d+$', '', uniprot_curie)
    return uniprot_curie
Binary file modified tests/resources/string/HUMAN_9606_idmapping.dat.gz
Binary file not shown.
21 changes: 21 additions & 0 deletions tests/test_gpi_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os
import unittest

from kg_covid_19.transform_utils.sars_cov_2_gene_annot.sars_cov_2_gene_annot import \
_gpi12iterator


class TestGpiFile(unittest.TestCase):
    """Sanity checks for the curated SARS-CoV-2 GPI file."""

    def setUp(self) -> None:
        # Path is relative to the repo root, where the test suite runs.
        self.gpi_file = 'curated/ORFs/uniprot_sars-cov-2.gpi'
        # Number of gene records the curated file is expected to contain.
        self.expected_sars_cov2_genes = 32

    def test_gpi_file_exists(self):
        self.assertTrue(os.path.exists(self.gpi_file))

    def test_gpi_parsing(self):
        # The record count, not record contents, is what we pin down here.
        with open(self.gpi_file, 'r') as gpi_fh:
            num_records = sum(1 for _ in _gpi12iterator(gpi_fh))
        self.assertEqual(self.expected_sars_cov2_genes, num_records)
6 changes: 3 additions & 3 deletions tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def test_nodes_file(self):
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((10, 7), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
self.assertEqual((10, 6), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'xrefs',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
Expand All @@ -66,7 +66,7 @@ def test_nodes_file(self):
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))
self.assertEqual('UniProtKB:P84085',
self.assertEqual('UniProtKB:P84085', # isoform (-2) stripped off
node_df.loc[node_df['id'] ==
'ENSEMBL:ENSP00000000233'].xrefs.item())

Expand Down
12 changes: 10 additions & 2 deletions tests/test_transform_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest
from parameterized import parameterized
from kg_covid_19.utils.transform_utils import guess_bl_category
from kg_covid_19.utils.transform_utils import guess_bl_category, collapse_uniprot_curie


class TestTransformUtils(unittest.TestCase):
Expand All @@ -13,5 +13,13 @@ class TestTransformUtils(unittest.TestCase):
def test_guess_bl_category(self, curie, category):
self.assertEqual(category, guess_bl_category(curie))


# Each case is [input_curie, expected_collapsed_curie]: non-UniProtKB
# curies pass through unchanged; UniProtKB isoform suffixes (-1, -2, ...)
# are stripped, with a case-insensitive prefix match.
@parameterized.expand([
    ['foobar', 'foobar'],
    ['ENSEMBL:ENSG00000178607', 'ENSEMBL:ENSG00000178607'],
    ['UniprotKB:P63151-1', 'UniprotKB:P63151'],
    ['uniprotkb:P63151-1', 'uniprotkb:P63151'],
    ['UniprotKB:P63151-2', 'UniprotKB:P63151'],
])
def test_collapse_uniprot_curie(self, curie, collapsed_curie):
    self.assertEqual(collapsed_curie, collapse_uniprot_curie(curie))

0 comments on commit e79bcaa

Please sign in to comment.