Skip to content

Commit

Permalink
Merge 7fd71a3 into 4f8cd0f
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Sep 15, 2020
2 parents 4f8cd0f + 7fd71a3 commit 5fc5660
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 215 deletions.
7 changes: 6 additions & 1 deletion download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
# # brief comment about file, and optionally a local_name:
# url: http://curefordisease.org/some_data.txt
# local_name: some_data_more_chars_prevent_name_collision.pdf

#
# For downloading from S3 buckets, see here for information about what URL to use:
# https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html#access-bucket-intro
# Amazon S3 virtual hosted style URLs follow the format shown below:
# https://bucket-name.s3.Region.amazonaws.com/key_name
#
---
#
# Zhou host proteins - viral protein interactions
Expand Down
139 changes: 4 additions & 135 deletions kg_covid_19/merge_utils/merge_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
from typing import Dict, List
import yaml
import networkx as nx
from kgx import NeoTransformer
from kgx.cli.utils import get_file_types, get_transformer
from kgx.operations.graph_merge import merge_all_graphs
from kgx.operations.summarize_graph import generate_graph_stats
from kgx.cli.cli_utils import merge


def parse_load_config(yaml_file: str) -> Dict:
Expand All @@ -24,145 +21,17 @@ def parse_load_config(yaml_file: str) -> Dict:
config = yaml.load(YML, Loader=yaml.FullLoader)
return config

# For NT export, any property that needs to be treated
# as anything but xsd:string should be defined here.
PROPERTY_TYPES = {
'combined_score': 'xsd:float',
"neighborhood": 'xsd:float',
"neighborhood_transferred": 'xsd:float',
"fusion": 'xsd:float',
"cooccurence": 'xsd:float',
"homology": 'xsd:float',
"coexpression": 'xsd:float',
"coexpression_transferred": 'xsd:float',
"experiments": 'xsd:float',
"experiments_transferred": 'xsd:float',
"database": 'xsd:float',
"database_transferred": 'xsd:float',
"textmining": 'xsd:float',
"textmining_transferred": 'xsd:float'
}

def load_and_merge(yaml_file: str) -> nx.MultiDiGraph:
def load_and_merge(yaml_file: str, processes: int = 1) -> nx.MultiDiGraph:
"""Load and merge sources defined in the config YAML.
Args:
yaml_file: A string pointing to a KGX compatible config YAML.
processes: Number of processes to use.
Returns:
networkx.MultiDiGraph: The merged graph.
"""
config = parse_load_config(yaml_file)
transformers: List = []

# make sure all files exist before we start load
for key in config['target']:
target = config['target'][key]
logging.info("Checking that file exist for {}".format(key))
if target['type'] in get_file_types():
for f in target['filename']:
if not os.path.exists(f) or not os.path.isfile(f):
raise FileNotFoundError("File {} for transform {} in yaml file {} "
"doesn't exist! Dying.", f, key, yaml_file)

# read all the sources defined in the YAML
for key in config['target']:
target = config['target'][key]
logging.info("Loading {}".format(key))
if target['type'] in get_file_types():
# loading from a file
transformer = get_transformer(target['type'])()
if target['type'] in {'tsv', 'neo4j'}:
if 'filters' in target:
apply_filters(target, transformer)
for f in target['filename']:
transformer.parse(f, input_format='tsv')
transformer.graph.name = key
if 'operations' in target:
apply_operations(target, transformer)
transformers.append(transformer)
elif target['type'] == 'neo4j':
transformer = NeoTransformer(None, target['uri'], target['username'], target['password'])
if 'filters' in target:
apply_filters(target, transformer)
transformer.load()
if 'operations' in target:
apply_operations(target, transformer)
transformers.append(transformer)
transformer.graph.name = key
else:
logging.error("type {} not yet supported".format(target['type']))
stats_filename = f"{key}_stats.yaml"
generate_graph_stats(transformer.graph, key, stats_filename)

# merge all subgraphs into a single graph
merged_graph = merge_all_graphs([x.graph for x in transformers])
merged_graph.name = 'merged_graph'
generate_graph_stats(merged_graph, merged_graph.name, "merged_graph_stats.yaml", ['provided_by'], ['provided_by'])

# write the merged graph
if 'destination' in config:
for _, destination in config['destination'].items():
if destination['type'] == 'neo4j':
destination_transformer = NeoTransformer(
merged_graph,
uri=destination['uri'],
username=destination['username'],
password=destination['password']
)
destination_transformer.save()
elif destination['type'] in get_file_types():
destination_transformer = get_transformer(destination['type'])(merged_graph)
mode = 'w:gz' if destination['type'] in {'tsv'} else None
if destination['type'] in {'nt', 'nt.gz', 'ttl'}:
destination_transformer.set_property_types(PROPERTY_TYPES)
destination_transformer.save(destination['filename'], output_format=destination['type'], mode=mode)
else:
logging.error("type {} not yet supported for KGX load-and-merge operation.".format(destination['type']))

merged_graph = merge(yaml_file, processes=processes)
return merged_graph


def apply_filters(target, transformer):
    """Attach the node/edge filters from a merge-config target to a transformer.

    Args:
        target: A target entry from the merge YAML (expects a 'filters' key).
        transformer: The KGX transformer that will load this target.

    Returns:
        None
    """
    filter_config = target['filters']
    node_filters = filter_config.get('node_filters', {})
    edge_filters = filter_config.get('edge_filters', {})
    for name, values in node_filters.items():
        transformer.set_node_filter(name, set(values))
    for name, values in edge_filters.items():
        transformer.set_edge_filter(name, set(values))
    logging.info(f"with node filters: {node_filters}")
    logging.info(f"with edge filters: {edge_filters}")


def apply_operations(target, transformer):
    """Run the graph operations listed for a merge-config target.

    Each operation entry names a fully qualified function ('module.func');
    the module is imported dynamically and the function is called with the
    transformer's graph as the first argument plus the configured kwargs.

    Args:
        target: A target entry from the merge YAML (expects an 'operations' key).
        transformer: The KGX transformer whose graph the operations act on.

    Returns:
        None
    """
    for operation in target['operations']:
        op_name = operation['name']
        op_args = operation['args']
        name_parts = op_name.split('.')
        module_path = '.'.join(name_parts[:-1])
        func = getattr(importlib.import_module(module_path), name_parts[-1])
        logging.info(f"Applying operation {op_name} with args: {op_args}")
        func(transformer.graph, **op_args)
161 changes: 97 additions & 64 deletions merge.yaml
Original file line number Diff line number Diff line change
@@ -1,100 +1,133 @@
---
target:
drug-central:
configuration:
output_directory: data/merged
checkpoint: false
property_types:
# define the type for non-canonical node/edge properties
combined_score: 'xsd:float'
confidence_score: 'xsd:float'
neighborhood: 'xsd:float'
neighborhood_transferred: 'xsd:float'
fusion: 'xsd:float'
cooccurence: 'xsd:float'
homology: 'xsd:float'
coexpression: 'xsd:float'
coexpression_transferred: 'xsd:float'
experiments: 'xsd:float'
experiments_transferred: 'xsd:float'
database: 'xsd:float'
database_transferred: 'xsd:float'
textmining: 'xsd:float'
textmining_transferred: 'xsd:float'

merged_graph:
name: KG-COVID-19 Graph
targets:
drug-central:
type: tsv
filename:
- data/transformed/drug_central/nodes.tsv
- data/transformed/drug_central/edges.tsv
pharmgkb:
- data/transformed/drug_central/nodes.tsv
- data/transformed/drug_central/edges.tsv
pharmgkb:
type: tsv
filename:
- data/transformed/pharmgkb/nodes.tsv
- data/transformed/pharmgkb/edges.tsv
STRING:
- data/transformed/pharmgkb/nodes.tsv
- data/transformed/pharmgkb/edges.tsv
STRING:
type: tsv
filename:
- data/transformed/STRING/nodes.tsv
- data/transformed/STRING/edges.tsv
- data/transformed/STRING/nodes.tsv
- data/transformed/STRING/edges.tsv
filters:
node_filters:
category:
- biolink:Gene
- biolink:Protein
edge_filters:
subject_category:
- biolink:Gene
- biolink:Protein
object_category:
- biolink:Gene
- biolink:Protein
edge_label:
- biolink:interacts_with
- biolink:has_gene_product
node_filters:
category:
- biolink:Gene
- biolink:Protein
edge_filters:
subject_category:
- biolink:Gene
- biolink:Protein
object_category:
- biolink:Gene
- biolink:Protein
edge_label:
- biolink:interacts_with
- biolink:has_gene_product
operations:
- name: kgx.utils.graph_utils.remap_node_identifier
args:
- name: kgx.utils.graph_utils.remap_node_identifier
args:
category: biolink:Protein
alternative_property: xrefs
prefix: UniProtKB

ttd:
ttd:
type: tsv
filename:
- data/transformed/ttd/nodes.tsv
- data/transformed/ttd/edges.tsv
zhou-host-proteins:
- data/transformed/ttd/nodes.tsv
- data/transformed/ttd/edges.tsv
zhou-host-proteins:
type: tsv
filename:
- data/transformed/zhou_host_proteins/nodes.tsv
- data/transformed/zhou_host_proteins/edges.tsv
SciBite-CORD-19:
- data/transformed/zhou_host_proteins/nodes.tsv
- data/transformed/zhou_host_proteins/edges.tsv
SciBite-CORD-19:
type: tsv
filename:
- data/transformed/SciBite-CORD-19/nodes.tsv
- data/transformed/SciBite-CORD-19/edges.tsv
sars-cov-2-gene-annot:
- data/transformed/SciBite-CORD-19/nodes.tsv
- data/transformed/SciBite-CORD-19/edges.tsv
sars-cov-2-gene-annot:
type: tsv
filename:
- data/transformed/sars_cov_2_gene_annot/nodes.tsv
- data/transformed/sars_cov_2_gene_annot/edges.tsv
intact:
- data/transformed/sars_cov_2_gene_annot/nodes.tsv
- data/transformed/sars_cov_2_gene_annot/edges.tsv
intact:
type: tsv
filename:
- data/transformed/intact/nodes.tsv
- data/transformed/intact/edges.tsv
chembl:
- data/transformed/intact/nodes.tsv
- data/transformed/intact/edges.tsv
chembl:
type: tsv
filename:
- data/transformed/ChEMBL/nodes.tsv
- data/transformed/ChEMBL/edges.tsv
gene-ontology:
- data/transformed/ChEMBL/nodes.tsv
- data/transformed/ChEMBL/edges.tsv
gene-ontology:
type: tsv
filename:
- data/transformed/ontologies/go-plus_nodes.tsv
- data/transformed/ontologies/go-plus_edges.tsv
mondo-ontology:
- data/transformed/ontologies/go-plus_nodes.tsv
- data/transformed/ontologies/go-plus_edges.tsv
mondo-ontology:
type: tsv
filename:
- data/transformed/ontologies/mondo_nodes.tsv
- data/transformed/ontologies/mondo_edges.tsv
hp-ontology:
- data/transformed/ontologies/mondo_nodes.tsv
- data/transformed/ontologies/mondo_edges.tsv
hp-ontology:
type: tsv
filename:
- data/transformed/ontologies/hp_nodes.tsv
- data/transformed/ontologies/hp_edges.tsv
go-cams:
- data/transformed/ontologies/hp_nodes.tsv
- data/transformed/ontologies/hp_edges.tsv
go-cams:
type: tsv
filename:
- data/transformed/GOCAMs/GOCAMs_nodes.tsv
- data/transformed/GOCAMs/GOCAMs_edges.tsv
destination:
merged-kg-tsv:
- data/transformed/GOCAMs/GOCAMs_nodes.tsv
- data/transformed/GOCAMs/GOCAMs_edges.tsv
operations:
- name: kgx.operations.summarize_graph.generate_graph_stats
args:
graph_name: KG-COVID-19 Graph
filename: merged_graph_stats.yaml
node_facet_properties:
- provided_by
edge_facet_properties:
- provided_by
destination:
merged-kg-tsv:
type: tsv
filename: data/merged/merged-kg
merged-kg-nt:
type: nt.gz
filename: data/merged/merged-kg.nt.gz
# merged-kg-neo4j:
compression: tar.gz
filename: merged-kg
merged-kg-nt:
type: nt
compression: gz
filename: merged-kg.nt.gz
# merged-kg-neo4j:
# type: neo4j
# uri: http://localhost:8484
# username: neo4j
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
git+git://github.com/deepakunni3/kgx
git+git://github.com/deepakunni3/kgx@kg-covid-19
6 changes: 4 additions & 2 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,20 @@ def transform(*args, **kwargs) -> None:

@cli.command()
@click.option('yaml', '-y', default="merge.yaml", type=click.Path(exists=True))
def merge(yaml: str) -> None:
@click.option('processes', '-p', default=1, type=int)
def merge(yaml: str, processes: int) -> None:
"""Use KGX to load subgraphs to create a merged graph.
Args:
yaml: A string pointing to a KGX compatible config YAML.
processes: Number of processes to use.
Returns:
None.
"""

load_and_merge(yaml)
load_and_merge(yaml, processes)


@cli.command()
Expand Down
Loading

0 comments on commit 5fc5660

Please sign in to comment.