
Commit

Merge a3a27c8 into b630715
justaddcoffee committed May 14, 2020
2 parents b630715 + a3a27c8 commit 732be04
Showing 14 changed files with 460 additions and 7 deletions.
14 changes: 14 additions & 0 deletions kg_covid_19/query.py
@@ -0,0 +1,14 @@
import logging

from kg_covid_19.query_utils.target_candidates.target_candidates \
    import TargetCandidates

QUERIES = {
    'TargetCandidates': TargetCandidates
}


def run_query(query: str, input_dir: str, output_dir: str) -> None:
    logging.info(f"Running query {query}")
    t = QUERIES[query](input_dir, output_dir)
    t.run()
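For reference, a minimal sketch of how this dispatch is intended to be used; the directory paths are the defaults from run.py and are only illustrative:

from kg_covid_19.query import run_query

# look up the TargetCandidates class in the QUERIES registry, instantiate it
# with the input/output directories, and call its run() method
run_query(query='TargetCandidates', input_dir='data/', output_dir='data/queries/')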
Empty file.
8 changes: 8 additions & 0 deletions kg_covid_19/query_utils/query.py
@@ -0,0 +1,8 @@
class Query:
    def __init__(self, query_name: str, input_dir: str, output_dir: str) -> None:
        self.query_name = query_name
        self.input_dir = input_dir
        self.output_dir = output_dir

    def run(self, input_dir: str, output_dir: str) -> None:
        pass
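A concrete query only needs to pass its name to the base constructor and override run(); a hypothetical minimal subclass (ExampleQuery is illustrative, not part of this commit):

from kg_covid_19.query_utils.query import Query


class ExampleQuery(Query):
    def __init__(self, input_dir: str, output_dir: str):
        # the base class stores query_name, input_dir and output_dir as attributes
        super().__init__("example_query", input_dir, output_dir)

    def run(self):
        # read graph TSVs from self.input_dir, write results to self.output_dir
        pass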
Empty file.
277 changes: 277 additions & 0 deletions kg_covid_19/query_utils/target_candidates/target_candidates.py
@@ -0,0 +1,277 @@
import csv
import logging
import os
import re
import tarfile
from typing import List, Union

from tqdm import tqdm # type: ignore

from kg_covid_19.query_utils.query import Query
import pandas as pd # type: ignore


class TargetCandidates(Query):
    """Perform a query of TSV files to generate a short list of protein targets
    for further investigation.

    A TSV file is emitted with these fields:
        species (H=host, V=viral)
        protein ID
        protein/gene name
        confidence score (0-1)
        free text describing why the target was chosen

    This software is provided "as is" and any expressed or implied warranties,
    including, but not limited to, the implied warranties of merchantability and
    fitness for a particular purpose are disclaimed. In no event shall the authors or
    contributors be liable for any direct, indirect, incidental, special, exemplary, or
    consequential damages (including, but not limited to, procurement of substitute
    goods or services; loss of use, data, or profits; or business interruption)
    however caused and on any theory of liability, whether in contract, strict
    liability, or tort (including negligence or otherwise) arising in any way out of
    the use of this software, even if advised of the possibility of such damage.
    """

    def __init__(self, input_dir: str, output_dir: str):
        query_name = "target_candidates"
        super().__init__(query_name, input_dir, output_dir)  # set some variables
        # data file locations
        self.intact_nodes_file = os.path.join('transformed', 'intact', 'nodes.tsv')
        self.intact_edges_file = os.path.join('transformed', 'intact', 'edges.tsv')
        self.sars_cov_2_nodes = os.path.join('data', "transformed",
                                             "sars_cov_2_gene_annot/nodes.tsv")
        self.sars_cov_2_edges = os.path.join('data', "transformed",
                                             "sars_cov_2_gene_annot/edges.tsv")
        self.outfile_name = "target_candidates.tsv"
        self.outfile_header = ["species",
                               "protein ID",
                               "protein/gene name",
                               "confidence score",
                               "comments"]

    def run(self):
        candidates: list = []

        # read in data files
        logging.info("reading in data files")
        sars_cov2_df = pd.read_csv(self.sars_cov_2_nodes, sep="\t")
        intact_edges_df = pd.read_csv(os.path.join(self.input_dir,
                                                   self.intact_edges_file),
                                      sep='\t')
        intact_nodes_df = pd.read_csv(os.path.join(self.input_dir,
                                                   self.intact_nodes_file),
                                      sep='\t')

        # add SARS-CoV-2 proteins from gene annotation transform
        logging.info("adding annotated SARS-CoV-2 proteins from annotations")
        candidates.extend(
            self.sars_cov2_to_candidate_entries(sars_cov2_df,
                                                'V', 'id', 'name', 1,
                                                "annotated SARS-CoV-2 gene"))
        sars_cov2_ids = [c[1] for c in candidates]

        # append PRO_ SARS-CoV-2 proteins
        logging.info("adding SARS-CoV-2 PRO_ proteins")
        candidates.extend(
            self.sars_cov2_pro_candidates(sars_cov2_ids,
                                          sars_cov2_df,
                                          'V', 'id', 'name', 1,
                                          "annotated SARS-CoV-2 gene"))

        sars_cov2_ids_plus_pro = [c[1] for c in candidates]

        logging.info("adding SARS-CoV-2 proteins present in IntAct")
        candidates.extend(
            self.sars_cov2_in_intact_set_to_candidate_entries(
                existing_ids=sars_cov2_ids_plus_pro,
                taxon_id=2697049,
                nodes_df=intact_nodes_df,
                taxid_col='ncbi_taxid')
        )

        all_sars_cov2_ids = [c[1] for c in candidates]

        # append list of proteins that interact with SARS-CoV-2 according to IntAct
        logging.info("adding human proteins that interact with SARS-CoV-2 proteins "
                     "according to IntAct")
        candidates.extend(
            self.sars_cov2_human_interactors_to_candidate_entries(
                sars_cov2_ids=all_sars_cov2_ids,
                provided_by='intact',
                edge_df=intact_edges_df,
                nodes_df=intact_nodes_df,
                viral_or_host="H",
                subject_and_object_columns=['subject', 'object'],
                id_col_in_node_tsv='id',
                name_col='name',
                confidence_score=0.75,
                comments='inferred from intact')
        )

        os.makedirs(self.output_dir, exist_ok=True)
        with open(os.path.join(self.output_dir, self.outfile_name), 'w',
                  newline="") as out:
            tsv_output = csv.writer(out, delimiter='\t')
            tsv_output.writerow(self.outfile_header)
            for candidate in candidates:
                tsv_output.writerow(candidate)

    def sars_cov2_in_intact_set_to_candidate_entries(self,
                                                     existing_ids: list,
                                                     taxon_id: int,
                                                     nodes_df,
                                                     taxid_col: str,
                                                     id_col: str = 'id',
                                                     name_col: str = 'name'
                                                     ):
        """Extract the list of SARS-CoV-2 proteins from the IntAct nodes file.

        :param existing_ids: exclude entries present in this list
        :param taxon_id: taxon ID to search for
        :param nodes_df: pandas dataframe of IntAct nodes
        :param taxid_col: column name with taxon id
        :param id_col: column name with node IDs
        :param name_col: column name with node names
        :return: list of candidate entries
        """

        candidate_entries: list = []
        rows = nodes_df[nodes_df[taxid_col] == taxon_id]
        for _, row in rows.iterrows():
            if row[id_col] not in existing_ids:
                candidate_entries.append(['V', row[id_col], row[name_col], 1,
                                          'present in intact database'])
        return candidate_entries

    def sars_cov2_pro_candidates(self,
                                 these_ids: list,
                                 sars_cov2_df,
                                 viral_or_host: str,
                                 id_col: str,
                                 name_col: str,
                                 confidence_score: float,
                                 comments: str):
        """Given SARS-CoV-2 gene annotations, extract PRO_ entries and make
        candidate entries.

        :param these_ids: IDs for which to look in the nodes df for PRO_ genes
        :param sars_cov2_df: pandas dataframe of SARS-CoV-2 gene annotation nodes
        :param viral_or_host: 'V' for viral, 'H' for host
        :param id_col: column name with node IDs
        :param name_col: column name with node names
        :param confidence_score: confidence score (0-1)
        :param comments: free text describing why the target was chosen
        :return: List - candidate entries in the format described in the class docstring
        """
        candidate_entries: list = []

        all_ids = list(sars_cov2_df[id_col])
        all_names = list(sars_cov2_df[name_col])
        for id_to_match in these_ids:
            this_re = re.compile(id_to_match)
            matching_indices = [i for i, item in enumerate(all_ids)
                                if this_re.search(item)]

            for this_idx in matching_indices:
                this_id = all_ids[this_idx]
                this_name = all_names[this_idx]

                if this_id == id_to_match:  # don't match identical IDs
                    continue
                candidate_entry = [viral_or_host, this_id, this_name,
                                   confidence_score, comments]
                candidate_entries.append(candidate_entry)
        return candidate_entries
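The method above treats each known ID as a regular expression and keeps any other node whose ID contains it as a substring, which is how PRO_ cleavage-product entries are picked up; a small sketch with made-up IDs, just to illustrate the matching logic:

import re

# hypothetical IDs: a parent protein and a PRO_ cleavage-product entry
all_ids = ['UniProtKB:P0DTD1', 'UniProtKB:P0DTD1-PRO_0000449623']
id_to_match = 'UniProtKB:P0DTD1'

this_re = re.compile(id_to_match)
# both entries match the pattern; identical IDs are skipped, leaving the PRO_ chain
matches = [i for i in all_ids if this_re.search(i) and i != id_to_match]
print(matches)  # ['UniProtKB:P0DTD1-PRO_0000449623']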

    def sars_cov2_to_candidate_entries(self,
                                       sars_cov2_df,
                                       viral_or_host: str,
                                       id_col: str,
                                       name_col: str,
                                       confidence_score: float,
                                       comments: str
                                       ) -> List[List[Union[str, int]]]:
        """Given SARS-CoV-2 gene annotations, extract and make candidate entries.

        :param sars_cov2_df: pandas dataframe of SARS-CoV-2 gene annotation nodes
        :param viral_or_host: 'V' for viral, 'H' for host
        :param id_col: column name with node IDs
        :param name_col: column name with node names
        :param confidence_score: confidence score (0-1)
        :param comments: free text describing why the target was chosen
        :return: List - candidate entries in the format described in the class docstring
        """
        candidate_entries: list = []

        for _, row in sars_cov2_df.iterrows():
            candidate_entry = [viral_or_host, None, None, confidence_score, comments]
            if id_col in row:
                candidate_entry[1] = row[id_col]
            if name_col in row:
                candidate_entry[2] = row[name_col]
            candidate_entries.append(candidate_entry)
        return candidate_entries

    def sars_cov2_human_interactors_to_candidate_entries(self,
                                                         sars_cov2_ids: list,
                                                         provided_by: str,
                                                         edge_df,
                                                         nodes_df,
                                                         viral_or_host: str,
                                                         subject_and_object_columns: list,
                                                         id_col_in_node_tsv: str,
                                                         name_col: str,
                                                         confidence_score: float,
                                                         comments: str
                                                         ) -> List[List[Union[str, int]]]:
        """Given SARS-CoV-2 gene IDs and pandas DFs with edge and node data, extract
        all proteins that interact with these SARS-CoV-2 genes according to the
        source(s) in 'provided_by'.

        :param sars_cov2_ids: IDs of SARS-CoV-2 proteins to find interactors for
        :param provided_by: only use edges with this 'provided_by' value
        :param edge_df: pandas dataframe of edges
        :param nodes_df: pandas dataframe of nodes
        :param viral_or_host: 'V' for viral, 'H' for host
        :param subject_and_object_columns: names of the subject and object columns
            in the edge dataframe
        :param id_col_in_node_tsv: column name with node IDs
        :param name_col: column name with node names
        :param confidence_score: confidence score (0-1)
        :param comments: free text describing why the target was chosen
        :return: List - candidate entries in the format described in the class docstring
        """
        candidate_entries: list = []

        # find edges for 'provided_by'
        these_edges = edge_df[edge_df.provided_by == provided_by]

        interactor_ids: list = []

        # for each SARS-CoV-2 ID, collect the ID on the other end of each edge
        for this_id in tqdm(sars_cov2_ids):

            for _, row in these_edges.iterrows():
                if row[subject_and_object_columns[0]] == this_id:
                    interactor_col_name = subject_and_object_columns[1]
                elif row[subject_and_object_columns[1]] == this_id:
                    interactor_col_name = subject_and_object_columns[0]
                else:
                    continue

                interactor_ids.append(row[interactor_col_name])

        interactor_ids = list(set(interactor_ids))

        for interactor_id in interactor_ids:
            # find interactor name in the nodes dataframe
            name = "NA"
            try:
                node_rows = nodes_df[nodes_df[id_col_in_node_tsv] == interactor_id]
                name = node_rows[name_col].iloc[0]
            except (KeyError, IndexError):
                logging.warning("Problem getting name for id: %s", interactor_id)

            candidate_entry = [viral_or_host, interactor_id, name, confidence_score,
                               comments]
            candidate_entries.append(candidate_entry)

        return candidate_entries
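For reference, a quick sketch of inspecting the query output once run() has executed; the path assumes the default output directory from run.py and is only illustrative:

import os
import pandas as pd

# columns correspond to outfile_header defined in TargetCandidates.__init__
df = pd.read_csv(os.path.join("data/queries/", "target_candidates.tsv"), sep="\t")
print(df.columns.tolist())
# ['species', 'protein ID', 'protein/gene name', 'confidence score', 'comments']
print(df["species"].value_counts())  # counts of viral (V) vs host (H) candidates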
9 changes: 8 additions & 1 deletion kg_covid_19/transform_utils/intact/intact.py
@@ -51,6 +51,7 @@ def __init__(self, input_dir: str = None, output_dir: str = None) -> None:
        self.pubmed_curie_prefix = 'PMID:'
        self.ppi_edge_label = 'biolink:interacts_with'
        self.ppi_ro_relation = 'RO:0002437'
        self.node_header = ['id', 'name', 'category', 'ncbi_taxid']
        self.edge_header = ['subject', 'edge_label', 'object', 'relation', 'provided_by',
                            'publication', 'num_participants', 'association_type',
                            'detection_method', 'subj_exp_role', 'obj_exp_role']
@@ -233,6 +234,12 @@ def interactor_to_node(self, interactor) -> List[Union[int, list]]:
                "Problem parsing id in xref interaction %s" % e)

        name = ''

        try:
            tax_id = interactor.getElementsByTagName('organism')[0].attributes['ncbiTaxId'].value
        except (KeyError, IndexError, AttributeError) as e:
            tax_id = "NA"

        try:
            # xml parsing amirite
            name = interactor.getElementsByTagName('names')[0].getElementsByTagName(
@@ -253,7 +260,7 @@ def interactor_to_node(self, interactor) -> List[Union[int, list]]:
            logging.warning(
                "Problem parsing name in xref interaction %s" % e)

        return [interactor_id, [this_id, name, category]]
        return [interactor_id, [this_id, name, category, tax_id]]

    def parse_experiment_info(self, xmldoc: object) -> Dict[int, str]:
        """Extract info about experiment from miXML doc
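The ncbiTaxId lookup added to interactor_to_node above can be exercised in isolation; a minimal sketch using xml.dom.minidom on a made-up PSI-MI-style snippet:

from xml.dom import minidom

# tiny, made-up interactor element with an organism/ncbiTaxId attribute
doc = minidom.parseString(
    '<interactor><organism ncbiTaxId="2697049"/></interactor>')
interactor = doc.documentElement

try:
    tax_id = interactor.getElementsByTagName(
        'organism')[0].attributes['ncbiTaxId'].value
except (KeyError, IndexError, AttributeError):
    tax_id = "NA"  # fall back, as in the commit, when the attribute is missing

print(tax_id)  # '2697049', the SARS-CoV-2 taxon ID queried in target_candidates.py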
31 changes: 28 additions & 3 deletions run.py
@@ -1,20 +1,23 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging

import click
from kg_covid_19 import download as kg_download
from kg_covid_19 import transform as kg_transform
from kg_covid_19.load_utils.merge_kg import load_and_merge
from kg_covid_19.query import QUERIES, run_query
from kg_covid_19.transform import DATA_SOURCES


@click.group()
def cli():
    pass


@cli.command()
@click.option("yaml_file", "-y", required=True, default="download.yaml", type=click.Path(exists=True))
@click.option("yaml_file", "-y", required=True, default="download.yaml",
              type=click.Path(exists=True))
@click.option("output_dir", "-o", required=True, default="data/raw")
@click.option("ignore_cache", "-i", is_flag=True, default=False,
              help='ignore cache and download files even if they exist [false]')
@@ -40,7 +43,8 @@ def download(*args, **kwargs) -> None:
@cli.command()
@click.option("input_dir", "-i", default="data/raw", type=click.Path(exists=True))
@click.option("output_dir", "-o", default="data/transformed")
@click.option("sources", "-s", default=None, multiple=True, type=click.Choice(DATA_SOURCES.keys()))
@click.option("sources", "-s", default=None, multiple=True,
type=click.Choice(DATA_SOURCES.keys()))
def transform(*args, **kwargs) -> None:
"""Calls scripts in kg_covid_19/transform/[source name]/ to transform each source
into nodes and edges.
@@ -77,5 +81,26 @@ def load(yaml: str) -> None:
    load_and_merge(yaml)


@cli.command()
@click.option("query", "-q", required=True, default=None, multiple=False,
type=click.Choice(QUERIES.keys()))
@click.option("input_dir", "-i", default="data/")
@click.option("output_dir", "-o", default="data/queries/")
def query(query: str, input_dir: str, output_dir: str) -> None:
"""Perform a query of knowledge graph using a class contained in query_utils
Args:
query: A query class containing instructions for performing a query
input_dir: Directory where any input files required to execute query are
located (typically 'data', where transformed and merged graph files are)
output_dir: Directory to output results of query
Returns:
None.
"""
run_query(query=query, input_dir=input_dir, output_dir=output_dir)


if __name__ == "__main__":
    cli()
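With the new subcommand in place, the query can be driven from the command line (python run.py query -q TargetCandidates -i data/ -o data/queries/) or programmatically via click's test runner; a small sketch, assuming run.py is importable from the working directory:

from click.testing import CliRunner

from run import cli

# equivalent to: python run.py query -q TargetCandidates -i data/ -o data/queries/
runner = CliRunner()
result = runner.invoke(
    cli, ["query", "-q", "TargetCandidates", "-i", "data/", "-o", "data/queries/"])
print(result.exit_code)  # 0 on success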