Skip to content

Commit

Permalink
Added PRO SARS-CoV-2 proteins
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed May 14, 2020
1 parent ac71a0f commit a3a27c8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 13 deletions.
34 changes: 26 additions & 8 deletions kg_covid_19/query_utils/target_candidates/target_candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ def run(self):
# read in data files
logging.info("reading in data files")
sars_cov2_df = pd.read_csv(self.sars_cov_2_nodes, sep="\t")
merged_edges_df = pd.read_csv(os.path.join(self.input_dir,
intact_edges_df = pd.read_csv(os.path.join(self.input_dir,
self.intact_edges_file),
sep='\t')
merged_nodes_df = pd.read_csv(os.path.join(self.input_dir,
intact_nodes_df = pd.read_csv(os.path.join(self.input_dir,
self.intact_nodes_file),
sep='\t')

Expand All @@ -79,9 +79,15 @@ def run(self):
'V', 'id', 'name', 1,
"annotated SARS-CoV-2 gene"))

sars_cov2_ids_plus_pro = [c[1] for c in candidates]

logging.info("adding SARS-CoV-2 proteins present in IntAct")
candidates.extend(
self.sars_cov2_in_intact_set_to_candidate_entries()
self.sars_cov2_in_intact_set_to_candidate_entries(
existing_ids=sars_cov2_ids_plus_pro,
taxon_id=2697049,
nodes_df=intact_nodes_df,
taxid_col='ncbi_taxid')
)

all_sars_cov2_ids = [c[1] for c in candidates]
Expand All @@ -93,8 +99,8 @@ def run(self):
self.sars_cov2_human_interactors_to_candidate_entries(
sars_cov2_ids=all_sars_cov2_ids,
provided_by='intact',
edge_df=merged_edges_df,
nodes_df=merged_nodes_df,
edge_df=intact_edges_df,
nodes_df=intact_nodes_df,
viral_or_host="H",
subject_and_object_columns=['subject', 'object'],
id_col_in_node_tsv='id',
Expand All @@ -112,16 +118,28 @@ def run(self):

def sars_cov2_in_intact_set_to_candidate_entries(self,
                                                 existing_ids: list,
                                                 taxon_id: int,
                                                 nodes_df,
                                                 taxid_col: str,
                                                 id_col='id',
                                                 name_col='name'
                                                 ) -> list:
    """Extract a list of SARS-CoV-2 proteins from the IntAct nodes file

    Rows of `nodes_df` whose `taxid_col` value equals `taxon_id` are
    selected; every selected row whose id is not already listed in
    `existing_ids` becomes one candidate entry.

    :param existing_ids: exclude entries present in this list
    :param taxon_id: taxon ID to search for
    :param nodes_df: pandas dataframe for intact nodes
    :param taxid_col: column name with taxon id
    :param id_col: column name holding the node id (default 'id')
    :param name_col: column name holding the node name (default 'name')
    :return: list of candidate entries, each of the form
        ['V', <id>, <name>, 1, 'present in intact database'],
        in the order the rows appear in `nodes_df`
    """
    # Build the lookup once so the membership test is O(1) per row.
    already_listed = set(existing_ids)

    # Keep only the rows annotated with the requested taxon.
    taxon_rows = nodes_df[nodes_df[taxid_col] == taxon_id]

    return [
        ['V', node_id, node_name, 1, 'present in intact database']
        for node_id, node_name in zip(taxon_rows[id_col],
                                      taxon_rows[name_col])
        if node_id not in already_listed
    ]

def sars_cov2_pro_candidates(self,
these_ids: list,
Expand Down
11 changes: 6 additions & 5 deletions tests/test_targetCandidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ def test_sars_cov2_in_intact_set_to_candidate_entries(self):
self.assertTrue(hasattr(self.tc,
'sars_cov2_in_intact_set_to_candidate_entries'))
existing_ids = ['UniProtKB:P0DTC2'] # don't want these again
nodes_df = pd.read_csv("tests/resources/P0DTC1.nodes.tsv", sep="\t")
nodes_df = pd.read_csv("tests/resources/test_sars_cov2_intact_nodes.tsv",
sep="\t")
candidates = self.tc.sars_cov2_in_intact_set_to_candidate_entries(
existing_ids=existing_ids,
taxon_id='2697049',
nodes_df=nodes_df
)
existing_ids=existing_ids,
taxon_id=2697049,
nodes_df=nodes_df,
taxid_col='ncbi_taxid')
self.assertEqual(1, len(candidates))
self.assertEqual(
['V',
Expand Down

0 comments on commit a3a27c8

Please sign in to comment.