Skip to content

Commit

Permalink
Refactor such that Uniprot IDs are node properties in xrefs column (instead of edges)
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Jun 17, 2020
1 parent e388719 commit cf1672d
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 30 deletions.
3 changes: 2 additions & 1 deletion kg_covid_19/transform_utils/string_ppi/node_header.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@
"category",
"description",
"alias",
"xrefs",
"provided_by"
]
]
43 changes: 18 additions & 25 deletions kg_covid_19/transform_utils/string_ppi/string_ppi.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,15 @@
Write node and edge headers that look something like:
Node:
id name category
protein:1234 TBX4 Protein
id name category xrefs provided_by
protein:1234 TBX4 biolink:Protein UniprotKB:123456 STRING
xrefs contains the UniprotKB id for the protein, if available
Edge:
subject edge_label object relation
protein:1234 interacts_with protein:4567 RO:0002434
Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so:
subject edge_label object relation
protein:1234 bl:xrefs protein:4567 RO:0002434
"""

Expand Down Expand Up @@ -188,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None:
'biolink:Gene',
gene_informations['description'],
f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
"",
self.source_name
]
)
Expand All @@ -206,32 +206,25 @@ def run(self, data_file: Optional[str] = None) -> None:
# write node data
if protein not in seen_proteins:
seen_proteins.add(protein)
write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type, "", "", self.source_name]
)

# if we have an equivalent Uniprot ID for this Ensembl protein ID,
# record it as a UniprotKB CURIE in the node's xrefs column
uniprot_curie = ''
if protein in string_to_uniprot_id_map:
uniprot_curie = \
f"UniprotKB:{string_to_uniprot_id_map[protein]}"
write_node_edge_item(
fh=node,
header=self.node_header,
data=[uniprot_curie, "",
protein_node_type, "", "", self.source_name])
write_node_edge_item(
fh=edge,
header=self.edge_header,
data=[f"ENSEMBL:{protein}",
"biolink:xrefs",
uniprot_curie,
"biolink:xrefs",
"uniprot",
] + self.extra_header)

write_node_edge_item(
fh=node,
header=self.node_header,
data=[f"ENSEMBL:{protein}", "",
protein_node_type,
"",
"",
uniprot_curie, # xref
self.source_name]
)


# write edge data
write_node_edge_item(
Expand Down
11 changes: 7 additions & 4 deletions tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,23 +56,26 @@ def test_nodes_file(self):
node_file = os.path.join(self.string_output_dir, "nodes.tsv")
self.assertTrue(os.path.isfile(node_file))
node_df = pd.read_csv(node_file, sep="\t", header=0)
self.assertEqual((11, 6), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias',
self.assertEqual((10, 7), node_df.shape)
self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
'provided_by'], list(node_df.columns))
self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
self.assertCountEqual(['ENSEMBL:ENSP00000000233',
'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
'ENSEMBL:ENSP00000371253'],
list(node_df.id.unique()))
self.assertCountEqual('UniprotKB:P84085',
node_df.loc[node_df['id'] ==
'ENSEMBL:ENSP00000000233'].xrefs.item())

def test_edges_file(self):
self.string.run()
edge_file = os.path.join(self.string_output_dir, "edges.tsv")
self.assertTrue(os.path.isfile(edge_file))
edge_df = pd.read_csv(edge_file, sep="\t", header=0)
self.assertEqual((10, 19), edge_df.shape)
self.assertEqual((9, 19), edge_df.shape)
self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
'combined_score', 'neighborhood', 'neighborhood_transferred',
'fusion', 'cooccurence', 'homology', 'coexpression',
Expand Down

0 comments on commit cf1672d

Please sign in to comment.