From b287df50e5d2ddc8a8134a149383103b6bc6361b Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 15:48:27 -0700 Subject: [PATCH 01/10] Added item in STRING section of download.yaml for HUMAN_9606_idmapping.dat.gz --- download.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/download.yaml b/download.yaml index e9a948bd..0d8ea12f 100644 --- a/download.yaml +++ b/download.yaml @@ -54,6 +54,13 @@ url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz local_name: gene_info.gz +- + # this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235) + # to be ID mapped in merge step + # nb: we are also downloading and using this file for the TTD transform + url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz + local_name: HUMAN_9606_idmapping.dat.gz + # # TTD - Therapeutic Targets Database # drug targets, and associated data for each (drugs, ids, etc) From 651a37e5dcad2dccb18bbaf0a10e5ba5c75adbd7 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 18:40:38 -0700 Subject: [PATCH 02/10] Some tests for STRING --- .../transform_utils/string_ppi/string_ppi.py | 63 ++++++++++++++++-- .../9606.protein.links.full.v11.0.txt.gz | Bin 0 -> 345 bytes .../string/HUMAN_9606_idmapping.dat.gz | Bin 0 -> 241 bytes tests/resources/string/gene2ensembl.gz | Bin 0 -> 305 bytes tests/resources/string/gene_info.gz | Bin 0 -> 1371 bytes tests/test_string.py | 59 ++++++++++++++++ 6 files changed, 117 insertions(+), 5 deletions(-) create mode 100644 tests/resources/string/9606.protein.links.full.v11.0.txt.gz create mode 100644 tests/resources/string/HUMAN_9606_idmapping.dat.gz create mode 100644 tests/resources/string/gene2ensembl.gz create mode 100644 tests/resources/string/gene_info.gz create mode 100644 tests/test_string.py diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index 0585d240..3c08e5fe 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -1,12 +1,12 @@ import gzip +import logging import os import compress_json # type: ignore from typing import Dict, List, Any, Set, Optional from kg_covid_19.transform_utils.transform import Transform -from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority - -from encodeproject import download as encode_download # type: ignore +from kg_covid_19.utils.transform_utils import write_node_edge_item, \ + get_item_by_priority, uniprot_make_name_to_id_mapping """ Ingest protein-protein interactions from STRING DB. @@ -24,12 +24,20 @@ Edge: subject edge_label object relation protein:1234 interacts_with protein:4567 RO:0002434 + +Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so: +subject edge_label object relation +protein:1234 bl:xrefs protein:4567 RO:0002434 + """ NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/' PROTEIN_MAPPING_FILE = 'gene2ensembl.gz' GENE_INFO_FILE = 'gene_info.gz' +# make name to id map for uniprot names of human proteins +UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz" + class StringTransform(Transform): """ @@ -43,7 +51,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None): self.protein_gene_map: Dict[str, Any] = {} self.gene_info_map: Dict[str, Any] = {} self.ensembl2ncbi_map: Dict[str, Any] = {} + logging.info("Loading Ensembl Gene to Protein mapping") self.load_mapping(self.input_base_dir, self.output_dir, ['9606']) + logging.info("Load mappings from NCBI gene_info") self.load_gene_info(self.input_base_dir, self.output_dir, ['9606']) def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None: @@ -143,6 +153,10 @@ def run(self, data_file: Optional[str] = None) -> None: # with the default header extra_header = [""]*(len(edge_additional_headers)+1) + # make string ENSP to Uniprot id mapping dict + string_to_uniprot_id_map = uniprot_make_name_to_id_mapping( + os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING)) + with open(self.output_node_file, 'w') as node, \ open(self.output_edge_file, 'w') as edge, \ gzip.open(data_file, 'rt') as interactions: @@ -155,9 +169,25 @@ def run(self, data_file: Optional[str] = None) -> None: items_dict = parse_stringdb_interactions(line, header_items) proteins = [] for protein_name in ('protein1', 'protein2'): - protein = get_item_by_priority(items_dict, [protein_name]) - protein = '.'.join(protein.split('.')[1:]) + nat_string_id = get_item_by_priority(items_dict, [protein_name]) + protein = '.'.join(nat_string_id.split('.')[1:]) proteins.append(protein) + + if nat_string_id in string_to_uniprot_id_map: + make_xref_node_and_edge_entries(protein, + string_to_uniprot_id_map) + + write_node_edge_item( + fh=edge, + header=self.edge_header, + data=[f"ENSEMBL:{protein}", + "biolink:xrefs", + f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}", + "biolink:xrefs", + "uniprot", + ] + extra_header + ) + if protein in self.protein_gene_map: gene = self.protein_gene_map[protein] if gene not in seen_genes: @@ -240,3 +270,26 @@ def parse_header(header_string: str, sep: str = ' ') -> List: header = header_string.strip().split(sep) return [i.replace('"', '') for i in header] + + +def make_xref_node_and_edge_entries(protein_id: str, + string_to_uniprot_id_map: dict, + ensembl_prefix: str = "ENSEMBL", + xref_edge_label: str = "biolink:xrefs", + uniprot_prefix: str = "UniprotKB", + relation_term: str = "biolink:xrefs", + source_field: str = "uniprot"): + """Given a STRING-style ENSEMBL protein ID and a map to Uniprot ID map, make a new + node for the Uniprot ID, and an edge asserting a xref between the ENSEMBL ID and the + Uniprot ID + + :param protein_id: + :param string_to_uniprot_id_map: + :param ensembl_prefix: + :param xref_edge_label: + :param uniprot_prefix: + :param relation_term: + :param source_field: + :return: None + """ + pass diff --git a/tests/resources/string/9606.protein.links.full.v11.0.txt.gz b/tests/resources/string/9606.protein.links.full.v11.0.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..e6bd710e04e250fc58a38602f85a8cec280f8e56 GIT binary patch literal 345 zcmV-f0jB;RiwFo>ZRuVB135M@HZE{-Z**m8ZZ2$TZfkQcW_4_AE_N|7E-)^1cys`b zl1q-mAPk20Jq0Hifz9&(vmI5M15|0^wn!6*NGf%1KQxcFsIwqm@MG)`+hIs?A|C`B z76B}fKVOb9U1IEt&0$JS7*CWEb?kHBBpDm=Gct%5XI?4x2gBRhx3apBHGv=eW30zwlUGpX|6v$zrOc;{Q!}b zPA~w@GCmt3%&WdH!iv=`Kr!if6;7e#Ti(-z)vHpP31@3inIoloN9h->Top-JCrXkQ zbNbS_lJ|adN(~Ib*rG%27Mj_nXk9MSR28Q6bNF*Zav=-cZpg<_2LlA9W!dHYAI-+f rvj;A!nskW9{WGygT@v&7s-6Uz{C@DAwHbh9Ao=bOqn8>Rngaj;hv1%9 literal 0 HcmV?d00001 diff --git a/tests/resources/string/HUMAN_9606_idmapping.dat.gz b/tests/resources/string/HUMAN_9606_idmapping.dat.gz new file mode 100644 index 0000000000000000000000000000000000000000..b1abe29521e7f26e49c271a258b9ff1fbfabe214 GIT binary patch literal 241 zcmV9U5JlJN zzX&s#nM|^5eHI@Sp{T1OQo)54#Q(>lSW_QaUCz15Bq<03J5x5N;(EEwvq3&G$V6E+ zE0^giDV}~>H+8kzpKG(*%+jpGxnThf93n!hPMm^`m{WG_td9^9iv)-XP}>m|7eR|; zJ+uEY0KW%lgw!LTa@T6#xUP=~@J`iJ^&0`Y@Y4%)q3;Ly_{aVAV9Fwgb|DQF`K9fr rmN4{_`1HSf&3KtD%vO}%w|Ss+Zi7|SK3o8JwUXQc2y;C#-T?pr9W|KHTW7n9vQau~N?(uS1VKLY>& DxQdVO literal 0 HcmV?d00001 diff --git a/tests/resources/string/gene_info.gz b/tests/resources/string/gene_info.gz new file mode 100644 index 0000000000000000000000000000000000000000..4636faf52c3a3857a92b12f88c1440598f3e67c1 GIT binary patch literal 1371 zcmV-h1*G~PiwFq3Y3W`717~G!WnXD-W^VwsmfLdTI1q-fHBX_cy_uAtQIIoq!|E|Qtal!AT%UTbkEWx~qv>k#6&dYk!9tluIG0!zX)3cEj+=xl z1-`J{Vhw3=d!z7HW;nn)KP+U9xu_oD_x%T%^K8!_`|1NNO@|V$>wQNll83cRnK2b{*418B6;FrFX2j7HgBV#%Td?`9BPULEMlu0;Ocg zA;<`)namj%s5u%T5Er{st)5A!f1*VyE!t{X|DzT~F)RA5l>KOH0A}!$9^xd>xUo8j zd><(G2Je>kK4`r=t*v$DJ!1Pqlu0O(w@KJqV%SQ!t?hFGuDdV;=Qs|s9nx)v#?K|oSe>3A;Nu-1!2xI{_xl;_-;0A;;6#2 zsHMr77C|oBZ!z&$2ITDH`oA_!pU5@*AW3&1r)OC7DCzC@^ zNgA`Cm=L@T5*{Be@D~FXx$h7GeIEKz>373!(Wh}wrhBUHnQfqHpA5#30vyWzU|f#-gYx6o<%-&#hbFg^D`NFr&n!P` z7}bySq=uUMAvW|ZfmZYXCf22Gp&(B8fkBJ`-o=Md{-R?m@n4wEjt>UL>HYHK=+0Og z#I@3X$x=ecQm<+A5mMuWk$eCHH{*MY%vdcI!5;6DWn(2;ct)JTF5-W0Li=>5VO1<>y{ zoORzXhgLPs^KbLI$UCi?0x(Vr;0VJhk0T~E5543zJS-j0oYX|-=rQBodB)U%5wFM? zr-~JktoBo|{#6%^%q#3?0E|Joi|}APz&@x=Y}!NcN#Q*(HG+`khZr_^LT&3e=GlT?DR*1Kgl2~r7D&jV)1vIW zi{HOE>_g`$z4Zi#&BzF None: + cls.input_dir = "tests/resources/string/" + cls.output_dir = tempfile.gettempdir() + cls.string_output_dir = os.path.join(cls.output_dir, "STRING") + cls.string = StringTransform(cls.input_dir, cls.output_dir) + + def setUp(self) -> None: + pass + + @parameterized.expand([ + ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1], + ['gene_info_map', dict, '1', + {'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG', + 'description': 'alpha-1-B glycoprotein'}], + ['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'], + ]) + def test_instance_vars(self, variable, type, key, val): + this_var = getattr(self.string, variable) + self.assertTrue(isinstance(this_var, type)) + self.assertTrue(key in this_var) + self.assertTrue(this_var[key], val) + + def test_output_dir(self): + self.assertEqual(self.string.output_dir, self.string_output_dir) + + def test_input_dir(self): + self.assertEqual(self.string.input_base_dir, self.input_dir) + + def test_output_edge_file(self): + self.assertEqual(self.string.output_edge_file, + os.path.join(self.string_output_dir, "edges.tsv")) + + def test_output_node_file(self): + self.assertEqual(self.string.output_node_file, + os.path.join(self.string_output_dir, "nodes.tsv")) + + def test_source_name(self): + self.assertEqual(self.string.source_name, 'STRING') + + def test_run(self): + self.assertTrue(isinstance(self.string.run, object)) + self.string.run() + self.assertTrue(os.path.isdir(self.string_output_dir)) + self.assertTrue( + os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv"))) + self.assertTrue( + os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv"))) From 88cf6ad0b1d49470bce91c7110593a9a3f4b361c Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 19:55:59 -0700 Subject: [PATCH 03/10] Fixed test, simplified code to add nodes/edges for xref to Uniprot IDs --- .../transform_utils/string_ppi/string_ppi.py | 63 +++++++----------- tests/resources/string/gene2ensembl.gz | Bin 305 -> 646 bytes 2 files changed, 23 insertions(+), 40 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index 3c08e5fe..b88a1e1e 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -2,7 +2,7 @@ import logging import os import compress_json # type: ignore -from typing import Dict, List, Any, Set, Optional +from typing import Dict, List, Any, Set, Optional, IO from kg_covid_19.transform_utils.transform import Transform from kg_covid_19.utils.transform_utils import write_node_edge_item, \ @@ -151,7 +151,7 @@ def run(self, data_file: Optional[str] = None) -> None: # Required to align the node edge header of the gene # with the default header - extra_header = [""]*(len(edge_additional_headers)+1) + self.extra_header = [""]*(len(edge_additional_headers)+1) # make string ENSP to Uniprot id mapping dict string_to_uniprot_id_map = uniprot_make_name_to_id_mapping( @@ -173,21 +173,6 @@ def run(self, data_file: Optional[str] = None) -> None: protein = '.'.join(nat_string_id.split('.')[1:]) proteins.append(protein) - if nat_string_id in string_to_uniprot_id_map: - make_xref_node_and_edge_entries(protein, - string_to_uniprot_id_map) - - write_node_edge_item( - fh=edge, - header=self.edge_header, - data=[f"ENSEMBL:{protein}", - "biolink:xrefs", - f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}", - "biolink:xrefs", - "uniprot", - ] + extra_header - ) - if protein in self.protein_gene_map: gene = self.protein_gene_map[protein] if gene not in seen_genes: @@ -215,7 +200,7 @@ def run(self, data_file: Optional[str] = None) -> None: f"ENSEMBL:{protein}", "RO:0002205", "NCBI", - ] + extra_header + ] + self.extra_header ) # write node data @@ -239,6 +224,26 @@ def run(self, data_file: Optional[str] = None) -> None: ] ) + # if we have an equivalent Uniprot ID for this Ensembl protein ID + # make an xref edge, and a node for the Uniprot ID + if nat_string_id in string_to_uniprot_id_map: + uniprot_curie = \ + f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" + write_node_edge_item( + fh=node, + header=self.node_header, + data=[uniprot_curie, "", + protein_node_type, "", "", self.source_name]) + write_node_edge_item( + fh=edge, + header=self.edge_header, + data=[f"ENSEMBL:{protein}", + "biolink:xrefs", + uniprot_curie, + "biolink:xrefs", + "uniprot", + ] + self.extra_header) + def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict: """Methods processes a line of text from Drug Central. @@ -271,25 +276,3 @@ def parse_header(header_string: str, sep: str = ' ') -> List: return [i.replace('"', '') for i in header] - -def make_xref_node_and_edge_entries(protein_id: str, - string_to_uniprot_id_map: dict, - ensembl_prefix: str = "ENSEMBL", - xref_edge_label: str = "biolink:xrefs", - uniprot_prefix: str = "UniprotKB", - relation_term: str = "biolink:xrefs", - source_field: str = "uniprot"): - """Given a STRING-style ENSEMBL protein ID and a map to Uniprot ID map, make a new - node for the Uniprot ID, and an edge asserting a xref between the ENSEMBL ID and the - Uniprot ID - - :param protein_id: - :param string_to_uniprot_id_map: - :param ensembl_prefix: - :param xref_edge_label: - :param uniprot_prefix: - :param relation_term: - :param source_field: - :return: None - """ - pass diff --git a/tests/resources/string/gene2ensembl.gz b/tests/resources/string/gene2ensembl.gz index 94cf7dcc53a713bdd6be62269a75f97cf9397cf8..57434c4fa1521fc0bdba5fb2d39b438bd0a50bec 100644 GIT binary patch literal 646 zcmV;10(t!(iwFoDgXvxX17~G!Win-Mb7gH}YyibpOK#gR5M4W`@C7W)|8sz#s|G>a zeeQpY4o%u3<)X+=+F7LEhx2BrG_VU^-amf-fc7AvAY49vej$U0ZPM2M+}Fq$0k*h& ze155e0d3S)Kj&3?%LYjkq-$h0$ontZY=>lw27}lz?{9SD9g~TZa`TSKJ0@wpD9m49 zAb?m1JEjT?$flRu<<|e0vltYLA;yf#%`OJqF}ECAOnK1)P8gf~7ZTCh@A=de1ImMT zR77PH?s_|L{h1s#{08t4^zex(&G`hZ3$Q@K_kg8WasX3d+X`%};$*>Z0WSekQA{tY z2{=~8T%I1lmBSsm3SO@sA=3FEC`>$y=#e2?a@kjTV9vozLmrq2p;e0$G(4bRa@b}m8nh*G_T!7OZ0)Mfp7&5o=k0D zWgAzfu}_~&0ebKxbt*iVyh3aTPfnCoPnwyW<;XPl`3atgi+b@?#~Ymqt5Yji2TxkV z1Q({3M0oMk_t_bqpi@m zZD_tN+A#In z^#oGq+nH(_zVwZT6T_GggNa@3$Rwq*&5SeZRF{}8o?6`RGaCp?*2~Q1`s|Q-W`nu4 zD|lrZ`}hP;8~|&bEGW7s5G6X+Nx0&As8ik2FHCLn*DZQRoyag&PntEh2UFeP?V}Or gehLi6&1X;bjhN!T%m{X*_Ko-U7d)UTjRgw;08KG9c>n+a literal 305 zcmV-10nYv(iwFohY3W`717~G!Win-Mb7gH}YygeYJ8r`;3W|KHTW7n9vQau~N?(uS1VKLY>& DxQdVO From 134a62e3838bca8a5c0e2139e249f78a04ce7c74 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 20:20:35 -0700 Subject: [PATCH 04/10] Moved code to write/edges for xref to Uniprot IDs --- .../transform_utils/string_ppi/string_ppi.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index b88a1e1e..17dddd49 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -213,6 +213,26 @@ def run(self, data_file: Optional[str] = None) -> None: protein_node_type, "", "", self.source_name] ) + # if we have an equivalent Uniprot ID for this Ensembl protein + # ID make an xref edge, and a node for the Uniprot ID + if nat_string_id in string_to_uniprot_id_map: + uniprot_curie = \ + f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" + write_node_edge_item( + fh=node, + header=self.node_header, + data=[uniprot_curie, "", + protein_node_type, "", "", self.source_name]) + write_node_edge_item( + fh=edge, + header=self.edge_header, + data=[f"ENSEMBL:{protein}", + "biolink:xrefs", + uniprot_curie, + "biolink:xrefs", + "uniprot", + ] + self.extra_header) + # write edge data write_node_edge_item( fh=edge, @@ -224,25 +244,6 @@ def run(self, data_file: Optional[str] = None) -> None: ] ) - # if we have an equivalent Uniprot ID for this Ensembl protein ID - # make an xref edge, and a node for the Uniprot ID - if nat_string_id in string_to_uniprot_id_map: - uniprot_curie = \ - f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" - write_node_edge_item( - fh=node, - header=self.node_header, - data=[uniprot_curie, "", - protein_node_type, "", "", self.source_name]) - write_node_edge_item( - fh=edge, - header=self.edge_header, - data=[f"ENSEMBL:{protein}", - "biolink:xrefs", - uniprot_curie, - "biolink:xrefs", - "uniprot", - ] + self.extra_header) def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict: From 4afc52946d338b20966458e402b64b1fc4c2c775 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 20:38:40 -0700 Subject: [PATCH 05/10] Moved code to write/edges for xref to Uniprot IDs --- .../transform_utils/string_ppi/string_ppi.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index 17dddd49..987d500a 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -213,25 +213,25 @@ def run(self, data_file: Optional[str] = None) -> None: protein_node_type, "", "", self.source_name] ) - # if we have an equivalent Uniprot ID for this Ensembl protein - # ID make an xref edge, and a node for the Uniprot ID - if nat_string_id in string_to_uniprot_id_map: - uniprot_curie = \ - f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" - write_node_edge_item( - fh=node, - header=self.node_header, - data=[uniprot_curie, "", - protein_node_type, "", "", self.source_name]) - write_node_edge_item( - fh=edge, - header=self.edge_header, - data=[f"ENSEMBL:{protein}", - "biolink:xrefs", - uniprot_curie, - "biolink:xrefs", - "uniprot", - ] + self.extra_header) + # if we have an equivalent Uniprot ID for this Ensembl protein + # ID make an xref edge, and a node for the Uniprot ID + if nat_string_id in string_to_uniprot_id_map: + uniprot_curie = \ + f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" + write_node_edge_item( + fh=node, + header=self.node_header, + data=[uniprot_curie, "", + protein_node_type, "", "", self.source_name]) + write_node_edge_item( + fh=edge, + header=self.edge_header, + data=[f"ENSEMBL:{protein}", + "biolink:xrefs", + uniprot_curie, + "biolink:xrefs", + "uniprot", + ] + self.extra_header) # write edge data write_node_edge_item( From 44e7df1527f0d5a4e9fb4b35cc5037a2a909dc2a Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 20:55:15 -0700 Subject: [PATCH 06/10] Moved code to write protein nodes out of block that write gene -> protein edges - I think that's a bug --- .../transform_utils/string_ppi/string_ppi.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index 987d500a..c199cab9 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -203,35 +203,35 @@ def run(self, data_file: Optional[str] = None) -> None: ] + self.extra_header ) - # write node data - if protein not in seen_proteins: - seen_proteins.add(protein) - write_node_edge_item( - fh=node, - header=self.node_header, - data=[f"ENSEMBL:{protein}", "", - protein_node_type, "", "", self.source_name] - ) - - # if we have an equivalent Uniprot ID for this Ensembl protein - # ID make an xref edge, and a node for the Uniprot ID - if nat_string_id in string_to_uniprot_id_map: - uniprot_curie = \ - f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}" + # write node data + if protein not in seen_proteins: + seen_proteins.add(protein) write_node_edge_item( fh=node, header=self.node_header, - data=[uniprot_curie, "", - protein_node_type, "", "", self.source_name]) - write_node_edge_item( - fh=edge, - header=self.edge_header, - data=[f"ENSEMBL:{protein}", - "biolink:xrefs", - uniprot_curie, - "biolink:xrefs", - "uniprot", - ] + self.extra_header) + data=[f"ENSEMBL:{protein}", "", + protein_node_type, "", "", self.source_name] + ) + + # if we have an equivalent Uniprot ID for this Ensembl protein + # ID make an xref edge, and a node for the Uniprot ID + if protein in string_to_uniprot_id_map: + uniprot_curie = \ + f"UniprotKB:{string_to_uniprot_id_map[protein]}" + write_node_edge_item( + fh=node, + header=self.node_header, + data=[uniprot_curie, "", + protein_node_type, "", "", self.source_name]) + write_node_edge_item( + fh=edge, + header=self.edge_header, + data=[f"ENSEMBL:{protein}", + "biolink:xrefs", + uniprot_curie, + "biolink:xrefs", + "uniprot", + ] + self.extra_header) # write edge data write_node_edge_item( From d7f3d9d48edf4a9fa9d9b4b80779730777736d40 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 21:38:08 -0700 Subject: [PATCH 07/10] Better tests for nodes and edges tsv files in STRING --- .../transform_utils/string_ppi/string_ppi.py | 1 - tests/test_string.py | 46 ++++++++++++++----- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index c199cab9..bc321003 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -245,7 +245,6 @@ def run(self, data_file: Optional[str] = None) -> None: ) - def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict: """Methods processes a line of text from Drug Central. diff --git a/tests/test_string.py b/tests/test_string.py index d7ccbf5d..d271b299 100644 --- a/tests/test_string.py +++ b/tests/test_string.py @@ -1,5 +1,6 @@ import os import tempfile +import pandas as pd from unittest import TestCase, skip from parameterized import parameterized @@ -9,15 +10,11 @@ class TestString(TestCase): """Tests the string ingest""" - @classmethod - def setUpClass(cls) -> None: - cls.input_dir = "tests/resources/string/" - cls.output_dir = tempfile.gettempdir() - cls.string_output_dir = os.path.join(cls.output_dir, "STRING") - cls.string = StringTransform(cls.input_dir, cls.output_dir) - def setUp(self) -> None: - pass + self.input_dir = "tests/resources/string/" + self.output_dir = tempfile.gettempdir() + self.string_output_dir = os.path.join(self.output_dir, "STRING") + self.string = StringTransform(self.input_dir, self.output_dir) @parameterized.expand([ ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1], @@ -53,7 +50,32 @@ def test_run(self): self.assertTrue(isinstance(self.string.run, object)) self.string.run() self.assertTrue(os.path.isdir(self.string_output_dir)) - self.assertTrue( - os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv"))) - self.assertTrue( - os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv"))) + + def test_nodes_file(self): + self.string.run() + node_file = os.path.join(self.string_output_dir, "nodes.tsv") + self.assertTrue(os.path.isfile(node_file)) + node_df = pd.read_csv(node_file, sep="\t", header=0) + self.assertEqual((10, 6), node_df.shape) + self.assertEqual(['id', 'name', 'category', 'description', 'alias', + 'provided_by'], list(node_df.columns)) + self.assertListEqual(['ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298', + 'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445', + 'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801', + 'ENSEMBL:ENSP00000466298', 'ENSEMBL:ENSP00000232564', + 'ENSEMBL:ENSP00000393379', 'ENSEMBL:ENSP00000371253'], + list(node_df.id.unique())) + + def test_edges_file(self): + self.string.run() + edge_file = os.path.join(self.string_output_dir, "edges.tsv") + self.assertTrue(os.path.isfile(edge_file)) + edge_df = pd.read_csv(edge_file, sep="\t", header=0) + self.assertEqual((9, 19), edge_df.shape) + self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by', + 'combined_score', 'neighborhood', 'neighborhood_transferred', + 'fusion', 'cooccurence', 'homology', 'coexpression', + 'coexpression_transferred', 'experiments', + 'experiments_transferred', 'database', 'database_transferred', + 'textmining', 'textmining_transferred', ], + list(edge_df.columns)) From d6a052f4e2cd0a5f99588688b848d3ed71d751c8 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 21:48:29 -0700 Subject: [PATCH 08/10] Added some ids to test ids make sure we're getting Ensembl to Uniprot mapping stuff --- .../string/HUMAN_9606_idmapping.dat.gz | Bin 241 -> 279 bytes tests/test_string.py | 15 ++++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/resources/string/HUMAN_9606_idmapping.dat.gz b/tests/resources/string/HUMAN_9606_idmapping.dat.gz index b1abe29521e7f26e49c271a258b9ff1fbfabe214..d6ac44b2e39092c9501f6c2365a48a8ec7cb7d48 100644 GIT binary patch delta 258 zcmV+d0sa2*0ha;>ABzYG1)z}zAAiA4!!Qhn&)K^S*@+z|$99yh+E${D3OH2*1qq22 zY4HA{+E|y4;gj`!e_|)WpeL)E?mg^3znA&6yj4iXWRgi zw>h9}sl67IFQ=5E6?7hzU^J5fv9fi)1~s|1ki+2WW)UBcO8EYE9p`u8#=t zPSsQO8v(lT(+hN=?+5t!$NlwS$|8q$Aq^GzrR}GdF!Yo7^uK$}c$qECR+Qeid7yM| WgH_Z%TmX2rlH36Zb3HNM0RRAvCT6Sv diff --git a/tests/test_string.py b/tests/test_string.py index d271b299..c7a84122 100644 --- a/tests/test_string.py +++ b/tests/test_string.py @@ -56,14 +56,15 @@ def test_nodes_file(self): node_file = os.path.join(self.string_output_dir, "nodes.tsv") self.assertTrue(os.path.isfile(node_file)) node_df = pd.read_csv(node_file, sep="\t", header=0) - self.assertEqual((10, 6), node_df.shape) + self.assertEqual((11, 6), node_df.shape) self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'provided_by'], list(node_df.columns)) - self.assertListEqual(['ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298', - 'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445', - 'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801', - 'ENSEMBL:ENSP00000466298', 'ENSEMBL:ENSP00000232564', - 'ENSEMBL:ENSP00000393379', 'ENSEMBL:ENSP00000371253'], + self.assertListEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233', + 'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401', + 'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915', + 'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298', + 'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379', + 'ENSEMBL:ENSP00000371253'], list(node_df.id.unique())) def test_edges_file(self): @@ -71,7 +72,7 @@ def test_edges_file(self): edge_file = os.path.join(self.string_output_dir, "edges.tsv") self.assertTrue(os.path.isfile(edge_file)) edge_df = pd.read_csv(edge_file, sep="\t", header=0) - self.assertEqual((9, 19), edge_df.shape) + self.assertEqual((10, 19), edge_df.shape) self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by', 'combined_score', 'neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence', 'homology', 'coexpression', From e3887192d0d373c973d50f6b64b1e35a88dcc5d8 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 16 Jun 2020 21:58:11 -0700 Subject: [PATCH 09/10] Test failing for trivial reason --- tests/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_string.py b/tests/test_string.py index c7a84122..d29e2f4b 100644 --- a/tests/test_string.py +++ b/tests/test_string.py @@ -59,7 +59,7 @@ def test_nodes_file(self): self.assertEqual((11, 6), node_df.shape) self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'provided_by'], list(node_df.columns)) - self.assertListEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233', + self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298', From cf1672d984ef16b8a82f5d210c078b9d6acc8141 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 17 Jun 2020 16:48:14 -0700 Subject: [PATCH 10/10] Refactor such that Uniprot IDs are node properties in xrefs column (instead of edges) --- .../string_ppi/node_header.json | 3 +- .../transform_utils/string_ppi/string_ppi.py | 43 ++++++++----------- tests/test_string.py | 11 +++-- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/kg_covid_19/transform_utils/string_ppi/node_header.json b/kg_covid_19/transform_utils/string_ppi/node_header.json index c2b99f0d..d761756f 100644 --- a/kg_covid_19/transform_utils/string_ppi/node_header.json +++ b/kg_covid_19/transform_utils/string_ppi/node_header.json @@ -4,5 +4,6 @@ "category", "description", "alias", + "xrefs", "provided_by" -] \ No newline at end of file +] diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py index bc321003..4cc39261 100644 --- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py +++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py @@ -18,16 +18,15 @@ Write node and edge headers that look something like: Node: -id name category -protein:1234 TBX4 Protein +id name category xrefs provided_by +protein:1234 TBX4 biolink:Protein UniprotKB:123456 STRING + +xrefs contains the UniprotKB id for the protein, if available Edge: subject edge_label object relation protein:1234 interacts_with protein:4567 RO:0002434 -Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so: -subject edge_label object relation -protein:1234 bl:xrefs protein:4567 RO:0002434 """ @@ -188,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None: 'biolink:Gene', gene_informations['description'], f"NCBIGene:{self.ensembl2ncbi_map[gene]}", + "", self.source_name ] ) @@ -206,32 +206,25 @@ def run(self, data_file: Optional[str] = None) -> None: # write node data if protein not in seen_proteins: seen_proteins.add(protein) - write_node_edge_item( - fh=node, - header=self.node_header, - data=[f"ENSEMBL:{protein}", "", - protein_node_type, "", "", self.source_name] - ) # if we have an equivalent Uniprot ID for this Ensembl protein # ID make an xref edge, and a node for the Uniprot ID + uniprot_curie = '' if protein in string_to_uniprot_id_map: uniprot_curie = \ f"UniprotKB:{string_to_uniprot_id_map[protein]}" - write_node_edge_item( - fh=node, - header=self.node_header, - data=[uniprot_curie, "", - protein_node_type, "", "", self.source_name]) - write_node_edge_item( - fh=edge, - header=self.edge_header, - data=[f"ENSEMBL:{protein}", - "biolink:xrefs", - uniprot_curie, - "biolink:xrefs", - "uniprot", - ] + self.extra_header) + + write_node_edge_item( + fh=node, + header=self.node_header, + data=[f"ENSEMBL:{protein}", "", + protein_node_type, + "", + "", + uniprot_curie, # xref + self.source_name] + ) + # write edge data write_node_edge_item( diff --git a/tests/test_string.py b/tests/test_string.py index d29e2f4b..5588db80 100644 --- a/tests/test_string.py +++ b/tests/test_string.py @@ -56,23 +56,26 @@ def test_nodes_file(self): node_file = os.path.join(self.string_output_dir, "nodes.tsv") self.assertTrue(os.path.isfile(node_file)) node_df = pd.read_csv(node_file, sep="\t", header=0) - self.assertEqual((11, 6), node_df.shape) - self.assertEqual(['id', 'name', 'category', 'description', 'alias', + self.assertEqual((10, 7), node_df.shape) + self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs', 'provided_by'], list(node_df.columns)) - self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233', + self.assertCountEqual(['ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298', 'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379', 'ENSEMBL:ENSP00000371253'], list(node_df.id.unique())) + self.assertCountEqual('UniprotKB:P84085', + node_df.loc[node_df['id'] == + 'ENSEMBL:ENSP00000000233'].xrefs.item()) def test_edges_file(self): self.string.run() edge_file = os.path.join(self.string_output_dir, "edges.tsv") self.assertTrue(os.path.isfile(edge_file)) edge_df = pd.read_csv(edge_file, sep="\t", header=0) - self.assertEqual((10, 19), edge_df.shape) + self.assertEqual((9, 19), edge_df.shape) self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by', 'combined_score', 'neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence', 'homology', 'coexpression',