From b287df50e5d2ddc8a8134a149383103b6bc6361b Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 15:48:27 -0700
Subject: [PATCH 01/10] Added item in STRING section of download.yaml for
 HUMAN_9606_idmapping.dat.gz

---
 download.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/download.yaml b/download.yaml
index e9a948bd..0d8ea12f 100644
--- a/download.yaml
+++ b/download.yaml
@@ -54,6 +54,13 @@
   url: https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
   local_name: gene_info.gz
 
+-
+  # this is to make bl:xrefs from ENSP protein ids to UniprotKB ids (issue #235)
+  # to be ID mapped in merge step
+  # nb: we are also downloading and using this file for the TTD transform
+  url: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping.dat.gz
+  local_name: HUMAN_9606_idmapping.dat.gz
+
 #
 # TTD - Therapeutic Targets Database
 # drug targets, and associated data for each (drugs, ids, etc)

From 651a37e5dcad2dccb18bbaf0a10e5ba5c75adbd7 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 18:40:38 -0700
Subject: [PATCH 02/10] Some tests for STRING

---
 .../transform_utils/string_ppi/string_ppi.py  |  63 ++++++++++++++++--
 .../9606.protein.links.full.v11.0.txt.gz      | Bin 0 -> 345 bytes
 .../string/HUMAN_9606_idmapping.dat.gz        | Bin 0 -> 241 bytes
 tests/resources/string/gene2ensembl.gz        | Bin 0 -> 305 bytes
 tests/resources/string/gene_info.gz           | Bin 0 -> 1371 bytes
 tests/test_string.py                          |  59 ++++++++++++++++
 6 files changed, 117 insertions(+), 5 deletions(-)
 create mode 100644 tests/resources/string/9606.protein.links.full.v11.0.txt.gz
 create mode 100644 tests/resources/string/HUMAN_9606_idmapping.dat.gz
 create mode 100644 tests/resources/string/gene2ensembl.gz
 create mode 100644 tests/resources/string/gene_info.gz
 create mode 100644 tests/test_string.py

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index 0585d240..3c08e5fe 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -1,12 +1,12 @@
 import gzip
+import logging
 import os
 import compress_json  # type: ignore
 from typing import Dict, List, Any, Set, Optional
 
 from kg_covid_19.transform_utils.transform import Transform
-from kg_covid_19.utils.transform_utils import write_node_edge_item, get_item_by_priority
-
-from encodeproject import download as encode_download  # type: ignore
+from kg_covid_19.utils.transform_utils import write_node_edge_item, \
+    get_item_by_priority, uniprot_make_name_to_id_mapping
 
 """
 Ingest protein-protein interactions from STRING DB.
@@ -24,12 +24,20 @@
 Edge: 
 subject edge_label  object  relation
 protein:1234    interacts_with  protein:4567    RO:0002434
+
+Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so:
+subject edge_label  object  relation
+protein:1234    bl:xrefs  protein:4567    RO:0002434
+
 """
 
 NCBI_FTP_URL = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/'
 PROTEIN_MAPPING_FILE = 'gene2ensembl.gz'
 GENE_INFO_FILE = 'gene_info.gz'
 
+# make name to id map for uniprot names of human proteins
+UNIPROT_ID_MAPPING = "HUMAN_9606_idmapping.dat.gz"
+
 
 class StringTransform(Transform):
     """
@@ -43,7 +51,9 @@ def __init__(self, input_dir: str = None, output_dir: str = None):
         self.protein_gene_map: Dict[str, Any] = {}
         self.gene_info_map: Dict[str, Any] = {}
         self.ensembl2ncbi_map: Dict[str, Any] = {}
+        logging.info("Loading Ensembl Gene to Protein mapping")
         self.load_mapping(self.input_base_dir, self.output_dir, ['9606'])
+        logging.info("Load mappings from NCBI gene_info")
         self.load_gene_info(self.input_base_dir, self.output_dir, ['9606'])
 
     def load_mapping(self, input_dir: str, output_dir: str, species_id: List = None) -> None:
@@ -143,6 +153,10 @@ def run(self, data_file: Optional[str] = None) -> None:
         # with the default header
         extra_header = [""]*(len(edge_additional_headers)+1)
 
+        # make string ENSP to Uniprot id mapping dict
+        string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
+            os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))
+
         with open(self.output_node_file, 'w') as node, \
                 open(self.output_edge_file, 'w') as edge, \
                 gzip.open(data_file, 'rt') as interactions:
@@ -155,9 +169,25 @@ def run(self, data_file: Optional[str] = None) -> None:
                 items_dict = parse_stringdb_interactions(line, header_items)
                 proteins = []
                 for protein_name in ('protein1', 'protein2'):
-                    protein = get_item_by_priority(items_dict, [protein_name])
-                    protein = '.'.join(protein.split('.')[1:])
+                    nat_string_id = get_item_by_priority(items_dict, [protein_name])
+                    protein = '.'.join(nat_string_id.split('.')[1:])
                     proteins.append(protein)
+
+                    if nat_string_id in string_to_uniprot_id_map:
+                        make_xref_node_and_edge_entries(protein,
+                                                        string_to_uniprot_id_map)
+
+                        write_node_edge_item(
+                            fh=edge,
+                            header=self.edge_header,
+                            data=[f"ENSEMBL:{protein}",
+                                  "biolink:xrefs",
+                                  f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}",
+                                  "biolink:xrefs",
+                                  "uniprot",
+                                  ] + extra_header
+                        )
+
                     if protein in self.protein_gene_map:
                         gene = self.protein_gene_map[protein]
                         if gene not in seen_genes:
@@ -240,3 +270,26 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
     header = header_string.strip().split(sep)
 
     return [i.replace('"', '') for i in header]
+
+
+def make_xref_node_and_edge_entries(protein_id: str,
+                                    string_to_uniprot_id_map: dict,
+                                    ensembl_prefix: str = "ENSEMBL",
+                                    xref_edge_label: str = "biolink:xrefs",
+                                    uniprot_prefix: str = "UniprotKB",
+                                    relation_term: str = "biolink:xrefs",
+                                    source_field: str = "uniprot"):
+    """Given a STRING-style ENSEMBL protein ID and a map to Uniprot ID map, make a new
+    node for the Uniprot ID, and an edge asserting a xref between the ENSEMBL ID and the
+    Uniprot ID
+
+    :param protein_id:
+    :param string_to_uniprot_id_map:
+    :param ensembl_prefix:
+    :param xref_edge_label:
+    :param uniprot_prefix:
+    :param relation_term:
+    :param source_field:
+    :return: None
+    """
+    pass
diff --git a/tests/resources/string/9606.protein.links.full.v11.0.txt.gz b/tests/resources/string/9606.protein.links.full.v11.0.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..e6bd710e04e250fc58a38602f85a8cec280f8e56
GIT binary patch
literal 345
zcmV-f0jB;RiwFo>ZRuVB135M@HZE{-Z**m8ZZ2$TZfkQcW_4_AE_N|7E-)^1cys`b
zl1q-mAPk20Jq0Hifz9&(vmI5M15|0^wn!6*NGf%1KQxcFsIwqm@MG)`+hIs?A|C`B
z76B}fKVOb9U1IEt&0$JS7*CWEb?kH<eGIIPv2Cx3LQCut`}h;jzu8H*A<?)-JU0a)
zyD9me!Zg0>BBpDm=Gct%5XI?4x2gBRhx3apBHGv=eW30zwlUGpX|6v$zrOc;{Q!}b
zPA~w@GCmt3%&WdH!iv=`Kr!if6;7e#Ti(-z)vHpP31@3inIoloN9h->Top-JCrXkQ
zbNbS_lJ|adN(~Ib*rG%27Mj_nXk9MSR28Q6bNF*Zav=-cZpg<_2LlA9W!dHYAI-+f
rvj;A!nskW9{WGygT@v&7s-6Uz{C@DAwHbh9Ao=bOqn8>Rngaj;hv1%9

literal 0
HcmV?d00001

diff --git a/tests/resources/string/HUMAN_9606_idmapping.dat.gz b/tests/resources/string/HUMAN_9606_idmapping.dat.gz
new file mode 100644
index 0000000000000000000000000000000000000000..b1abe29521e7f26e49c271a258b9ff1fbfabe214
GIT binary patch
literal 241
zcmV<N01p2jiwFq(aOqwE14va(K~7&eHZV3{X=H6-aByjEXD(!6bO3db%L>9U5JlJN
zzX&s#nM|^5eHI@Sp{T1OQo)54#Q(>lSW_QaUCz15Bq<03J5x5N;(EEwvq3&G$V6E+
zE0^giDV}~>H+8kzpKG(*%+jpGxnThf93n!hPMm^`m{WG_td9^9iv)-XP}>m|7eR|;
zJ+uEY0KW%lgw!LTa@T6#xUP=~@J`iJ^&0`Y@Y4%)q3;Ly_{aVAV9Fwgb|DQF`K9fr
rmN4{_`1HSf&3KtD%vO}%w|Ss+Zi7|SK3o8JwUXQc2y;C#-T?pr9<Xi{

literal 0
HcmV?d00001

diff --git a/tests/resources/string/gene2ensembl.gz b/tests/resources/string/gene2ensembl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..94cf7dcc53a713bdd6be62269a75f97cf9397cf8
GIT binary patch
literal 305
zcmV-10nYv(iwFohY3W`717~G!Win-Mb7gH}YygeYJ8r`;3<luYxl07TDITE6)F5cP
z=l-`SW!aUq6!vBj{`^y97}$o}$MgFE?Lk68*q)znWbkT5TJ2}Akud_4x;<ZSbugep
zt@@g)bc;sPM7q|9M(%G?Y><q#F^J9FPqgqbnK&qlcaw)nng?lqy+Hsmg<-0&fE3;C
z+r57na~Kq=M;J58Zblez$0%`RG36x-IAJaFFNtXFeZDot3fY5p6;auQd)^M`$K+7(
z0(=BI`^1pe`2_0%3j|z)rBCvJDQ#O}TN4KhUV@heQc(=gUrlgb6La=-!nub-&VlFA
zxkNf&1XUA{{L#5Xw(MoE@?p-48HRkA2vu`NV;7>W|KHTW7n9vQau~N?(uS1VKLY>&
DxQdVO

literal 0
HcmV?d00001

diff --git a/tests/resources/string/gene_info.gz b/tests/resources/string/gene_info.gz
new file mode 100644
index 0000000000000000000000000000000000000000..4636faf52c3a3857a92b12f88c1440598f3e67c1
GIT binary patch
literal 1371
zcmV-h1*G~PiwFq3Y3W`717~G!WnXD-W^VwsmfLdTI1q-fHBX_cy_u<sQ<i+fxdTF&
z)P_R=nbQqrYysM_C9h@g3@JbSwrt~&;K?LgwISeBtN-p#-Ln43gP&_2p)nKe_8Kh@
z$wtQLPKJeA1v|JFQXCS6qRp3#Z50alnM|aT2}4Pcu45SnIhO)OOobUwtMmMjvbEf<
zci^+}xZc9BwSZwFjDx($*g7ckz07!iKsQAkuSJls7p<nJw^a^Dg=YDlWoyvpJ5j}<
z&_qUj%j>AtQIIoq!|E|Qtal!AT%UTbkEWx~qv>k#6&dYk!9tluIG0!zX)3cEj+=xl
z1-`J{Vhw3=d!z7HW;nn)KP+U9xu_oD_x%T%^K8!_<c1ds4uc}*LPmj5ml)qGhJWmt
zz$(uIE-til9n%HoN<v<6!t(GEu5ck#%0kG4MHd*fs$r1VmSs%jSYyIL9OKpsOTpAJ
zj>`|1NNO@|V$>wQNll83cRnK2b{*418B6;FrFX2j7HgBV#%Td?`9BPULEMlu0;Ocg
zA;<`)namj%s5u%T5Er{st)5A!f1*VyE!t{X|DzT~F)RA5l>KOH0A}!$9^xd>xUo8j
zd><(G2Je>kK4`r=t*v$DJ!1Pqlu0O(w@KJqV%SQ!t?hFGuDdV;=Qs|s9n<Ah2%f(|
z%F$BSt(5l)DV0=p_hu;{@lA@-!F)+hYwdQEzI~2KTf4P8+S*pP-{8=t7WlM??^vFX
zC@C_-cLDiubT?WLZFlTl_WU}~Pt`$$i)sH^TT$Isp6@uwvOL!{(U0P4`<HKpMCW-t
z`*w3Xz4~@NFvg?l$S^GvWNR^^+u>x)v#?K|oSe>3A;Nu-1!2xI{_xl;_-;0A;;6#2
zsHMr77C|<JDW_;PoD=dzL#xj2C>oBZ!z&$2ITDH`oA_!pU5@*AW3&1r)OC7DCzC@^
zNgA`Cm=L@T5*{Be@D~FXx$h7GeIEKz>373!(W<pLra=fi|IXTLTDEIN-eFum(Wpk!
zC-*LJ&Q#8o(X5$a?Vi?@T;gd;j^9JJW82h3%kgpoWd7_?a}6A9*MkaK+^+^hf<9?p
zpSqyUSBnOk*>h}wrhBUHnQ<F!^|pfKcQ}zz0SyK#{29Ai|8+xWud^!`_fs5*2<s$p
zBolZG;ZTYkj@p#eEF4NSi>fqHpA5#30vyWzU|f#-gYx6o<%-&#hbFg^D`NFr&n!P`
z7}bySq=uUMAvW|ZfmZYXCf22Gp&(B8fkBJ`-o=Md{-R?m@n4wEjt>UL>HYHK=+0Og
z#I@3X$x<Hi=m2kUtXP;apm5Od%q+nhF0{mB*Kk~o0R+^9X0eV4DC*ORCO{|G<#67g
zTs!5>=ecQm<+A5mMuWk$eCHH{*MY%vdcI!5;6DWn(2;ct)JTF5-W0Li=>5VO1<>y{
zoORzXhgLPs^KbLI$UCi?0x(Vr;0VJhk0T~E5543zJS-j0oYX|-=rQBodB)U%5wFM?
zr-~JktoBo|{#6%^%q#3?0E|Joi|}APz&@x=<I(DDR$=?sUsXx;hoqdqMG0Sn=-=CB
zMthd+oX)6gch$B>Y}!NcN#Q*(HG+`khZr_^LT&3e=GlT?DR*1Kgl2~r7D&jV)1vIW
zi{HOE>_g`$z4Zi#&BzF<q%x^x5(&F-nWcLk;un3Nk%dZu1Zj<yR#a&@heJixg|(nZ
zpHg)@s6yd@uNSDEt(TQ(x8?7s(oR0~zf+~1RB6r1*1N`r713eOYgTj33M&4Q4RKx~
da8lh($5g(4W7@KK5?Xhs{0o!zoX#8$005Q;v5^1(

literal 0
HcmV?d00001

diff --git a/tests/test_string.py b/tests/test_string.py
new file mode 100644
index 00000000..d7ccbf5d
--- /dev/null
+++ b/tests/test_string.py
@@ -0,0 +1,59 @@
+import os
+import tempfile
+from unittest import TestCase, skip
+
+from parameterized import parameterized
+
+from kg_covid_19.transform_utils.string_ppi import StringTransform
+
+
+class TestString(TestCase):
+    """Tests the string ingest"""
+    @classmethod
+    def setUpClass(cls) -> None:
+        cls.input_dir = "tests/resources/string/"
+        cls.output_dir = tempfile.gettempdir()
+        cls.string_output_dir = os.path.join(cls.output_dir, "STRING")
+        cls.string = StringTransform(cls.input_dir, cls.output_dir)
+
+    def setUp(self) -> None:
+        pass
+
+    @parameterized.expand([
+    ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
+    ['gene_info_map', dict, '1',
+     {'ENSEMBL': 'ENSG00000121410', 'symbol': 'A1BG',
+      'description': 'alpha-1-B glycoprotein'}],
+    ['protein_gene_map', dict, 'ENSP00000263100', 'ENSG00000121410'],
+    ])
+    def test_instance_vars(self, variable, type, key, val):
+        this_var = getattr(self.string, variable)
+        self.assertTrue(isinstance(this_var, type))
+        self.assertTrue(key in this_var)
+        self.assertTrue(this_var[key], val)
+
+    def test_output_dir(self):
+        self.assertEqual(self.string.output_dir, self.string_output_dir)
+
+    def test_input_dir(self):
+        self.assertEqual(self.string.input_base_dir, self.input_dir)
+
+    def test_output_edge_file(self):
+        self.assertEqual(self.string.output_edge_file,
+                         os.path.join(self.string_output_dir, "edges.tsv"))
+
+    def test_output_node_file(self):
+        self.assertEqual(self.string.output_node_file,
+                         os.path.join(self.string_output_dir, "nodes.tsv"))
+
+    def test_source_name(self):
+        self.assertEqual(self.string.source_name, 'STRING')
+
+    def test_run(self):
+        self.assertTrue(isinstance(self.string.run, object))
+        self.string.run()
+        self.assertTrue(os.path.isdir(self.string_output_dir))
+        self.assertTrue(
+            os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv")))
+        self.assertTrue(
+            os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv")))

From 88cf6ad0b1d49470bce91c7110593a9a3f4b361c Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 19:55:59 -0700
Subject: [PATCH 03/10] Fixed test, simplified code to add nodes/edges for xref
 to Uniprot IDs

---
 .../transform_utils/string_ppi/string_ppi.py  |  63 +++++++-----------
 tests/resources/string/gene2ensembl.gz        | Bin 305 -> 646 bytes
 2 files changed, 23 insertions(+), 40 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index 3c08e5fe..b88a1e1e 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import compress_json  # type: ignore
-from typing import Dict, List, Any, Set, Optional
+from typing import Dict, List, Any, Set, Optional, IO
 
 from kg_covid_19.transform_utils.transform import Transform
 from kg_covid_19.utils.transform_utils import write_node_edge_item, \
@@ -151,7 +151,7 @@ def run(self, data_file: Optional[str] = None) -> None:
 
         # Required to align the node edge header of the gene
         # with the default header
-        extra_header = [""]*(len(edge_additional_headers)+1)
+        self.extra_header = [""]*(len(edge_additional_headers)+1)
 
         # make string ENSP to Uniprot id mapping dict
         string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
@@ -173,21 +173,6 @@ def run(self, data_file: Optional[str] = None) -> None:
                     protein = '.'.join(nat_string_id.split('.')[1:])
                     proteins.append(protein)
 
-                    if nat_string_id in string_to_uniprot_id_map:
-                        make_xref_node_and_edge_entries(protein,
-                                                        string_to_uniprot_id_map)
-
-                        write_node_edge_item(
-                            fh=edge,
-                            header=self.edge_header,
-                            data=[f"ENSEMBL:{protein}",
-                                  "biolink:xrefs",
-                                  f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}",
-                                  "biolink:xrefs",
-                                  "uniprot",
-                                  ] + extra_header
-                        )
-
                     if protein in self.protein_gene_map:
                         gene = self.protein_gene_map[protein]
                         if gene not in seen_genes:
@@ -215,7 +200,7 @@ def run(self, data_file: Optional[str] = None) -> None:
                                     f"ENSEMBL:{protein}",
                                     "RO:0002205",
                                     "NCBI",
-                                ] + extra_header
+                                ] + self.extra_header
                             )
 
                         # write node data
@@ -239,6 +224,26 @@ def run(self, data_file: Optional[str] = None) -> None:
                     ]
                 )
 
+                # if we have an equivalent Uniprot ID for this Ensembl protein ID
+                # make an xref edge, and a node for the Uniprot ID
+                if nat_string_id in string_to_uniprot_id_map:
+                    uniprot_curie = \
+                        f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
+                    write_node_edge_item(
+                        fh=node,
+                        header=self.node_header,
+                        data=[uniprot_curie, "",
+                              protein_node_type, "", "", self.source_name])
+                    write_node_edge_item(
+                        fh=edge,
+                        header=self.edge_header,
+                        data=[f"ENSEMBL:{protein}",
+                              "biolink:xrefs",
+                              uniprot_curie,
+                              "biolink:xrefs",
+                              "uniprot",
+                              ] + self.extra_header)
+
 
 def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict:
     """Methods processes a line of text from Drug Central.
@@ -271,25 +276,3 @@ def parse_header(header_string: str, sep: str = ' ') -> List:
 
     return [i.replace('"', '') for i in header]
 
-
-def make_xref_node_and_edge_entries(protein_id: str,
-                                    string_to_uniprot_id_map: dict,
-                                    ensembl_prefix: str = "ENSEMBL",
-                                    xref_edge_label: str = "biolink:xrefs",
-                                    uniprot_prefix: str = "UniprotKB",
-                                    relation_term: str = "biolink:xrefs",
-                                    source_field: str = "uniprot"):
-    """Given a STRING-style ENSEMBL protein ID and a map to Uniprot ID map, make a new
-    node for the Uniprot ID, and an edge asserting a xref between the ENSEMBL ID and the
-    Uniprot ID
-
-    :param protein_id:
-    :param string_to_uniprot_id_map:
-    :param ensembl_prefix:
-    :param xref_edge_label:
-    :param uniprot_prefix:
-    :param relation_term:
-    :param source_field:
-    :return: None
-    """
-    pass
diff --git a/tests/resources/string/gene2ensembl.gz b/tests/resources/string/gene2ensembl.gz
index 94cf7dcc53a713bdd6be62269a75f97cf9397cf8..57434c4fa1521fc0bdba5fb2d39b438bd0a50bec 100644
GIT binary patch
literal 646
zcmV;10(t!(iwFoDgXvxX17~G!Win-Mb7gH}YyibpOK#gR5M4W`@C7W)|8sz#s|G>a
zeeQpY4o%u3<)X+=+F7LEhx2BrG_VU^-amf-fc7AvAY49vej$U0ZPM2M+}Fq$0k*h&
ze155e0d3S)Kj&3?%LYjkq-$h0$ontZY=>lw27}lz?{9SD9g~TZa`TSKJ0@wpD9m49
zAb?m1JEjT?$flRu<<|e0vltYLA;yf#%`OJqF}ECAOnK1)P8gf~7ZTCh@A=de1ImMT
zR77PH?s_|L{h1s#{08t4^zex(&G`hZ3$Q@K_kg8WasX3d+X`%};$*>Z0WSekQA{tY
z2{=~8T%I1lmBSsm3SO@sA=3FEC`>$y=#e2?a@kjTV9vozLmrq2p;e<j3)$lTtQz=&
z$!?xJFt7FCwLgbGx=wxcSYLZ-{BP>0$G(4bRa@b}m8nh*G_T!7OZ0)Mfp7&5o=k0D
zWgAzfu}_~&0ebKxbt*iVyh3aTPfnCoPnwyW<;XPl`3atgi+b@?#~Ymqt5Yji2TxkV
z1Q({3M0oMk_t_bqpi@m<SVPqZleE1nQ@v_!DA$=uNOtd^LoZ5Alryr2a4*xGJ&&r>
zZD_tN+A#<W8|bbbBWp_F3ECA{R_&oi(wQLJ9kMc2IV5<?*;7sKJkA0pkII;wF$>In
z^#oGq+nH(_zVwZT6T_GggNa@3$Rwq*&5SeZRF{}8o?6`RGaCp?*2~Q1`s|Q-W`nu4
zD|lrZ`}hP;8~|&bEGW7s5G6X+Nx0&As8ik2FHCLn*DZQRoyag&PntEh2UFeP?V}Or
gehLi6&1X;bjhN!T%m{X*_Ko-U7d)UTjRgw;08KG9c>n+a

literal 305
zcmV-10nYv(iwFohY3W`717~G!Win-Mb7gH}YygeYJ8r`;3<luYxl07TDITE6)F5cP
z=l-`SW!aUq6!vBj{`^y97}$o}$MgFE?Lk68*q)znWbkT5TJ2}Akud_4x;<ZSbugep
zt@@g)bc;sPM7q|9M(%G?Y><q#F^J9FPqgqbnK&qlcaw)nng?lqy+Hsmg<-0&fE3;C
z+r57na~Kq=M;J58Zblez$0%`RG36x-IAJaFFNtXFeZDot3fY5p6;auQd)^M`$K+7(
z0(=BI`^1pe`2_0%3j|z)rBCvJDQ#O}TN4KhUV@heQc(=gUrlgb6La=-!nub-&VlFA
zxkNf&1XUA{{L#5Xw(MoE@?p-48HRkA2vu`NV;7>W|KHTW7n9vQau~N?(uS1VKLY>&
DxQdVO


From 134a62e3838bca8a5c0e2139e249f78a04ce7c74 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 20:20:35 -0700
Subject: [PATCH 04/10] Moved code to write nodes/edges for xref to Uniprot IDs

---
 .../transform_utils/string_ppi/string_ppi.py  | 39 ++++++++++---------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index b88a1e1e..17dddd49 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -213,6 +213,26 @@ def run(self, data_file: Optional[str] = None) -> None:
                                       protein_node_type, "", "", self.source_name]
                             )
 
+                        # if we have an equivalent Uniprot ID for this Ensembl protein
+                        # ID make an xref edge, and a node for the Uniprot ID
+                        if nat_string_id in string_to_uniprot_id_map:
+                            uniprot_curie = \
+                                f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
+                            write_node_edge_item(
+                                fh=node,
+                                header=self.node_header,
+                                data=[uniprot_curie, "",
+                                      protein_node_type, "", "", self.source_name])
+                            write_node_edge_item(
+                                fh=edge,
+                                header=self.edge_header,
+                                data=[f"ENSEMBL:{protein}",
+                                      "biolink:xrefs",
+                                      uniprot_curie,
+                                      "biolink:xrefs",
+                                      "uniprot",
+                                      ] + self.extra_header)
+
                 # write edge data
                 write_node_edge_item(
                     fh=edge,
@@ -224,25 +244,6 @@ def run(self, data_file: Optional[str] = None) -> None:
                     ]
                 )
 
-                # if we have an equivalent Uniprot ID for this Ensembl protein ID
-                # make an xref edge, and a node for the Uniprot ID
-                if nat_string_id in string_to_uniprot_id_map:
-                    uniprot_curie = \
-                        f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
-                    write_node_edge_item(
-                        fh=node,
-                        header=self.node_header,
-                        data=[uniprot_curie, "",
-                              protein_node_type, "", "", self.source_name])
-                    write_node_edge_item(
-                        fh=edge,
-                        header=self.edge_header,
-                        data=[f"ENSEMBL:{protein}",
-                              "biolink:xrefs",
-                              uniprot_curie,
-                              "biolink:xrefs",
-                              "uniprot",
-                              ] + self.extra_header)
 
 
 def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict:

From 4afc52946d338b20966458e402b64b1fc4c2c775 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 20:38:40 -0700
Subject: [PATCH 05/10] Moved code to write nodes/edges for xref to Uniprot IDs

---
 .../transform_utils/string_ppi/string_ppi.py  | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index 17dddd49..987d500a 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -213,25 +213,25 @@ def run(self, data_file: Optional[str] = None) -> None:
                                       protein_node_type, "", "", self.source_name]
                             )
 
-                        # if we have an equivalent Uniprot ID for this Ensembl protein
-                        # ID make an xref edge, and a node for the Uniprot ID
-                        if nat_string_id in string_to_uniprot_id_map:
-                            uniprot_curie = \
-                                f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
-                            write_node_edge_item(
-                                fh=node,
-                                header=self.node_header,
-                                data=[uniprot_curie, "",
-                                      protein_node_type, "", "", self.source_name])
-                            write_node_edge_item(
-                                fh=edge,
-                                header=self.edge_header,
-                                data=[f"ENSEMBL:{protein}",
-                                      "biolink:xrefs",
-                                      uniprot_curie,
-                                      "biolink:xrefs",
-                                      "uniprot",
-                                      ] + self.extra_header)
+                    # if we have an equivalent Uniprot ID for this Ensembl protein
+                    # ID make an xref edge, and a node for the Uniprot ID
+                    if nat_string_id in string_to_uniprot_id_map:
+                        uniprot_curie = \
+                            f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
+                        write_node_edge_item(
+                            fh=node,
+                            header=self.node_header,
+                            data=[uniprot_curie, "",
+                                  protein_node_type, "", "", self.source_name])
+                        write_node_edge_item(
+                            fh=edge,
+                            header=self.edge_header,
+                            data=[f"ENSEMBL:{protein}",
+                                  "biolink:xrefs",
+                                  uniprot_curie,
+                                  "biolink:xrefs",
+                                  "uniprot",
+                                  ] + self.extra_header)
 
                 # write edge data
                 write_node_edge_item(

From 44e7df1527f0d5a4e9fb4b35cc5037a2a909dc2a Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 20:55:15 -0700
Subject: [PATCH 06/10] Moved code to write protein nodes out of block that
 writes gene -> protein edges - I think that's a bug

---
 .../transform_utils/string_ppi/string_ppi.py  | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index 987d500a..c199cab9 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -203,35 +203,35 @@ def run(self, data_file: Optional[str] = None) -> None:
                                 ] + self.extra_header
                             )
 
-                        # write node data
-                        if protein not in seen_proteins:
-                            seen_proteins.add(protein)
-                            write_node_edge_item(
-                                fh=node,
-                                header=self.node_header,
-                                data=[f"ENSEMBL:{protein}", "",
-                                      protein_node_type, "", "", self.source_name]
-                            )
-
-                    # if we have an equivalent Uniprot ID for this Ensembl protein
-                    # ID make an xref edge, and a node for the Uniprot ID
-                    if nat_string_id in string_to_uniprot_id_map:
-                        uniprot_curie = \
-                            f"UniprotKB:{string_to_uniprot_id_map[nat_string_id]}"
+                    # write node data
+                    if protein not in seen_proteins:
+                        seen_proteins.add(protein)
                         write_node_edge_item(
                             fh=node,
                             header=self.node_header,
-                            data=[uniprot_curie, "",
-                                  protein_node_type, "", "", self.source_name])
-                        write_node_edge_item(
-                            fh=edge,
-                            header=self.edge_header,
-                            data=[f"ENSEMBL:{protein}",
-                                  "biolink:xrefs",
-                                  uniprot_curie,
-                                  "biolink:xrefs",
-                                  "uniprot",
-                                  ] + self.extra_header)
+                            data=[f"ENSEMBL:{protein}", "",
+                                  protein_node_type, "", "", self.source_name]
+                        )
+
+                        # if we have an equivalent Uniprot ID for this Ensembl protein
+                        # ID make an xref edge, and a node for the Uniprot ID
+                        if protein in string_to_uniprot_id_map:
+                            uniprot_curie = \
+                                f"UniprotKB:{string_to_uniprot_id_map[protein]}"
+                            write_node_edge_item(
+                                fh=node,
+                                header=self.node_header,
+                                data=[uniprot_curie, "",
+                                      protein_node_type, "", "", self.source_name])
+                            write_node_edge_item(
+                                fh=edge,
+                                header=self.edge_header,
+                                data=[f"ENSEMBL:{protein}",
+                                      "biolink:xrefs",
+                                      uniprot_curie,
+                                      "biolink:xrefs",
+                                      "uniprot",
+                                      ] + self.extra_header)
 
                 # write edge data
                 write_node_edge_item(

From d7f3d9d48edf4a9fa9d9b4b80779730777736d40 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 21:38:08 -0700
Subject: [PATCH 07/10] Better tests for nodes and edges tsv files in STRING

---
 .../transform_utils/string_ppi/string_ppi.py  |  1 -
 tests/test_string.py                          | 46 ++++++++++++++-----
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index c199cab9..bc321003 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -245,7 +245,6 @@ def run(self, data_file: Optional[str] = None) -> None:
                 )
 
 
-
 def parse_stringdb_interactions(this_line: str, header_items: List) -> Dict:
     """Methods processes a line of text from Drug Central.
 
diff --git a/tests/test_string.py b/tests/test_string.py
index d7ccbf5d..d271b299 100644
--- a/tests/test_string.py
+++ b/tests/test_string.py
@@ -1,5 +1,6 @@
 import os
 import tempfile
+import pandas as pd
 from unittest import TestCase, skip
 
 from parameterized import parameterized
@@ -9,15 +10,11 @@
 
 class TestString(TestCase):
     """Tests the string ingest"""
-    @classmethod
-    def setUpClass(cls) -> None:
-        cls.input_dir = "tests/resources/string/"
-        cls.output_dir = tempfile.gettempdir()
-        cls.string_output_dir = os.path.join(cls.output_dir, "STRING")
-        cls.string = StringTransform(cls.input_dir, cls.output_dir)
-
     def setUp(self) -> None:
-        pass
+        self.input_dir = "tests/resources/string/"
+        self.output_dir = tempfile.gettempdir()
+        self.string_output_dir = os.path.join(self.output_dir, "STRING")
+        self.string = StringTransform(self.input_dir, self.output_dir)
 
     @parameterized.expand([
     ['ensembl2ncbi_map', dict, 'ENSG00000121410', 1],
@@ -53,7 +50,32 @@ def test_run(self):
         self.assertTrue(isinstance(self.string.run, object))
         self.string.run()
         self.assertTrue(os.path.isdir(self.string_output_dir))
-        self.assertTrue(
-            os.path.isfile(os.path.join(self.string_output_dir, "nodes.tsv")))
-        self.assertTrue(
-            os.path.isfile(os.path.join(self.string_output_dir, "edges.tsv")))
+
+    def test_nodes_file(self):
+        self.string.run()
+        node_file = os.path.join(self.string_output_dir, "nodes.tsv")
+        self.assertTrue(os.path.isfile(node_file))
+        node_df = pd.read_csv(node_file, sep="\t", header=0)
+        self.assertEqual((10, 6), node_df.shape)
+        self.assertEqual(['id', 'name', 'category', 'description', 'alias',
+                          'provided_by'], list(node_df.columns))
+        self.assertListEqual(['ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298',
+                              'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445',
+                              'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801',
+                              'ENSEMBL:ENSP00000466298', 'ENSEMBL:ENSP00000232564',
+                              'ENSEMBL:ENSP00000393379', 'ENSEMBL:ENSP00000371253'],
+                             list(node_df.id.unique()))
+
+    def test_edges_file(self):
+        self.string.run()
+        edge_file = os.path.join(self.string_output_dir, "edges.tsv")
+        self.assertTrue(os.path.isfile(edge_file))
+        edge_df = pd.read_csv(edge_file, sep="\t", header=0)
+        self.assertEqual((9, 19), edge_df.shape)
+        self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
+                          'combined_score', 'neighborhood', 'neighborhood_transferred',
+                          'fusion', 'cooccurence', 'homology', 'coexpression',
+                          'coexpression_transferred', 'experiments',
+                          'experiments_transferred', 'database', 'database_transferred',
+                          'textmining', 'textmining_transferred', ],
+                         list(edge_df.columns))

From d6a052f4e2cd0a5f99588688b848d3ed71d751c8 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 21:48:29 -0700
Subject: [PATCH 08/10] Added some ids to test ids to make sure we're getting
 Ensembl to Uniprot mapping stuff

---
 .../string/HUMAN_9606_idmapping.dat.gz          | Bin 241 -> 279 bytes
 tests/test_string.py                            |  15 ++++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/resources/string/HUMAN_9606_idmapping.dat.gz b/tests/resources/string/HUMAN_9606_idmapping.dat.gz
index b1abe29521e7f26e49c271a258b9ff1fbfabe214..d6ac44b2e39092c9501f6c2365a48a8ec7cb7d48 100644
GIT binary patch
delta 258
zcmV+d0sa2*0ha;>ABzYG1)z}zAAiA4!!Qhn&)K^S*@+z|$99yh+E${D3OH2*1qq22
zY4HA{+E|y4;gj`!e_|)WpeL)E?mg^3znA&6yj4iX<n5waXVoMe{XBNDt+u<Uo*x%k
zKHyRjgLv`@F(Q_&fKHedvShVO5E^3kkP<L`hJ|ecH5%=R+?4_NcYq{F9CHE+4XOT(
zb$mmBbJrf#IN@Pvu6l-{8T%<7{&Bm1RZS?NFElejI%$2W+0Wcvdi%d;Rk6-j>WRgi
zw>h9}sl67IF<b#S(L4!`z_Hpq)MYhS-h#a;s!f33(#D)3MfVkVFCT5F@6~1G2ZYF~
I=_CRG0L@2w3jhEB

delta 220
zcmV<203-jG0`UO`ABzYG{BV&5AAidV!Y~j;*Xh3qGntu8vTS`89~7aet0Gdtg%!mA
z$D&wMA6Z?_xyd9c2m?D)HmBlxxy`ddJ~GHeSv4z{=_)Cnep)wmwc4L+v)jzlti!os
z0Sz1?LaI)jf{mC{cI>Q=5E6?7hzU^J5fv9fi)1~s|1ki+2WW)UBcO8EYE9p`u8#=t
zPSsQO8v(lT(+hN=?+5t!$NlwS$|8q$Aq^GzrR}GdF!Yo7^uK$}c$qECR+Qeid7yM|
WgH_Z%TmX2rlH36Zb3HNM0RRAvCT6Sv

diff --git a/tests/test_string.py b/tests/test_string.py
index d271b299..c7a84122 100644
--- a/tests/test_string.py
+++ b/tests/test_string.py
@@ -56,14 +56,15 @@ def test_nodes_file(self):
         node_file = os.path.join(self.string_output_dir, "nodes.tsv")
         self.assertTrue(os.path.isfile(node_file))
         node_df = pd.read_csv(node_file, sep="\t", header=0)
-        self.assertEqual((10, 6), node_df.shape)
+        self.assertEqual((11, 6), node_df.shape)
         self.assertEqual(['id', 'name', 'category', 'description', 'alias',
                           'provided_by'], list(node_df.columns))
-        self.assertListEqual(['ENSEMBL:ENSP00000000233', 'ENSEMBL:ENSP00000272298',
-                              'ENSEMBL:ENSP00000253401', 'ENSEMBL:ENSP00000401445',
-                              'ENSEMBL:ENSP00000418915', 'ENSEMBL:ENSP00000327801',
-                              'ENSEMBL:ENSP00000466298', 'ENSEMBL:ENSP00000232564',
-                              'ENSEMBL:ENSP00000393379', 'ENSEMBL:ENSP00000371253'],
+        self.assertListEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
+                              'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
+                              'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
+                              'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
+                              'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
+                              'ENSEMBL:ENSP00000371253'],
                              list(node_df.id.unique()))
 
     def test_edges_file(self):
@@ -71,7 +72,7 @@ def test_edges_file(self):
         edge_file = os.path.join(self.string_output_dir, "edges.tsv")
         self.assertTrue(os.path.isfile(edge_file))
         edge_df = pd.read_csv(edge_file, sep="\t", header=0)
-        self.assertEqual((9, 19), edge_df.shape)
+        self.assertEqual((10, 19), edge_df.shape)
         self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
                           'combined_score', 'neighborhood', 'neighborhood_transferred',
                           'fusion', 'cooccurence', 'homology', 'coexpression',

From e3887192d0d373c973d50f6b64b1e35a88dcc5d8 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Tue, 16 Jun 2020 21:58:11 -0700
Subject: [PATCH 09/10] Test failing for trivial reason

---
 tests/test_string.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_string.py b/tests/test_string.py
index c7a84122..d29e2f4b 100644
--- a/tests/test_string.py
+++ b/tests/test_string.py
@@ -59,7 +59,7 @@ def test_nodes_file(self):
         self.assertEqual((11, 6), node_df.shape)
         self.assertEqual(['id', 'name', 'category', 'description', 'alias',
                           'provided_by'], list(node_df.columns))
-        self.assertListEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
+        self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
                               'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
                               'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
                               'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',

From cf1672d984ef16b8a82f5d210c078b9d6acc8141 Mon Sep 17 00:00:00 2001
From: Justin Reese <justaddcoffee+github@gmail.com>
Date: Wed, 17 Jun 2020 16:48:14 -0700
Subject: [PATCH 10/10] Refactor such that Uniprot IDs are node properties in
 xrefs column (instead of edges)

---
 .../string_ppi/node_header.json               |  3 +-
 .../transform_utils/string_ppi/string_ppi.py  | 43 ++++++++-----------
 tests/test_string.py                          | 11 +++--
 3 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/kg_covid_19/transform_utils/string_ppi/node_header.json b/kg_covid_19/transform_utils/string_ppi/node_header.json
index c2b99f0d..d761756f 100644
--- a/kg_covid_19/transform_utils/string_ppi/node_header.json
+++ b/kg_covid_19/transform_utils/string_ppi/node_header.json
@@ -4,5 +4,6 @@
     "category",
     "description",
     "alias",
+    "xrefs",
     "provided_by"
-]
\ No newline at end of file
+]
diff --git a/kg_covid_19/transform_utils/string_ppi/string_ppi.py b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
index bc321003..4cc39261 100644
--- a/kg_covid_19/transform_utils/string_ppi/string_ppi.py
+++ b/kg_covid_19/transform_utils/string_ppi/string_ppi.py
@@ -18,16 +18,15 @@
 Write node and edge headers that look something like:
 
 Node: 
-id  name    category
-protein:1234    TBX4    Protein 
+id  name    category    xrefs   provided_by
+protein:1234    TBX4    biolink:Protein UniprotKB:123456    STRING 
+
+xrefs contains the UniprotKB CURIE for the protein, or is empty if none is mapped
 
 Edge: 
 subject edge_label  object  relation
 protein:1234    interacts_with  protein:4567    RO:0002434
 
-Also write edges that create xrefs between ENSP ids and UniprotKB IDs, like so:
-subject edge_label  object  relation
-protein:1234    bl:xrefs  protein:4567    RO:0002434
 
 """
 
@@ -188,6 +187,7 @@ def run(self, data_file: Optional[str] = None) -> None:
                                     'biolink:Gene',
                                     gene_informations['description'],
                                     f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
+                                    "",
                                     self.source_name
                                 ]
                             )
@@ -206,32 +206,25 @@ def run(self, data_file: Optional[str] = None) -> None:
                     # write node data
                     if protein not in seen_proteins:
                         seen_proteins.add(protein)
-                        write_node_edge_item(
-                            fh=node,
-                            header=self.node_header,
-                            data=[f"ENSEMBL:{protein}", "",
-                                  protein_node_type, "", "", self.source_name]
-                        )
 
                         # if we have an equivalent Uniprot ID for this Ensembl protein
                         # ID make an xref edge, and a node for the Uniprot ID
+                        uniprot_curie = ''
                         if protein in string_to_uniprot_id_map:
                             uniprot_curie = \
                                 f"UniprotKB:{string_to_uniprot_id_map[protein]}"
-                            write_node_edge_item(
-                                fh=node,
-                                header=self.node_header,
-                                data=[uniprot_curie, "",
-                                      protein_node_type, "", "", self.source_name])
-                            write_node_edge_item(
-                                fh=edge,
-                                header=self.edge_header,
-                                data=[f"ENSEMBL:{protein}",
-                                      "biolink:xrefs",
-                                      uniprot_curie,
-                                      "biolink:xrefs",
-                                      "uniprot",
-                                      ] + self.extra_header)
+
+                        write_node_edge_item(
+                            fh=node,
+                            header=self.node_header,
+                            data=[f"ENSEMBL:{protein}", "",
+                                  protein_node_type,
+                                  "",
+                                  "",
+                                  uniprot_curie,  # xref
+                                  self.source_name]
+                        )
+
 
                 # write edge data
                 write_node_edge_item(
diff --git a/tests/test_string.py b/tests/test_string.py
index d29e2f4b..5588db80 100644
--- a/tests/test_string.py
+++ b/tests/test_string.py
@@ -56,23 +56,26 @@ def test_nodes_file(self):
         node_file = os.path.join(self.string_output_dir, "nodes.tsv")
         self.assertTrue(os.path.isfile(node_file))
         node_df = pd.read_csv(node_file, sep="\t", header=0)
-        self.assertEqual((11, 6), node_df.shape)
-        self.assertEqual(['id', 'name', 'category', 'description', 'alias',
+        self.assertEqual((10, 7), node_df.shape)
+        self.assertEqual(['id', 'name', 'category', 'description', 'alias', 'xrefs',
                           'provided_by'], list(node_df.columns))
-        self.assertCountEqual(['UniprotKB:P84085', 'ENSEMBL:ENSP00000000233',
+        self.assertCountEqual(['ENSEMBL:ENSP00000000233',
                               'ENSEMBL:ENSP00000272298', 'ENSEMBL:ENSP00000253401',
                               'ENSEMBL:ENSP00000401445', 'ENSEMBL:ENSP00000418915',
                               'ENSEMBL:ENSP00000327801', 'ENSEMBL:ENSP00000466298',
                               'ENSEMBL:ENSP00000232564', 'ENSEMBL:ENSP00000393379',
                               'ENSEMBL:ENSP00000371253'],
                              list(node_df.id.unique()))
+        self.assertCountEqual('UniprotKB:P84085',
+                              node_df.loc[node_df['id'] ==
+                                          'ENSEMBL:ENSP00000000233'].xrefs.item())
 
     def test_edges_file(self):
         self.string.run()
         edge_file = os.path.join(self.string_output_dir, "edges.tsv")
         self.assertTrue(os.path.isfile(edge_file))
         edge_df = pd.read_csv(edge_file, sep="\t", header=0)
-        self.assertEqual((10, 19), edge_df.shape)
+        self.assertEqual((9, 19), edge_df.shape)
         self.assertEqual(['subject', 'edge_label', 'object', 'relation', 'provided_by',
                           'combined_score', 'neighborhood', 'neighborhood_transferred',
                           'fusion', 'cooccurence', 'homology', 'coexpression',