Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace internal tabs in data to be written out as TSVs #207

Merged
merged 2 commits into from
Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import gzip
import logging
import zipfile
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Union, TextIO
from tqdm import tqdm # type: ignore


class TransformError(Exception):
    """Base class for other exceptions."""
Expand Down Expand Up @@ -63,15 +62,23 @@ def get_header_items(table_data: Any) -> List:
return header_items


def write_node_edge_item(fh: Any, header: List, data: List, sep: str = '\t'):
def write_node_edge_item(fh: TextIO, header: List, data: List, sep: str = '\t',
sanitize_sep_char=True):
"""Write out a single line for a node or an edge in *.tsv
:param fh: file handle of node or edge file
:param header: list of header items
:param data: data for line to write out
:param sep: separator [\t]
:param sanitize_sep_char: if True, replace each occurrence of the sep
character present in `data` with its hex code (e.g. a tab becomes 0x9)
"""
if len(header) != len(data):
raise Exception('Header and data are not the same length.')

if sanitize_sep_char:
for i in range(len(data)):
data[i] = data[i].replace(sep, hex(ord(sep)))

try:
fh.write(sep.join(data) + "\n")
except IOError:
Expand Down
45 changes: 45 additions & 0 deletions tests/test_transform_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import tempfile
import unittest

from kg_covid_19.utils import write_node_edge_item


class TestTransformUtils(unittest.TestCase):
    """Tests for ``write_node_edge_item``.

    Each test gets its own temporary directory and an open, writable file
    handle; both are released automatically when the test finishes, whether
    it passes or fails.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """Define the header and a matching row of data shared by the tests."""
        cls.header = ['id', 'name', 'category']
        cls.valid_data = ['id1234', '1234', 'biolink:Gene']

    def setUp(self) -> None:
        """Create a private temp directory and open the output file in it."""
        # A per-test directory avoids clashing with other processes that
        # might also write 'some.tsv' into the shared system temp dir.
        tmp = tempfile.TemporaryDirectory()
        self.addCleanup(tmp.cleanup)
        self.tempdir = tmp.name
        self.outfile = os.path.join(self.tempdir, 'some.tsv')
        self.fh = open(self.outfile, 'w')
        # Cleanups run in reverse order: the handle is closed before the
        # directory is removed. Closing an already-closed file is a no-op,
        # so tests may still close the handle themselves to flush output.
        self.addCleanup(self.fh.close)

    def test_write_node_edge_item_bad_fh(self):
        """A non-file ``fh`` must make the call raise."""
        with self.assertRaises(Exception):
            write_node_edge_item(fh='',
                                 header=self.header,
                                 data=list(self.valid_data))

    def test_write_node_edge_item(self):
        """Data without separators is written out unchanged."""
        # Pass a copy: the function under test assigns into `data` in place,
        # and `valid_data` is shared by every test in the class.
        write_node_edge_item(fh=self.fh,
                             header=self.header,
                             data=list(self.valid_data))
        self.fh.close()
        self.assertTrue(os.path.exists(self.outfile))
        with open(self.outfile, 'r') as tsvfile:
            lines = tsvfile.read().split('\n')
        self.assertEqual(['id1234', '1234', 'biolink:Gene'],
                         lines[0].split('\t'))

    def test_write_node_edge_item_with_tabs_in_data(self):
        """Tabs embedded in a field are replaced by ``0x9`` when sanitizing."""
        write_node_edge_item(fh=self.fh,
                             header=self.header,
                             data=['id1234', '1234',
                                   'biolink:Gene\tbiolink:Gene\t'],
                             sanitize_sep_char=True)
        self.fh.close()
        self.assertTrue(os.path.exists(self.outfile))
        with open(self.outfile, 'r') as tsvfile:
            lines = tsvfile.read().split('\n')
        self.assertEqual(['id1234',
                          '1234',
                          'biolink:Gene0x9biolink:Gene0x9'],
                         lines[0].split('\t'))