diff --git a/download.yaml b/download.yaml index c69918c2..12fb763c 100644 --- a/download.yaml +++ b/download.yaml @@ -30,6 +30,9 @@ # drug - target interactions from Drug Central url: http://unmtid-shinyapps.net/download/drug.target.interaction.tsv.gz local_name: drug.target.interaction.tsv.gz +- + url: http://unmtid-shinyapps.net/download/tcrd.zip + local_name: tcrd.zip # # PPI from STRING DB diff --git a/kg_covid_19/transform_utils/drug_central/drug_central.py b/kg_covid_19/transform_utils/drug_central/drug_central.py index 6c8f2a01..f55f0493 100644 --- a/kg_covid_19/transform_utils/drug_central/drug_central.py +++ b/kg_covid_19/transform_utils/drug_central/drug_central.py @@ -1,16 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - - +import csv import gzip import logging import os +import re +import tempfile +from collections import defaultdict from typing import Dict, List, Optional from kg_covid_19.transform_utils.transform import Transform from kg_covid_19.utils.transform_utils import write_node_edge_item, \ - get_item_by_priority, ItemInDictNotFound, parse_header, data_to_dict + get_item_by_priority, ItemInDictNotFound, parse_header, data_to_dict, \ + unzip_to_tempdir """ Ingest drug - drug target interactions from Drug Central @@ -27,6 +30,7 @@ class DrugCentralTransform(Transform): def __init__(self, input_dir: str = None, output_dir: str = None) -> None: source_name = "drug_central" super().__init__(source_name, input_dir, output_dir) # set some variables + self.node_header = ['id', 'name', 'category', 'tclin', 'tchem'] def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") -> None: """Method is called and performs needed transformations to process the Drug @@ -35,6 +39,7 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") -> interactions_file = os.path.join(self.input_base_dir, "drug.target.interaction.tsv.gz") + tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip") 
def tsv_to_dict(input_file: str, col_for_key: str) -> dict:
    """Load a tab-separated file with a header row into a dict keyed by one column.

    Each data row is parsed with ``csv.DictReader`` and stored under the value
    found in column ``col_for_key``. When several rows share the same key, the
    last row read wins.

    A plain ``dict`` is returned (not a ``defaultdict``) so that subscripting
    with an absent key raises ``KeyError`` instead of silently inserting an
    empty entry and making later membership tests succeed.

    :param input_file: path of the TSV file to read
    :param col_for_key: header name of the column whose values become the keys
    :return: dict mapping each key value to its parsed row
    :raises KeyError: if ``col_for_key`` is not a column of the file
    """
    rows_by_key: Dict[str, dict] = {}
    # newline='' is what the csv module expects so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(input_file, encoding="utf-8", newline="") as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row in reader:
            try:
                key = row[col_for_key]
            except KeyError:
                raise KeyError("Column %r not found in header of %s" %
                               (col_for_key, input_file)) from None
            rows_by_key[key] = row
    return rows_by_key


def _pick_single_file(filenames: List[str], prefix: str, zip_file: str) -> str:
    """Return the only name in ``filenames`` of the form ``<prefix>_*.tsv``.

    The whole filename must match, so siblings such as ``tclin_x.tsv.md5`` are
    not mistaken for a second data file.

    :raises RuntimeError: if zero or more than one filename matches
    """
    pattern = re.compile(r'%s_.*\.tsv' % re.escape(prefix))
    # sorted only to make the "more than one" error message deterministic
    matches = sorted(f for f in filenames if pattern.fullmatch(f))
    if len(matches) > 1:
        raise RuntimeError("Found more than one %s file:\n%s" %
                           (prefix, "\n".join(matches)))
    if not matches:
        raise RuntimeError("Couldn't find %s file in zipfile %s" %
                           (prefix, zip_file))
    return matches[0]


def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
    """Unpack the TCRD zip file and locate its tclin and tchem TSV files.

    ``unzip_to_tempdir`` is called with ``zip_file`` and ``output_dir``
    (presumably it extracts the archive into ``output_dir`` -- confirm against
    kg_covid_19.utils.transform_utils). ``output_dir`` is then listed once and
    searched for exactly one ``tclin_*.tsv`` and exactly one ``tchem_*.tsv``.

    :param zip_file: path of the zip archive
    :param output_dir: directory that is listed after unzipping
    :return: two-item list ``[tclin_path, tchem_path]``, each joined onto
        ``output_dir``
    :raises RuntimeError: if either file is missing or is ambiguous
    """
    unzip_to_tempdir(zip_file, output_dir)
    extracted = os.listdir(output_dir)
    # tclin is resolved before tchem, so a problem with tclin is reported first
    return [os.path.join(output_dir,
                         _pick_single_file(extracted, prefix, zip_file))
            for prefix in ('tclin', 'tchem')]
# The two functions below are test methods of TestDrugCentral
# (unittest.TestCase); they take ``self`` and belong in that class body.
@parameterized.expand([
    ('tclin', 'tests/resources/drug_central/tclin_SNIPPET.tsv', 2, 'Q13131',
     'drug_name', 'cepharanthine'),
    ('tchem', 'tests/resources/drug_central/tchem_SNIPPET.tsv', 2, 'P21917',
     'drug_name', 'brexpiprazole'),
])
def test_tsv_to_dict(self, name, file, expected_rows, test_key, sub_key,
                     test_val) -> None:
    """tsv_to_dict returns a dict keyed by the 'uniprot' column.

    For each snippet file, checks the number of keys, that ``test_key`` is
    present, and that its row maps ``sub_key`` to ``test_val``.
    """
    ret_val = tsv_to_dict(file, 'uniprot')
    self.assertIsInstance(ret_val, dict)
    self.assertEqual(len(ret_val), expected_rows)
    self.assertIn(test_key, ret_val)
    self.assertIn(sub_key, ret_val[test_key])
    self.assertEqual(ret_val[test_key][sub_key], test_val)


def test_unzip_and_get_tclin_tchem(self) -> None:
    """unzip_and_get_tclin_tchem returns the tclin and tchem paths from test.zip."""
    zip_file = "tests/resources/drug_central/test.zip"
    # TemporaryDirectory removes the extracted files when the test finishes;
    # a bare mkdtemp() would leave them behind on every run.
    with tempfile.TemporaryDirectory() as tempdir:
        tclin, tchem = unzip_and_get_tclin_tchem(zip_file, tempdir)
        self.assertIsInstance(tclin, str)
        self.assertIsInstance(tchem, str)
        self.assertEqual(tclin, os.path.join(tempdir, 'tclin_05122020.tsv'))
        self.assertEqual(tchem,
                         os.path.join(tempdir, 'tchem_drugs_05122020.tsv'))