Skip to content

Commit

Permalink
Merge b3e5753 into d06b6e8
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed May 26, 2020
2 parents d06b6e8 + b3e5753 commit b9464bb
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 7 deletions.
3 changes: 3 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
# drug - target interactions from Drug Central
url: http://unmtid-shinyapps.net/download/drug.target.interaction.tsv.gz
local_name: drug.target.interaction.tsv.gz
-
url: http://unmtid-shinyapps.net/download/tcrd.zip
local_name: tcrd.zip

#
# PPI from STRING DB
Expand Down
65 changes: 60 additions & 5 deletions kg_covid_19/transform_utils/drug_central/drug_central.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import csv
import gzip
import logging
import os
import re
import tempfile
from collections import defaultdict

from typing import Dict, List, Optional

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, ItemInDictNotFound, parse_header, data_to_dict
get_item_by_priority, ItemInDictNotFound, parse_header, data_to_dict, \
unzip_to_tempdir

"""
Ingest drug - drug target interactions from Drug Central
Expand All @@ -27,6 +30,7 @@ class DrugCentralTransform(Transform):
def __init__(self, input_dir: Optional[str] = None,
             output_dir: Optional[str] = None) -> None:
    """Initialise the Drug Central transform.

    Both directories are forwarded unchanged to ``Transform.__init__``
    together with the source name ``"drug_central"``.

    Args:
        input_dir: directory holding the downloaded input files, or None.
            NOTE(review): presumably None makes the base class fall back
            to a default location - confirm in Transform.
        output_dir: directory the node/edge files are written under, or
            None (same caveat as above).
    """
    source_name = "drug_central"
    super().__init__(source_name, input_dir, output_dir)  # set some variables
    # Column order of the node file. 'tclin' and 'tchem' hold stringified
    # booleans written by run().
    self.node_header = ['id', 'name', 'category', 'tclin', 'tchem']

def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") -> None:
"""Method is called and performs needed transformations to process the Drug
Expand All @@ -35,6 +39,7 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->

interactions_file = os.path.join(self.input_base_dir,
"drug.target.interaction.tsv.gz")
tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
os.makedirs(self.output_dir, exist_ok=True)
drug_node_type = "biolink:Drug"
gene_curie_prefix = "UniProtKB:"
Expand All @@ -45,6 +50,13 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
self.edge_header = ['subject', 'edge_label', 'object', 'relation',
'provided_by', 'comment']

# unzip tcrd.zip and get tchem and tclin filenames
tempdir = tempfile.mkdtemp()
(tclin_file, tchem_file) = unzip_and_get_tclin_tchem(tclin_chem_zip_file, tempdir)

tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
gzip.open(interactions_file, 'rt') as interactions:
Expand Down Expand Up @@ -79,15 +91,22 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
header=self.node_header,
data=[drug_id,
items_dict['DRUG_NAME'],
drug_node_type])
drug_node_type,
str(False),
str(False)])

for gene_id in gene_ids:
gene_id = gene_curie_prefix + gene_id
is_tclin = True if gene_ids[0] in tclin_dict else False
is_tchem = True if gene_ids[0] in tchem_dict else False

write_node_edge_item(fh=node,
header=self.node_header,
data=[gene_id,
items_dict['GENE'],
gene_node_type])
gene_node_type,
str(is_tclin),
str(is_tchem)])

# WRITE EDGES
# ['subject', 'edge_label', 'object', 'relation', 'provided_by',
Expand All @@ -104,6 +123,42 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
return None


def tsv_to_dict(input_file: str, col_for_key: str) -> dict:
    """Load a tab-separated file into a dict keyed on one of its columns.

    The first line of the file is taken as the header. Every following row
    is parsed into a mapping of column name -> value and stored under the
    value that row has in `col_for_key`. When several rows share the same
    key, the last one read wins.

    Args:
        input_file: path of the TSV file to read.
        col_for_key: header name of the column whose values become the
            keys of the returned dict.

    Returns:
        A plain dict mapping each key-column value to its parsed row.

    Raises:
        KeyError: if a data row is read and `col_for_key` is not one of the
            header columns.
    """
    # newline='' is what the csv module asks for, so that it can handle line
    # endings (including ones embedded in quoted fields) itself.
    with open(input_file, newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        # A plain dict rather than defaultdict(list): the values are rows,
        # not lists, and a lookup of an unknown key must raise instead of
        # silently inserting an empty entry.
        return {row[col_for_key]: row for row in reader}


def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
    """Unpack the archive and locate its tclin and tchem TSV files.

    `unzip_to_tempdir` is called with `zip_file` and `output_dir`
    (presumably extracting the archive into `output_dir` - confirm against
    that helper). `output_dir` is then scanned, first for a file name
    matching ``tclin_.*\\.tsv`` and then for one matching ``tchem_.*\\.tsv``.
    Exactly one match is required for each.

    Args:
        zip_file: path of the zip archive.
        output_dir: directory that is scanned for the two files.

    Returns:
        A two-item list: [path of tclin file, path of tchem file], each
        joined onto `output_dir`.

    Raises:
        RuntimeError: if either kind of file is missing or is matched by
            more than one directory entry.
    """
    unzip_to_tempdir(zip_file, output_dir)

    wanted = (
        ('tclin', r'tclin_.*\.tsv'),
        ('tchem', r'tchem_.*\.tsv'),
    )
    located: List[str] = []
    for label, pattern in wanted:
        # re.match anchors at the start of the name only
        candidates = [entry for entry in os.listdir(output_dir)
                      if re.match(pattern, entry)]
        if not candidates:
            raise RuntimeError(
                "Couldn't find %s file in zipfile %s" % (label, zip_file))
        if len(candidates) > 1:
            raise RuntimeError("Found more than one %s file:\n%s" %
                               (label, "\n".join(candidates)))
        located.append(os.path.join(output_dir, candidates[0]))

    return located


def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
"""Methods processes a line of text from Drug Central.
Expand Down
3 changes: 3 additions & 0 deletions tests/resources/drug_central/tchem_SNIPPET.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id
P42338 PK3CB_HUMAN copanlisib 8.431797981262207 IC50 INHIBITOR SCIENTIFIC LITERATURE http://www.ncbi.nlm.nih.gov/pubmed/24170767 COC1=C(OCCCN2CCOCC2)C=CC2=C1N=C(NC(=O)C1=CN=C(N)N=C1)N1CCN=C21 CHEMBL3218576
P21917 DRD4_HUMAN brexpiprazole 8.200659451 Ki AGONIST SCIENTIFIC LITERATURE http://www.ncbi.nlm.nih.gov/pubmed/24947465 O=C1NC2=C(C=C1)C=CC(OCCCCN1CCN(CC1)C1=C3C=CSC3=CC=C1)=C2 CHEMBL2105760
3 changes: 3 additions & 0 deletions tests/resources/drug_central/tclin_SNIPPET.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id
P19838 NFKB1_HUMAN cepharanthine INHIBITOR SCIENTIFIC LITERATURE http://www.ncbi.nlm.nih.gov/pubmed/31132753 [H][C@]12CC3=CC(OC4=CC=C(C[C@@H]5N(C)CCC6=CC7=C(OCO7)C(OC7=C(OC)C=C(CCN1C)C2=C7)=C56)C=C4)=C(OC)C=C3 CHEMBL449782
Q13131 AAPK1_HUMAN cepharanthine INHIBITOR SCIENTIFIC LITERATURE http://www.ncbi.nlm.nih.gov/pubmed/31132753 [H][C@]12CC3=CC(OC4=CC=C(C[C@@H]5N(C)CCC6=CC7=C(OCO7)C(OC7=C(OC)C=C(CCN1C)C2=C7)=C56)C=C4)=C(OC)C=C3 CHEMBL449782
Binary file added tests/resources/drug_central/test.zip
Binary file not shown.
28 changes: 26 additions & 2 deletions tests/test_drug_central.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import os
import tempfile
import unittest

from kg_covid_19.transform_utils.drug_central.drug_central import \
parse_drug_central_line
parse_drug_central_line, unzip_and_get_tclin_tchem, tsv_to_dict
from kg_covid_19.utils.transform_utils import parse_header
from parameterized import parameterized


class TestDrugCentral(unittest.TestCase):

def setUp(self) -> None:
    """Open the Drug Central interactions snippet used by the tests."""
    # Open the fixture once, from its location under drug_central/.
    self.dti_fh = open(
        'tests/resources/drug_central/drug.target.interaction_SNIPPET.tsv', 'rt')
    # Make sure the handle is released after every test, pass or fail.
    # close() is idempotent, so this is harmless if something else closes
    # the file as well.
    self.addCleanup(self.dti_fh.close)

@parameterized.expand([
('STRUCT_ID', '4'),
Expand Down Expand Up @@ -38,4 +41,25 @@ def test_parse_drug_central_line(self, key, value):
self.assertTrue(key in parsed)
self.assertEqual(value, parsed[key])

@parameterized.expand([
    ('tclin', 'tests/resources/drug_central/tclin_SNIPPET.tsv', 2, 'Q13131', 'drug_name', 'cepharanthine'),
    ('tchem', 'tests/resources/drug_central/tchem_SNIPPET.tsv', 2, 'P21917', 'drug_name', 'brexpiprazole'),
])
def test_tsv_to_dict(self, name, file, expected_rows, test_key, sub_key, test_val) -> None:
    """Check tsv_to_dict on a snippet file keyed by the 'uniprot' column.

    Each parameter tuple is (case name, TSV path, expected number of keys,
    a key that must be present, a column that must be present in that row,
    the value expected in that column). `name` is only the first element
    of the tuple and is not used in the body.
    """
    ret_val = tsv_to_dict(file, 'uniprot')
    # Dedicated assertion methods report the offending values on failure.
    self.assertIsInstance(ret_val, dict)
    self.assertEqual(len(ret_val), expected_rows)
    self.assertIn(test_key, ret_val)
    self.assertIn(sub_key, ret_val[test_key])
    self.assertEqual(ret_val[test_key][sub_key], test_val)

def test_unzip_and_get_tclin_tchem(self) -> None:
    """Check that both TSV paths are found in the test archive."""
    zip_file = "tests/resources/drug_central/test.zip"
    # TemporaryDirectory removes the extraction directory when the block
    # exits, so the test leaves nothing behind even when it fails.
    with tempfile.TemporaryDirectory() as tempdir:
        (tclin, tchem) = unzip_and_get_tclin_tchem(zip_file, tempdir)
        self.assertIsInstance(tclin, str)
        self.assertIsInstance(tchem, str)
        self.assertEqual(tclin, os.path.join(tempdir, 'tclin_05122020.tsv'))
        self.assertEqual(tchem, os.path.join(tempdir, 'tchem_drugs_05122020.tsv'))

0 comments on commit b9464bb

Please sign in to comment.