Skip to content

Commit

Permalink
Merge pull request #96 from Knowledge-Graph-Hub/fix_data_errors_drug_…
Browse files Browse the repository at this point in the history
…central

Fix data errors drug central
  • Loading branch information
justaddcoffee committed Apr 15, 2020
2 parents 7175d04 + 62b1a46 commit 487136a
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 29 deletions.
65 changes: 36 additions & 29 deletions kg_covid_19/transform_utils/drug_central/drug_central.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import write_node_edge_item, \
get_item_by_priority, ItemInDictNotFound, parse_header
get_item_by_priority, ItemInDictNotFound, parse_header, data_to_dict

"""
Ingest drug - drug target interactions from Drug Central
Expand All @@ -28,13 +28,17 @@ def __init__(self, input_dir: str = None, output_dir: str = None) -> None:
source_name = "drug_central"
super().__init__(source_name, input_dir, output_dir) # set some variables

def run(self) -> None:
"""Method is called and performs needed transformations to process the Drug Central data, additional information
on this data can be found in the comment at the top of this script"""
def run(self, species="Homo sapiens") -> None:
"""Perform the transformations needed to process the Drug Central data.
Only interaction lines whose ORGANISM column equals `species` are ingested.
Additional information on this data can be found in the comment at the top of this script."""

interactions_file = os.path.join(self.input_base_dir, "drug.target.interaction.tsv.gz")
interactions_file = os.path.join(self.input_base_dir,
"drug.target.interaction.tsv.gz")
os.makedirs(self.output_dir, exist_ok=True)
drug_node_type = "biolink:Drug"
gene_curie_prefix = "UniProtKB:"
drug_curie_prefix = "DrugCentral:"
gene_node_type = "biolink:Gene"
drug_gene_edge_label = "biolink:interacts_with"
drug_gene_edge_relation = "RO:0002436" # molecularly interacts with
Expand All @@ -52,21 +56,21 @@ def run(self) -> None:
for line in interactions:
items_dict = parse_drug_central_line(line, header_items)

if 'ORGANISM' not in items_dict or items_dict['ORGANISM'] != species:
continue

# get gene ID
try:
gene_id = get_item_by_priority(items_dict, ['ACCESSION'])
gene_id_string = get_item_by_priority(items_dict, ['ACCESSION'])
gene_ids = gene_id_string.split('|')
except ItemInDictNotFound:
# lines with no ACCESSION entry only contain drug info, no target
# info - not ingesting these
logging.info(
"No gene information for this line:\n{}\nskipping".format(line))
continue

# get drug ID
drug_id = get_item_by_priority(items_dict,
['ACT_SOURCE_URL',
'MOA_SOURCE_URL',
'DRUG_NAME'])
drug_id = drug_curie_prefix + get_item_by_priority(items_dict,
['DRUG_NAME'])

# WRITE NODES
# drug - ['id', 'name', 'category']
Expand All @@ -76,21 +80,23 @@ def run(self) -> None:
items_dict['DRUG_NAME'],
drug_node_type])

write_node_edge_item(fh=node,
header=self.node_header,
data=[gene_id,
items_dict['GENE'],
gene_node_type])

# WRITE EDGES
# ['subject', 'edge_label', 'object', 'relation', 'comment']
write_node_edge_item(fh=edge,
header=self.edge_header,
data=[drug_id,
drug_gene_edge_label,
gene_id,
drug_gene_edge_relation,
items_dict['ACT_COMMENT']])
for gene_id in gene_ids:
gene_id = gene_curie_prefix + gene_id
write_node_edge_item(fh=node,
header=self.node_header,
data=[gene_id,
items_dict['GENE'],
gene_node_type])

# WRITE EDGES
# ['subject', 'edge_label', 'object', 'relation', 'comment']
write_node_edge_item(fh=edge,
header=self.edge_header,
data=[drug_id,
drug_gene_edge_label,
gene_id,
drug_gene_edge_relation,
items_dict['ACT_COMMENT']])

return None

Expand All @@ -106,8 +112,9 @@ def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
item_dict: A dictionary of header items and a processed Drug Central string.
"""

items = this_line.strip().split("\t")
item_dict = dict(zip(header_items, items))
data = this_line.strip().split("\t")
data = [i.replace('"', '') for i in data]
item_dict = data_to_dict(header_items, data)

return item_dict

16 changes: 16 additions & 0 deletions tests/resources/drug.target.interaction_SNIPPET.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"DRUG_NAME" "STRUCT_ID" "TARGET_NAME" "TARGET_CLASS" "ACCESSION" "GENE" "SWISSPROT" "ACT_VALUE" "ACT_UNIT" "ACT_TYPE" "ACT_COMMENT" "ACT_SOURCE" "RELATION" "MOA" "MOA_SOURCE" "ACT_SOURCE_URL" "MOA_SOURCE_URL" "ACTION_TYPE" "TDL" "ORGANISM"
"levobupivacaine" 4 "Sodium channel protein type 4 subunit alpha" "Ion channel" "P35499" "SCN4A" "SCN4A_HUMAN" "WOMBAT-PK" 1 "CHEMBL" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749" "BLOCKER" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Cytochrome P450 2D6" "Enzyme" "P10635" "CYP2D6" "CP2D6_HUMAN" 6.706858517 "IC50" "DRUGMATRIX: CYP450, 2D6 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin)" "DRUG MATRIX" "=" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Potassium voltage-gated channel subfamily H member 2" "Ion channel" "Q12809" "KCNH2" "KCNH2_HUMAN" 4.89 "IC50" "Inhibition of wild-type human ERG channel expressed in HEK293 cells assessed as blockade of potassium tail current by whole-cell patch clamp technique" "CHEMBL" "=" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Potassium voltage-gated channel subfamily D member 3" "Ion channel" "Q9UK17" "KCND3" "KCND3_HUMAN" 4.5 "IC50" "WOMBAT-PK" "=" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Prostaglandin E2 receptor EP1 subtype" "GPCR" "P34995" "PTGER1" "PE2R1_HUMAN" "WOMBAT-PK" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Sodium channel protein type 1 subunit alpha" "Ion channel" "P35498" "SCN1A" "SCN1A_HUMAN" 5.79 "IC50" "WOMBAT-PK" "=" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "Potassium voltage-gated channel subfamily A member 5" "Ion channel" "P22460" "KCNA5" "KCNA5_HUMAN" "WOMBAT-PK" "Tclin" "Homo sapiens"
"levobupivacaine" 4 "5-hydroxytryptamine receptor 3A" "Ion channel" "P46098" "HTR3A" "5HT3A_HUMAN" "WOMBAT-PK" "Tclin" "Homo sapiens"
"(S)-nicardipine" 5 "Voltage-gated L-type calcium channel" "Ion channel" "Q01668|Q13936" "CACNA1C|CACNA1D" "CAC1C_HUMAN|CAC1D_HUMAN" "Mechanism of Action" "DRUG LABEL" 1 "DRUG LABEL" "http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf" "http://www.accessdata.fda.gov/drugsatfda_docs/label/2009/022276s003lbl.pdf" "BLOCKER" "Tclin|Tclin" "Homo sapiens"
"(S)-nitrendipine" 6 "Intermediate conductance calcium-activated potassium channel protein 4" "Ion channel" "O15554" "KCNN4" "KCNN4_HUMAN" 7.599999905 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "BLOCKER" "Tchem" "Homo sapiens"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1F" "Ion channel" "O60840" "CACNA1F" "CAC1F_HUMAN" 6 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "NEGATIVE MODULATOR" "Tchem" "Homo sapiens"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1C" "Ion channel" "P22002" "Cacna1c" "CAC1C_RAT" 6 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "NEGATIVE MODULATOR" "Rattus norvegicus"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1S" "Ion channel" "Q02485" "Cacna1s" "CAC1S_RAT" 6 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "NEGATIVE MODULATOR" "Rattus norvegicus"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1D" "Ion channel" "Q01668" "CACNA1D" "CAC1D_HUMAN" 8.39999961853027 "IC50" "Mechanism of Action" "IUPHAR" "=" 1 "IUPHAR" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "BLOCKER" "Tclin" "Homo sapiens"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1C" "Ion channel" "Q13936" "CACNA1C" "CAC1C_HUMAN" "Mechanism of Action" "SCIENTIFIC LITERATURE" 1 "SCIENTIFIC LITERATURE" "http://www.ncbi.nlm.nih.gov/pubmed/17276408" "BLOCKER" "Tclin" "Homo sapiens"
41 changes: 41 additions & 0 deletions tests/test_drug_central.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import unittest

from kg_covid_19.transform_utils.drug_central.drug_central import \
parse_drug_central_line
from kg_covid_19.utils.transform_utils import parse_header
from parameterized import parameterized


class TestDrugCentral(unittest.TestCase):
    """Tests for parsing lines of the Drug Central drug-target interaction file."""

    def setUp(self) -> None:
        """Open the fixture snippet; the handle is closed after each test."""
        self.dti_fh = open('tests/resources/drug.target.interaction_SNIPPET.tsv', 'rt')
        # Register the close immediately so the file handle is released even
        # when a test fails (the original never closed it, leaking one handle
        # per parameterized case).
        self.addCleanup(self.dti_fh.close)

    @parameterized.expand([
        ('STRUCT_ID', '4'),
        ('TARGET_NAME', 'Sodium channel protein type 4 subunit alpha'),
        ('TARGET_CLASS', 'Ion channel'),
        ('ACCESSION', 'P35499'),
        ('GENE', 'SCN4A'),
        ('SWISSPROT', 'SCN4A_HUMAN'),
        ('ACT_VALUE', ''),
        ('ACT_UNIT', ''),
        ('ACT_TYPE', ''),
        ('ACT_COMMENT', ''),
        ('ACT_SOURCE', 'WOMBAT-PK'),
        ('RELATION', ''),
        ('MOA', '1'),
        ('MOA_SOURCE', 'CHEMBL'),
        ('ACT_SOURCE_URL', ''),
        ('MOA_SOURCE_URL', 'https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1200749'),
        ('ACTION_TYPE', 'BLOCKER'),
        ('TDL', 'Tclin'),
        ('ORGANISM', 'Homo sapiens'),
    ])
    def test_parse_drug_central_line(self, key, value):
        """The first data row of the snippet maps header `key` to `value`."""
        # First line of the fixture is the header, second is the first data row.
        header = parse_header(self.dti_fh.readline())
        line = self.dti_fh.readline()
        parsed = parse_drug_central_line(line, header)
        # assertIn reports the missing key and the dict contents on failure.
        self.assertIn(key, parsed)
        self.assertEqual(value, parsed[key])


0 comments on commit 487136a

Please sign in to comment.