Skip to content

Commit

Permalink
Merge pull request #199 from Knowledge-Graph-Hub/refactor_tclin_tchem…
Browse files Browse the repository at this point in the history
…_info

Use TDL column for Tclin, Tchem, Tbio in Drug Central ingest
  • Loading branch information
deepakunni3 committed Jun 19, 2020
2 parents e6d1f87 + c9d3ab8 commit b9ad54e
Show file tree
Hide file tree
Showing 9 changed files with 143 additions and 196 deletions.
25 changes: 0 additions & 25 deletions .github/ISSUE_TEMPLATE/bug-report.md

This file was deleted.

15 changes: 4 additions & 11 deletions .github/ISSUE_TEMPLATE/dataset-request.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,14 @@ assignees: ''

---

####Name of the dataset
A clear and concise description of the dataset
**Name of the dataset**
A clear and concise description of the dataset and where you can download it from.

####URL
A URL from which the data can be retrieved

###Mapping or relevant fields
**Mapping or relevant fields**
A clear and concise description of which fields you would want to be ingested.

If possible, highlight which fields map to nodes and which fields map to edges.
Refer to [Data Preparation](https://github.com/NCATS-Tangerine/kgx/blob/master/data-preparation.md) for guidelines on how the final transformed data should be represented.

###Usefulness
A description of what the transformed data might be useful for - in particular, what
value it might add to ML efforts and/or what useful queries it might allow.

##Additional context
**Additional context**
Add any other context, requests, concerns.
22 changes: 20 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,27 @@ kg-covid-19

KG hub to produce a knowledge graph for COVID-19 and SARS-COV-2

Documentation
See the `repository's wiki for more details <https://github.com/kg-emerging-viruses/kg-emerging-viruses/wiki>`_.

How do I install this package?
----------------------------------------------

.. code:: shell
git clone https://github.com/Knowledge-Graph-Hub/kg-covid-19
cd kg-covid-19
pip install .
pip install -r requirements.txt
How do I use this package?
----------------------------------------------
See the `repository's wiki for information about this project <https://github.com/kg-emerging-viruses/kg-emerging-viruses/wiki>`_.

.. code:: shell
python run.py download
python run.py transform
python run.py merge
Tests Coverage
----------------------------------------------
Expand Down
150 changes: 69 additions & 81 deletions kg_covid_19/transform_utils/drug_central/drug_central.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
Essentially just ingests and transforms this file:
http://unmtid-shinyapps.net/download/drug.target.interaction.tsv.gz
And extracts Drug -> Gene interactions
And extracts Drug -> Protein interactions
"""


Expand All @@ -30,33 +30,28 @@ class DrugCentralTransform(Transform):
def __init__(self, input_dir: str = None, output_dir: str = None) -> None:
source_name = "drug_central"
super().__init__(source_name, input_dir, output_dir) # set some variables
self.node_header = ['id', 'name', 'category', 'tclin', 'tchem', 'provided_by']
self.node_header = ['id', 'name', 'category', 'TDL', 'provided_by']

def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") -> None:
def run(self, data_file: Optional[str] = None,
species: str = "Homo sapiens") -> None:
"""Method is called and performs needed transformations to process the Drug
Central data, additional information
on this data can be found in the comment at the top of this script"""

if data_file is None:
data_file = "drug.target.interaction.tsv.gz"
interactions_file = os.path.join(self.input_base_dir,
"drug.target.interaction.tsv.gz")
tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
data_file)
os.makedirs(self.output_dir, exist_ok=True)
drug_node_type = "biolink:Drug"
gene_curie_prefix = "UniProtKB:"
uniprot_curie_prefix = "UniProtKB:"
drug_curie_prefix = "DrugCentral:"
gene_node_type = "biolink:Gene"
drug_gene_edge_label = "biolink:interacts_with"
drug_gene_edge_relation = "RO:0002436" # molecularly interacts with
protein_node_type = "biolink:Protein"
drug_protein_edge_label = "biolink:molecularly_interacts_with"
drug_protein_edge_relation = "RO:0002436" # molecularly interacts with
self.edge_header = ['subject', 'edge_label', 'object', 'relation',
'provided_by', 'comment']

# unzip tcrd.zip and get tchem and tclin filenames
tempdir = tempfile.mkdtemp()
(tclin_file, tchem_file) = unzip_and_get_tclin_tchem(tclin_chem_zip_file, tempdir)

tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

with open(self.output_node_file, 'w') as node, \
open(self.output_edge_file, 'w') as edge, \
gzip.open(interactions_file, 'rt') as interactions:
Expand All @@ -66,101 +61,68 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->

header_items = parse_header(interactions.readline())

seen_proteins: dict = defaultdict(int)
seen_drugs: dict = defaultdict(int)

for line in interactions:
items_dict = parse_drug_central_line(line, header_items)

if 'ORGANISM' not in items_dict or items_dict['ORGANISM'] != species:
continue

# get gene ID
# get protein ID
try:
gene_id_string = get_item_by_priority(items_dict, ['ACCESSION'])
gene_ids = gene_id_string.split('|')
protein_dict = items_dict_to_protein_data_dict(items_dict)

except ItemInDictNotFound:
# lines with no ACCESSION entry only contain drug info, no target
# info - not ingesting these
continue
except ValueError:
logging.error("Value error while parsing line")
continue

# get drug ID
drug_id = drug_curie_prefix + get_item_by_priority(items_dict,
['STRUCT_ID'])

# WRITE NODES
# drug - ['id', 'name', 'category']
write_node_edge_item(fh=node,
header=self.node_header,
data=[drug_id,
items_dict['DRUG_NAME'],
drug_node_type,
str(False),
str(False),
self.source_name])

for gene_id in gene_ids:
gene_id = gene_curie_prefix + gene_id
is_tclin = True if gene_ids[0] in tclin_dict else False
is_tchem = True if gene_ids[0] in tchem_dict else False

# Write drug node
if drug_id not in seen_drugs:
write_node_edge_item(fh=node,
header=self.node_header,
data=[gene_id,
items_dict['GENE'],
gene_node_type,
str(is_tclin),
str(is_tchem),
data=[drug_id,
items_dict['DRUG_NAME'],
drug_node_type,
'', # TDL (not applicable for drugs)
self.source_name])
seen_drugs[drug_id] += 1

for key, (uniprot_id, name, tdl) in protein_dict.items():
protein_id = uniprot_curie_prefix + uniprot_id

if protein_id not in seen_proteins:
write_node_edge_item(fh=node,
header=self.node_header,
data=[protein_id,
name,
protein_node_type,
tdl,
self.source_name])
seen_proteins[protein_id] += 1

# WRITE EDGES
# ['subject', 'edge_label', 'object', 'relation', 'provided_by',
# 'comment']
write_node_edge_item(fh=edge,
header=self.edge_header,
data=[drug_id,
drug_gene_edge_label,
gene_id,
drug_gene_edge_relation,
drug_protein_edge_label,
protein_id,
drug_protein_edge_relation,
self.source_name,
items_dict['ACT_COMMENT']])

return None


def tsv_to_dict(input_file: str, col_for_key: str) -> dict:
    """Read a tab-separated file with a header row into a dict of rows.

    :param input_file: path of the TSV file to read
    :param col_for_key: name of the header column whose value is used as the key
    :return: a defaultdict mapping each value found in `col_for_key` to the row
        (as produced by csv.DictReader) that contained it
    :raises KeyError: if `col_for_key` is not one of the columns in the header
    """
    this_dict: dict = defaultdict(list)
    # newline='' lets the csv module do its own newline handling, so line endings
    # embedded in quoted fields are passed through untouched
    with open(input_file, newline='') as file:
        for row in csv.DictReader(file, delimiter='\t'):
            # when a key occurs in several rows, the last row wins
            this_dict[row[col_for_key]] = row
    return this_dict


def unzip_and_get_tclin_tchem(zip_file: str, output_dir: str) -> List[str]:
    """Unzip `zip_file` into `output_dir` and locate the tclin and tchem TSV files.

    :param zip_file: path of the zip file, passed to unzip_to_tempdir
    :param output_dir: directory the zip file is extracted into and then searched
    :return: two-item list: [path of tclin file, path of tchem file]
    :raises RuntimeError: if `output_dir` does not contain exactly one file name
        matching tclin_*.tsv, or exactly one matching tchem_*.tsv
    """
    unzip_to_tempdir(zip_file, output_dir)

    # tclin: exactly one matching file name is expected
    tclin_matches = [name for name in os.listdir(output_dir)
                     if re.match(r'tclin_.*\.tsv', name)]
    if len(tclin_matches) > 1:
        raise RuntimeError("Found more than one tclin file:\n%s" %
                           "\n".join(tclin_matches))
    if len(tclin_matches) < 1:
        raise RuntimeError("Couldn't find tclin file in zipfile %s" % zip_file)

    # tchem: same check, only performed once the tclin file has been found
    tchem_matches = [name for name in os.listdir(output_dir)
                     if re.match(r'tchem_.*\.tsv', name)]
    if len(tchem_matches) > 1:
        raise RuntimeError("Found more than one tchem file:\n%s" %
                           "\n".join(tchem_matches))
    if len(tchem_matches) < 1:
        raise RuntimeError("Couldn't find tchem file in zipfile %s" % zip_file)

    return [os.path.join(output_dir, tclin_matches[0]),
            os.path.join(output_dir, tchem_matches[0])]


def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
"""Methods processes a line of text from Drug Central.
Expand All @@ -178,3 +140,29 @@ def parse_drug_central_line(this_line: str, header_items: List) -> Dict:

return item_dict


def items_dict_to_protein_data_dict(items_dict: dict) -> dict:
    """Given a parsed line from parse_drug_central_line, split up pipe-separated
    entries for several related proteins and their names and TDL info into separate
    protein entries.

    The ACCESSION, GENE and TDL items are each fetched with get_item_by_priority and
    split on '|'. Exceptions raised by get_item_by_priority are not caught here.

    :param items_dict: dictionary of data from a line, output by
        parse_drug_central_line
    :return: a dict keyed by protein ID (one ACCESSION entry each); every value is a
        list of [protein ID, gene name, TDL value]
    """
    protein_ids = get_item_by_priority(items_dict, ['ACCESSION']).split('|')
    gene_names = get_item_by_priority(items_dict, ['GENE']).split('|')
    tdl_values = get_item_by_priority(items_dict, ['TDL']).split('|')

    if len(protein_ids) != len(gene_names):
        # names cannot be paired with IDs reliably - leave them all blank
        logging.warning("Didn't get the same number of entries for protein_ids and gene_ids")
        gene_names = [''] * len(protein_ids)

    if len(protein_ids) != len(tdl_values):
        if len(tdl_values) == 1:
            # this happens - repeat TDL designation for all protein IDs
            tdl_values = tdl_values * len(protein_ids)
        else:
            # several TDL values that don't line up with the protein IDs: repeating
            # the list would pair proteins with the wrong TDL, so blank them instead
            logging.warning("Didn't get the same number of entries for protein_ids "
                            "and TDL values")
            tdl_values = [''] * len(protein_ids)

    protein_dict: dict = defaultdict(list)
    for protein_id, gene_name, tdl in zip(protein_ids, gene_names, tdl_values):
        protein_dict[protein_id] = [protein_id, gene_name, tdl]
    return protein_dict
12 changes: 2 additions & 10 deletions kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import gzip
import logging
import zipfile
from typing import Any, Dict, List, Union, TextIO
from typing import Any, Dict, List, Union
from tqdm import tqdm # type: ignore


Expand Down Expand Up @@ -63,23 +63,15 @@ def get_header_items(table_data: Any) -> List:
return header_items


def write_node_edge_item(fh: TextIO, header: List, data: List, sep: str = '\t',
sanitize_sep_char=True):
def write_node_edge_item(fh: Any, header: List, data: List, sep: str = '\t'):
"""Write out a single line for a node or an edge in *.tsv
:param fh: file handle of node or edge file
:param header: list of header items
:param data: data for line to write out
:param sep: separator [\t]
:param sanitize_sep_char: replace sep character in data with hex
present in `data`
"""
if len(header) != len(data):
raise Exception('Header and data are not the same length.')

if sanitize_sep_char:
for i in range(len(data)):
data[i] = data[i].replace(sep, hex(ord(sep)))

try:
fh.write(sep.join(data) + "\n")
except IOError:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1S" "Ion channel" "Q02485" "Cacna1s" "CAC1S_RAT" 6 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "NEGATIVE MODULATOR" "Rattus norvegicus"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1D" "Ion channel" "Q01668" "CACNA1D" "CAC1D_HUMAN" 8.39999961853027 "IC50" "Mechanism of Action" "IUPHAR" "=" 1 "IUPHAR" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "BLOCKER" "Tclin" "Homo sapiens"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1C" "Ion channel" "Q13936" "CACNA1C" "CAC1C_HUMAN" "Mechanism of Action" "SCIENTIFIC LITERATURE" 1 "SCIENTIFIC LITERATURE" "http://www.ncbi.nlm.nih.gov/pubmed/17276408" "BLOCKER" "Tclin" "Homo sapiens"
"acamprosate" 38 "Glutamate [NMDA] receptor" "Ion channel" "O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5" "GRIN1|GRIN2A|GRIN2B|GRIN2C|GRIN2D|GRIN3A|GRIN3B" "NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMAN|NMDE3_HUMAN|NMDE4_HUMAN|NMDZ1_HUMAN" "Mechanism of Action; CHEMBL2094124; PROTEIN COMPLEX GROUP" "CHEMBL" 1 "CHEMBL" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293" "ANTAGONIST" "Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin" "Homo sapiens"
Binary file not shown.
Loading

0 comments on commit b9ad54e

Please sign in to comment.