Skip to content

Commit

Permalink
Merge 5577772 into 985eab2
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed May 11, 2020
2 parents 985eab2 + 5577772 commit f15d6bf
Show file tree
Hide file tree
Showing 4 changed files with 1,114 additions and 32 deletions.
9 changes: 3 additions & 6 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,18 @@
url: https://s3.eu-west-2.amazonaws.com/scibite.covid19data/CORD19/annotated-cord19/CORD-19_1_3.zip
local_name: CORD-19_1_3.zip

# SciBite CORD-19 entity co-occurrences v1.0
# -
# url: https://github.com/SciBiteLabs/CORD19/raw/master/sentence-co-occurrence-CORD-19/1.0/cv19_scc.zip
# local_name: cv19_scc.zip

#
# PharmGKB - drug -> drug target info, and gene id mappings
#
-
url: https://api.pharmgkb.org/v1/download/file/data/relationships.zip
local_name: relationships.zip

-
url: https://s3.pgkb.org/data/genes.zip
local_name: pharmgkb_genes.zip
-
url: https://s3.pgkb.org/data/drugs.zip
local_name: pharmgkb_drugs.zip

# SARS-CoV-2 gene annotations from Uniprot
#
Expand Down
116 changes: 93 additions & 23 deletions kg_covid_19/transform_utils/pharmgkb/pharmgkb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@
# -*- coding: utf-8 -*-
import logging
import os
import re
import tempfile
from collections import defaultdict
from io import TextIOBase
from typing import Optional, TextIO

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils.transform_utils import data_to_dict, parse_header, \
unzip_to_tempdir, write_node_edge_item, get_item_by_priority
unzip_to_tempdir, write_node_edge_item, get_item_by_priority, ItemInDictNotFound

"""Ingest PharmGKB drug -> drug target info
Expand Down Expand Up @@ -43,11 +44,13 @@ def run(self, data_file: Optional[str] = None):
relationship_file_name = "relationships.tsv"
gene_mapping_zip_file = os.path.join(self.input_base_dir, "pharmgkb_genes.zip")
gene_mapping_file_name = "genes.tsv"
drug_mapping_zip_file = os.path.join(self.input_base_dir, "pharmgkb_drugs.zip")
drug_mapping_file_name = "drugs.tsv"

#
# file stuff
#
# get relationship file (what we are ingest here)
# get relationship file (what we are ingesting here)
# TODO: unlink relationship_tempdir and gene_id_tempdir

relationship_tempdir = tempfile.mkdtemp()
Expand All @@ -64,8 +67,17 @@ def run(self, data_file: Optional[str] = None):
unzip_to_tempdir(gene_mapping_zip_file, gene_id_tempdir)
if not os.path.exists(gene_mapping_file_path):
raise PharmGKBFileError("Can't find gene map file needed for ingest")
self.gene_id_map = self.make_id_mapping_file(gene_mapping_file_path)

self.gene_id_map = self.make_gene_id_mapping_file(gene_mapping_file_path)
# get mapping file for drug ids
drug_id_tempdir = tempfile.mkdtemp()
drug_mapping_file_path = os.path.join(drug_id_tempdir,
drug_mapping_file_name)
unzip_to_tempdir(drug_mapping_zip_file, drug_id_tempdir)

if not os.path.exists(drug_mapping_file_path):
raise PharmGKBFileError("Can't find drug map file needed for ingest")
self.drug_id_map = self.make_id_mapping_file(drug_mapping_file_path)

#
# read in and transform relationship.tsv
Expand Down Expand Up @@ -115,6 +127,61 @@ def run(self, data_file: Optional[str] = None):
self.make_pharmgkb_edge(fh=edge,
line_data=line_data)


def make_preferred_drug_id(self, pharmgkb_id: str,
                           drug_id_map: dict,
                           preferred_ids: Optional[dict] = None,
                           pharmgkb_prefix: str = 'PHARMGKB') \
        -> str:
    """Convert a PharmGKB drug ID into a preferred cross-referenced CURIE.

    The default order of preference is:
        CHEBI > CHEMBL > DRUGBANK > PUBCHEM
    If the drug is unknown, has no (or empty) 'Cross-references', or none of
    the preferred sources is present, the PharmGKB ID itself is returned,
    prefixed with `pharmgkb_prefix`.

    :param pharmgkb_id: PharmGKB accession ID of the drug
    :param drug_id_map: map of PharmGKB IDs to a record (dict) that holds the
            raw 'Cross-references' string for that drug
    :param preferred_ids: dict of 'their source string' -> 'canonical CURIE
            prefix'; iteration (insertion) order is the order of preference.
            When None, the default mapping listed above is used.
    :param pharmgkb_prefix: thing to prepend to the PharmGKB ID ('PHARMGKB')
    :return: preferred cross-referenced ID, or the prefixed PharmGKB ID
    """
    if preferred_ids is None:
        # Built per call rather than as a default argument, so the mapping is
        # never shared (and accidentally mutated) between calls.
        # NOTE(review): cross-references are split on their LAST colon below,
        # so the key 'PubChem Compound:' only matches an entry of the form
        # 'PubChem Compound::<id>' - confirm against drugs.tsv.
        preferred_ids = {'ChEBI:CHEBI': 'CHEBI',
                         'CHEMBL': 'CHEMBL',
                         'DrugBank': 'DRUGBANK',
                         'PubChem Compound:': 'PUBCHEM'}

    fallback_id = pharmgkb_prefix + ":" + pharmgkb_id

    if pharmgkb_id not in drug_id_map:
        return fallback_id

    drug_record = drug_id_map[pharmgkb_id]
    if 'Cross-references' not in drug_record:
        logging.warning("Can't find 'Cross-references' item in drug_id_map! "
                        "Was it renamed?")
        return fallback_id

    map_string = drug_record['Cross-references']
    if not map_string:
        # 'Cross-references' is empty
        return fallback_id

    # Turn a string like
    #     "PREFIX1:1234","PREFIX2:3456"
    # into a dict {'PREFIX1': '1234', 'PREFIX2': '3456'} that can be handed
    # to get_item_by_priority.
    cross_refs: dict = {}
    for raw_ref in map_string.split(","):
        # strip surrounding whitespace first, otherwise the anchored
        # quote-stripping regex would not match
        ref = re.sub(r'^"|"$', '', raw_ref.strip())
        # split on the last colon: the source name may itself contain a colon
        # (e.g. 'ChEBI:CHEBI:1234' -> 'ChEBI:CHEBI', '1234')
        source_name, separator, local_id = ref.rpartition(':')
        if separator:  # skip malformed entries that contain no colon at all
            cross_refs[source_name] = local_id

    for source_name, curie_prefix in preferred_ids.items():
        try:
            found_id = get_item_by_priority(cross_refs, [source_name])
        except ItemInDictNotFound:
            continue
        return curie_prefix + ":" + found_id

    return fallback_id


def make_pharmgkb_edge(self,
fh: TextIO,
line_data: dict
Expand All @@ -135,14 +202,16 @@ def make_pharmgkb_edge(self,
gene_id = self.get_uniprot_id(this_id=gene_id)

evidence = line_data['Evidence']

preferred_drug_id = self.make_preferred_drug_id(drug_id, self.drug_id_map)

data = [preferred_drug_id, self.drug_gene_edge_label,
gene_id, self.drug_gene_edge_relation,
self.source_name, evidence]

write_node_edge_item(fh=fh,
header=self.edge_header,
data=[drug_id,
self.drug_gene_edge_label,
gene_id,
self.drug_gene_edge_relation,
self.source_name,
evidence])
data=data)

def make_pharmgkb_gene_node(self,
fh: TextIO,
Expand All @@ -162,12 +231,13 @@ def make_pharmgkb_gene_node(self,
write_node_edge_item(fh=fh, header=self.node_header, data=data)

def get_uniprot_id(self,
                   this_id: str,
                   pharmgkb_prefix: str = 'PHARMGKB') -> str:
    """Map a PharmGKB gene ID to a UniProt CURIE, if a mapping is known.

    Looks `this_id` up in self.gene_id_map and, when a UniProt entry is
    present among its parsed IDs, returns it prefixed with
    self.uniprot_curie_prefix.

    :param this_id: PharmGKB gene ID to convert
    :param pharmgkb_prefix: prefix used for the fallback ID when no UniProt
            mapping exists ('PHARMGKB')
    :return: UniProt CURIE, or `pharmgkb_prefix` + ':' + `this_id` when any
            level of the lookup is missing
    """
    try:
        parsed_ids = self.gene_id_map[this_id][self.key_parsed_ids]
        gene_id = self.uniprot_curie_prefix + parsed_ids[self.uniprot_id_key]
    except KeyError:
        # no UniProt mapping: fall back to a prefixed PharmGKB ID rather than
        # emitting a bare, prefix-less identifier
        gene_id = pharmgkb_prefix + ":" + this_id
    return gene_id

def make_pharmgkb_chemical_node(self,
Expand All @@ -183,7 +253,8 @@ def make_pharmgkb_chemical_node(self,
:param biolink_type: biolink type for Chemical
:return: None
"""
data = [chem_id, name, biolink_type]
preferred_drug_id = self.make_preferred_drug_id(chem_id, self.drug_id_map)
data = [preferred_drug_id, name, biolink_type]
write_node_edge_item(fh=fh, header=self.node_header, data=data)

def parse_pharmgkb_line(self, this_line: str, header_items) -> dict:
Expand All @@ -196,17 +267,16 @@ def parse_pharmgkb_line(self, this_line: str, header_items) -> dict:
items = this_line.strip().split('\t')
return data_to_dict(header_items, items)

def make_gene_id_mapping_file(self,
map_file: str,
sep: str = '\t',
pharmgkb_id_col: str = 'PharmGKB Accession Id',
id_key: str = 'Cross-references',
id_sep: str = ',',
id_key_val_sep: str = ':'
) -> dict:
"""Fxn to parse gene mappings for PharmGKB ids
What I need is PharmGKB -> uniprot ids, but this parses everything
They don't make this easy...
def make_id_mapping_file(self,
map_file: str,
sep: str = '\t',
pharmgkb_id_col: str = 'PharmGKB Accession Id',
id_key: str = 'Cross-references',
id_sep: str = ',',
id_key_val_sep: str = ':'
) -> dict:
"""Fxn to parse gene ID mappings or drug ID mapping for PharmGKB ids
This is to parse both genes.tsv and drugs.tsv files
:param map_file: genes.tsv file, containing mappings
:param pharmgkb_id_col: column containing pharmgkb, to be used as key for map
Expand Down
Loading

0 comments on commit f15d6bf

Please sign in to comment.