Skip to content

Commit

Permalink
Refactor ingest to deal with multiple proteins in one line
Browse files Browse the repository at this point in the history
  • Loading branch information
justaddcoffee committed Jun 17, 2020
1 parent 476913a commit e28afa8
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 9 deletions.
44 changes: 35 additions & 9 deletions kg_covid_19/transform_utils/drug_central/drug_central.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,21 +66,21 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->

# get protein ID
try:
protein_ids_string = get_item_by_priority(items_dict, ['ACCESSION'])
protein_ids = protein_ids_string.split('|')
protein_dict = items_dict_to_protein_data_dict(items_dict)

if items_dict['DRUG_NAME'] == 'acetyldigitoxin':
pass
except ItemInDictNotFound:
# lines with no ACCESSION entry only contain drug info, no target
# info - not ingesting these
continue
except ValueError:
logging.error("Value error while parsing line")
continue

# get drug ID
drug_id = drug_curie_prefix + get_item_by_priority(items_dict,
['STRUCT_ID'])

# WRITE NODES
# Write drug node
write_node_edge_item(fh=node,
header=self.node_header,
data=[drug_id,
Expand All @@ -89,15 +89,15 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
'', # TDL (not applicable for drugs)
self.source_name])

for protein_id in protein_ids:
protein_id = uniprot_curie_prefix + protein_id
for key, (uniprot_id, name, tdl) in protein_dict.items():
protein_id = uniprot_curie_prefix + uniprot_id

write_node_edge_item(fh=node,
header=self.node_header,
data=[protein_id,
items_dict['GENE'],
name,
protein_node_type,
items_dict['TDL'],
tdl,
self.source_name])

# WRITE EDGES
Expand Down Expand Up @@ -130,3 +130,29 @@ def parse_drug_central_line(this_line: str, header_items: List) -> Dict:

return item_dict


def items_dict_to_protein_data_dict(items_dict: dict) -> dict:
    """Split a parsed Drug Central line into one entry per target protein.

    The ACCESSION, GENE and TDL fields of a line may each hold several
    pipe-separated values when the drug target is made up of more than one
    protein. This pairs the values up positionally so every protein gets its
    own gene name and TDL designation.

    Exceptions raised by get_item_by_priority (e.g. when a field cannot be
    found) are not caught here and propagate to the caller.

    :param items_dict: dictionary of data from a line, output by parse_drug_central_line
    :return: a dict keyed by protein accession; each value is the list
        [accession, gene name, TDL] for that protein
    """
    protein_ids = get_item_by_priority(items_dict, ['ACCESSION']).split('|')
    gene_names = get_item_by_priority(items_dict, ['GENE']).split('|')
    tdl_values = get_item_by_priority(items_dict, ['TDL']).split('|')

    if len(gene_names) != len(protein_ids):
        # Names cannot be matched to proteins reliably - leave them blank
        # rather than attach a gene name to the wrong accession.
        logging.warning("Didn't get the same number of entries for protein_ids and gene_ids")
        gene_names = [''] * len(protein_ids)

    if len(tdl_values) != len(protein_ids):
        if len(tdl_values) == 1:
            # this happens - repeat the single TDL designation for all protein IDs
            tdl_values = tdl_values * len(protein_ids)
        else:
            # Several TDL values that do not line up with the accessions:
            # repeating the list would pair TDLs with the wrong proteins,
            # so blank them out the same way mismatched gene names are.
            logging.warning("Didn't get the same number of entries for protein_ids and TDL values")
            tdl_values = [''] * len(protein_ids)

    # A plain dict (not a defaultdict) so that looking up an unknown accession
    # in the result raises KeyError instead of silently inserting an entry.
    return {
        protein_id: [protein_id, gene_name, tdl]
        for protein_id, gene_name, tdl in zip(protein_ids, gene_names, tdl_values)
    }
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1S" "Ion channel" "Q02485" "Cacna1s" "CAC1S_RAT" 6 "IC50" "IUPHAR" "=" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "NEGATIVE MODULATOR" "Rattus norvegicus"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1D" "Ion channel" "Q01668" "CACNA1D" "CAC1D_HUMAN" 8.39999961853027 "IC50" "Mechanism of Action" "IUPHAR" "=" 1 "IUPHAR" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334" "BLOCKER" "Tclin" "Homo sapiens"
"(S)-nitrendipine" 6 "Voltage-dependent L-type calcium channel subunit alpha-1C" "Ion channel" "Q13936" "CACNA1C" "CAC1C_HUMAN" "Mechanism of Action" "SCIENTIFIC LITERATURE" 1 "SCIENTIFIC LITERATURE" "http://www.ncbi.nlm.nih.gov/pubmed/17276408" "BLOCKER" "Tclin" "Homo sapiens"
"acamprosate" 38 "Glutamate [NMDA] receptor" "Ion channel" "O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5" "GRIN1|GRIN2A|GRIN2B|GRIN2C|GRIN2D|GRIN3A|GRIN3B" "NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMAN|NMDE3_HUMAN|NMDE4_HUMAN|NMDZ1_HUMAN" "Mechanism of Action; CHEMBL2094124; PROTEIN COMPLEX GROUP" "CHEMBL" 1 "CHEMBL" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293" "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293" "ANTAGONIST" "Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin" "Homo sapiens"

0 comments on commit e28afa8

Please sign in to comment.