Refactor ingest to deal with multiple proteins in one line

Knowledge-Graph-Hub · Jun 17, 2020 · e28afa8 · e28afa8
1 parent 476913a
commit e28afa8
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 9 deletions.
diff --git a/kg_covid_19/transform_utils/drug_central/drug_central.py b/kg_covid_19/transform_utils/drug_central/drug_central.py
@@ -66,21 +66,21 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
 
                 # get protein ID
                 try:
-                    protein_ids_string = get_item_by_priority(items_dict, ['ACCESSION'])
-                    protein_ids = protein_ids_string.split('|')
+                    protein_dict = items_dict_to_protein_data_dict(items_dict)
 
-                    if items_dict['DRUG_NAME'] == 'acetyldigitoxin':
-                        pass
                 except ItemInDictNotFound:
                     # lines with no ACCESSION entry only contain drug info, no target
                     # info - not ingesting these
                     continue
+                except ValueError:
+                    logging.error("Value error while parsing line")
+                    continue
 
                 # get drug ID
                 drug_id = drug_curie_prefix + get_item_by_priority(items_dict,
                                                                    ['STRUCT_ID'])
 
-                # WRITE NODES
+                # Write drug node
                 write_node_edge_item(fh=node,
                                      header=self.node_header,
                                      data=[drug_id,
@@ -89,15 +89,15 @@ def run(self, data_file: Optional[str] = None, species: str = "Homo sapiens") ->
                                            '',  # TDL (not applicable for drugs)
                                            self.source_name])
 
-                for protein_id in protein_ids:
-                    protein_id = uniprot_curie_prefix + protein_id
+                for key, (uniprot_id, name, tdl) in protein_dict.items():
+                    protein_id = uniprot_curie_prefix + uniprot_id
 
                     write_node_edge_item(fh=node,
                                          header=self.node_header,
                                          data=[protein_id,
-                                               items_dict['GENE'],
+                                               name,
                                                protein_node_type,
-                                               items_dict['TDL'],
+                                               tdl,
                                                self.source_name])
 
                     # WRITE EDGES
@@ -130,3 +130,29 @@ def parse_drug_central_line(this_line: str, header_items: List) -> Dict:
 
     return item_dict
 
+
+def items_dict_to_protein_data_dict(items_dict: dict) -> dict:
+    """Split the pipe-separated protein fields of a parsed line into per-protein entries.
+
+    :param items_dict: dictionary of data from a line, output by parse_drug_central_line
+    :return: plain dict mapping each ACCESSION id to [protein id, gene name, TDL value]
+    :raises: whatever get_item_by_priority raises for a missing field (not caught here)
+    """
+    protein_ids = get_item_by_priority(items_dict, ['ACCESSION']).split('|')
+    gene_names = get_item_by_priority(items_dict, ['GENE']).split('|')
+    tdl_values = get_item_by_priority(items_dict, ['TDL']).split('|')
+
+    if len(protein_ids) != len(gene_names):
+        logging.warning("Didn't get the same number of entries for protein_ids and gene_ids")
+        gene_names = [''] * len(protein_ids)
+
+    if len(tdl_values) == 1:
+        # this happens - a single TDL designation applies to every protein ID on the line
+        tdl_values = tdl_values * len(protein_ids)
+    elif len(tdl_values) != len(protein_ids):
+        # ambiguous pairing: blank the TDLs rather than misalign them by cycling the list
+        logging.warning("Didn't get the same number of entries for protein_ids and TDL values")
+        tdl_values = [''] * len(protein_ids)
+
+    return {pid: [pid, gene, tdl]
+            for pid, gene, tdl in zip(protein_ids, gene_names, tdl_values)}
diff --git a/tests/resources/drug_central/drug.target.interaction_SNIPPET.tsv b/tests/resources/drug_central/drug.target.interaction_SNIPPET.tsv
@@ -14,3 +14,4 @@
 "(S)-nitrendipine"	6	"Voltage-dependent L-type calcium channel subunit alpha-1S"	"Ion channel"	"Q02485"	"Cacna1s"	"CAC1S_RAT"	6		"IC50"		"IUPHAR"	"="			"http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334"		"NEGATIVE MODULATOR"		"Rattus norvegicus"
 "(S)-nitrendipine"	6	"Voltage-dependent L-type calcium channel subunit alpha-1D"	"Ion channel"	"Q01668"	"CACNA1D"	"CAC1D_HUMAN"	8.39999961853027		"IC50"	"Mechanism of Action"	"IUPHAR"	"="	1	"IUPHAR"	"http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334"	"http://www.guidetopharmacology.org/GRAC/LigandDisplayForward?ligandId=2334"	"BLOCKER"	"Tclin"	"Homo sapiens"
 "(S)-nitrendipine"	6	"Voltage-dependent L-type calcium channel subunit alpha-1C"	"Ion channel"	"Q13936"	"CACNA1C"	"CAC1C_HUMAN"				"Mechanism of Action"	"SCIENTIFIC LITERATURE"		1	"SCIENTIFIC LITERATURE"		"http://www.ncbi.nlm.nih.gov/pubmed/17276408"	"BLOCKER"	"Tclin"	"Homo sapiens"
+"acamprosate"	38	"Glutamate [NMDA] receptor"	"Ion channel"	"O15399|O60391|Q05586|Q12879|Q13224|Q14957|Q8TCU5"	"GRIN1|GRIN2A|GRIN2B|GRIN2C|GRIN2D|GRIN3A|GRIN3B"	"NMD3A_HUMAN|NMD3B_HUMAN|NMDE1_HUMAN|NMDE2_HUMAN|NMDE3_HUMAN|NMDE4_HUMAN|NMDZ1_HUMAN"				"Mechanism of Action; CHEMBL2094124; PROTEIN COMPLEX GROUP"	"CHEMBL"		1	"CHEMBL"	"https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293"	"https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL1201293"	"ANTAGONIST"	"Tclin|Tclin|Tclin|Tclin|Tclin|Tclin|Tclin"	"Homo sapiens"