-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #76 from Knowledge-Graph-Hub/sars_cov2_gene_annotations
SARS-CoV-2 gene annotations
- Loading branch information
Showing
10 changed files
with
355 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 5 additions & 0 deletions
5
kg_covid_19/transform_utils/sars_cov_2_gene_annot/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .sars_cov_2_gene_annot import SARSCoV2GeneAnnot | ||
|
||
# Public API of this package: only the transform class is exported.
__all__ = ["SARSCoV2GeneAnnot"]
210 changes: 210 additions & 0 deletions
210
kg_covid_19/transform_utils/sars_cov_2_gene_annot/sars_cov_2_gene_annot.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
import logging | ||
import os | ||
from typing import Generator, TextIO, List | ||
|
||
from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound | ||
|
||
from kg_covid_19.transform_utils.transform import Transform | ||
from kg_covid_19.utils import write_node_edge_item | ||
|
||
"""Parse the GPA and GPI files for SARS-CoV-2 gene annotations, including GO annotations | ||
and more. Make a node for each gene using GPI lines, and make an edge for | ||
each gene -> annotation described in GPA | ||
""" | ||
|
||
|
||
class SARSCoV2GeneAnnot(Transform):
    """Turn the UniProt SARS-CoV-2 GPI/GPA files into node and edge TSVs.

    One node row is written for every GPI record and one edge row for every
    GPA record (gene product -> annotated term).
    """

    def __init__(self, input_dir: str = None, output_dir: str = None):
        """Configure the source name, output headers and label mappings.

        :param input_dir: forwarded unchanged to ``Transform.__init__``
        :param output_dir: forwarded unchanged to ``Transform.__init__``
        """
        source_name = "sars_cov_2_gene_annot"
        super().__init__(source_name, input_dir, output_dir)

        self.node_header = ['id', 'name', 'category', 'synonym', 'taxon']
        self.edge_header = ['subject', 'edge_label', 'object', 'relation',
                            'DB_References', 'ECO_code', 'With',
                            'Interacting_taxon_ID', 'Date', 'Assigned_by',
                            'Annotation_Extension', 'Annotation_Properties']

        self.protein_node_type = "biolink:Protein"
        # NOTE(review): this prefix is not used anywhere in this class; the
        # taxon column is written exactly as it appears in the GPI file.
        self.ncbi_taxon_prefix = "NCBITaxon:"

        # translate edge labels to RO term, for the 'relation' column in edge
        self.edge_label_prefix = "biolink:"  # prepend to edge label
        self.edge_label_to_RO_term: dict = {
            'enables': 'RO:0002327',
            'involved_in': 'RO:0002331',
            'part_of': 'BFO:0000050'
        }

    def run(self):
        """Parse the GPI and GPA inputs and write the node and edge files.

        Reads ``uniprot_sars-cov-2.gpi`` and ``uniprot_sars-cov-2.gpa`` from
        ``self.input_base_dir`` and writes to ``self.output_node_file`` and
        ``self.output_edge_file`` (these attributes are presumably set by the
        ``Transform`` base class -- confirm there).
        """
        # file housekeeping
        os.makedirs(self.output_dir, exist_ok=True)

        gpi_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpi")
        gpa_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpa")

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge:

            # write headers
            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            # GPI records describe the gene products -> one node each
            with open(gpi_file, 'r') as gpi_fh:
                for rec in _gpi12iterator(gpi_fh):
                    node_data = self.gpi_to_gene_node_data(rec)
                    write_node_edge_item(node, self.node_header, node_data)

            # GPA records describe the annotations -> one edge each
            with open(gpa_file, 'r') as gpa_fh:
                for rec in _gpa11iterator(gpa_fh):
                    edge_data = self.gpa_to_edge_data(rec)
                    write_node_edge_item(edge, self.edge_header, edge_data)

    def gpa_to_edge_data(self, rec: dict) -> list:
        """Given a parsed GPA entry, return an edge row with its annotations.

        :param rec: record from the GPA iterator
        :return: list of edge items, one for each column in self.edge_header
        """
        subj: str = self._rec_to_id(rec)
        # Qualifier is a list; only its first entry is used as the label
        edge_label: str = get_item_by_priority(rec, ['Qualifier'])[0]
        obj: str = get_item_by_priority(rec, ['GO_ID'])
        # labels without a known relation term get an empty relation column
        relation: str = self.edge_label_to_RO_term.get(edge_label, '')

        edge_data = [subj, self.edge_label_prefix + edge_label, obj, relation]

        # Remaining columns, in edge_header order. Each entry lists the record
        # keys to try. The GPA iterator names the extension field
        # 'Annotation Extension' (with a space), so that spelling is accepted
        # as a fallback; otherwise the column would always be empty.
        remaining_keys = [
            ['DB:Reference'],
            ['ECO_Evidence_code'],
            ['With'],
            ['Interacting_taxon_ID'],
            ['Date'],
            ['Assigned_by'],
            ['Annotation_Extension', 'Annotation Extension'],
            ['Annotation_Properties'],
        ]
        for keys in remaining_keys:
            try:
                item = get_item_by_priority(rec, keys)
                if isinstance(item, list):
                    # multi-value field: keep only the first value
                    item = item[0]
            except (ItemInDictNotFound, IndexError):
                item = ''
            edge_data.append(item)
        return edge_data

    def _rec_to_id(self, rec: dict) -> str:
        """Build a ``DB:DB_Object_ID`` identifier for a GPI or GPA record.

        :param rec: record from either iterator
        :return: the identifier, or '' (after logging an error) when either
            part cannot be found
        """
        try:
            this_id: str = get_item_by_priority(rec, ['DB']) + ":" + \
                get_item_by_priority(rec, ['DB_Object_ID'])
        except ItemInDictNotFound:
            # log the whole record (keys and values) so the bad row can be found
            logging.error("Can't make ID for record: %s", rec)
            this_id = ''
        return this_id

    @staticmethod
    def _first_entry(value) -> str:
        """Return the first '|'-separated entry of a multi-value field.

        Accepts either an already-split list or the raw string, so the result
        does not depend on whether the parser split the column.
        """
        if isinstance(value, str):
            value = value.split("|")
        return value[0] if value else ''

    def gpi_to_gene_node_data(self, rec: dict) -> list:
        """Given a parsed GPI entry, return a node row that can be passed to
        write_node_edge_item().

        :param rec: record from the GPI iterator
        :return: list of node items, one for each thing in self.node_header
        """
        # ['id', 'name', 'category', 'synonym', 'taxon']
        node_id: str = self._rec_to_id(rec)

        try:
            name = self._first_entry(
                get_item_by_priority(rec, ['DB_Object_Name']))
        except (IndexError, ItemInDictNotFound):
            name = ''

        category = self.protein_node_type

        try:
            # only the first synonym is kept; indexing a raw string directly
            # would return just its first character
            synonym = self._first_entry(
                get_item_by_priority(rec, ['DB_Object_Synonym']))
        except (IndexError, ItemInDictNotFound):
            synonym = ''

        taxon = get_item_by_priority(rec, ['Taxon'])
        return [node_id, name, category, synonym, taxon]
|
||
|
||
def _gpi12iterator(handle: TextIO) -> Generator: | ||
# Partly cribbed from Biopython GPI 1.1 parser | ||
# There is no GPI 1.2 parser yet, so I made this | ||
"""Read GPI 1.2 format files (PRIVATE). | ||
This iterator is used to read a gp_information.goa_uniprot | ||
file which is in the GPI 1.2 format. | ||
""" | ||
logging.getLogger().setLevel(logging.WARNING) | ||
# GPI version 1.2 | ||
GPI11FIELDS = [ | ||
"DB", | ||
"DB_Object_ID", | ||
"DB_Object_Symbol", | ||
"DB_Object_Name", | ||
"DB_Object_Synonym", | ||
"DB_Object_Type", | ||
"Taxon", | ||
"Parent_Object_ID", | ||
"DB_Xref", | ||
"Properties", | ||
] | ||
for inline in handle: | ||
if inline[0] == "!": | ||
continue | ||
inrec: List[str] = inline.rstrip("\n").split("\t") | ||
if len(inrec) == 1: | ||
continue | ||
# DB_Object_Name | ||
inrec[2] = inrec[2].split("|") # type: ignore | ||
# DB_Object_Synonym(s) | ||
inrec[3] = inrec[3].split("|") # type: ignore | ||
try: | ||
# DB_Xref(s) | ||
inrec[7] = inrec[7].split("|") # type: ignore | ||
except IndexError: | ||
logging.debug("No index for DB_Xref for this record") | ||
try: | ||
# Properties | ||
inrec[8] = inrec[8].split("|") # type: ignore | ||
except IndexError: | ||
logging.debug("No index for Properties for this record") | ||
|
||
yield dict(zip(GPI11FIELDS, inrec)) | ||
|
||
|
||
# from biopython, to avoid dependency | ||
def _gpa11iterator(handle): | ||
"""Read GPA 1.1 format files (PRIVATE). | ||
This iterator is used to read a gp_association.goa_uniprot | ||
file which is in the GPA 1.1 format. Do not call directly. Rather | ||
use the gpa_iterator function | ||
""" | ||
# GPA version 1.1 | ||
GPA11FIELDS = [ | ||
"DB", | ||
"DB_Object_ID", | ||
"Qualifier", | ||
"GO_ID", | ||
"DB:Reference", | ||
"ECO_Evidence_code", | ||
"With", | ||
"Interacting_taxon_ID", | ||
"Date", | ||
"Assigned_by", | ||
"Annotation Extension", | ||
"Annotation_Properties", | ||
] | ||
for inline in handle: | ||
if inline[0] == "!": | ||
continue | ||
inrec = inline.rstrip("\n").split("\t") | ||
if len(inrec) == 1: | ||
continue | ||
inrec[2] = inrec[2].split("|") # Qualifier | ||
inrec[4] = inrec[4].split("|") # DB:Reference(s) | ||
inrec[6] = inrec[6].split("|") # With | ||
inrec[10] = inrec[10].split("|") # Annotation extension | ||
yield dict(zip(GPA11FIELDS, inrec)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
!gpa-version: 1.1 | ||
!Columns: | ||
! | ||
! name required? cardinality GAF column # | ||
! DB required 1 1 | ||
! DB_Object_ID required 1 2 / 17 | ||
! Qualifier required 1 or greater 4 | ||
! GO ID required 1 5 | ||
! DB:Reference(s) required 1 or greater 6 | ||
! ECO evidence code required 1 7 + 6 (GO evidence code + reference) | ||
! With optional 0 or greater 8 | ||
! Interacting taxon ID optional 0 or 1 13 | ||
! Date required 1 14 | ||
! Assigned_by required 1 15 | ||
! Annotation Extension optional 0 or greater 16 | ||
! Annotation Properties optional 0 or 1 n/a | ||
! | ||
!Generated: 2020-03-26 11:33 | ||
!GO-version: http://purl.obolibrary.org/obo/go/releases/2020-03-24/extensions/go-plus.owl | ||
! | ||
UniProtKB P0DTC1 enables GO:0003723 GO_REF:0000043 ECO:0000322 UniProtKB-KW:KW-0694 20200321 UniProt go_evidence=IEA | ||
UniProtKB P0DTC1 involved_in GO:0090305 GO_REF:0000108 ECO:0000366 GO:0004519 20200322 GOC go_evidence=IEA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
!gpi-version: 1.2 | ||
! | ||
!This file contains additional information for proteins in the UniProt KnowledgeBase (UniProtKB). | ||
!Protein accessions are represented in this file even if there is no associated GO annotation. | ||
! | ||
!Columns: | ||
! | ||
! name required? cardinality GAF column # Example content | ||
! DB required 1 1 UniProtKB | ||
! DB_Object_ID required 1 2/17 Q4VCS5-1 | ||
! DB_Object_Symbol required 1 3 AMOT | ||
! DB_Object_Name optional 0 or greater 10 Angiomotin | ||
! DB_Object_Synonym(s) optional 0 or greater 11 AMOT|KIAA1071 | ||
! DB_Object_Type required 1 12 protein | ||
! Taxon required 1 13 taxon:9606 | ||
! Parent_Object_ID optional 0 or 1 - UniProtKB:Q4VCS5 | ||
! DB_Xref(s) optional 0 or greater - WB:WBGene00000035 | ||
! Properties optional 0 or greater - db_subset=Swiss-Prot|target_set=KRUK,BHFL | ||
! | ||
!Generated: 2020-03-24 15:52 | ||
! | ||
UniProtKB P0DTD2 P0DTD2 Protein 9b protein taxon:2697049 | ||
UniProtKB A0A663DJA2 ORF10 ORF10 protein ORF10 protein taxon:2697049 |
Oops, something went wrong.