Skip to content

Commit

Permalink
Merge pull request #76 from Knowledge-Graph-Hub/sars_cov2_gene_annota…
Browse files Browse the repository at this point in the history
…tions

SARS-CoV-2 gene annotations
  • Loading branch information
deepakunni3 authored Apr 10, 2020
2 parents 835a442 + a16db61 commit 3c90a08
Show file tree
Hide file tree
Showing 10 changed files with 355 additions and 7 deletions.
15 changes: 15 additions & 0 deletions download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,18 @@
-
url: https://s3.pgkb.org/data/genes.zip
local_name: pharmgkb_genes.zip

#
# SARS-CoV-2 gene annotations from UniProt
#

-
# Gene Product Association Data for SARS-CoV-2 genes in GPAD format
# http://geneontology.org/docs/gene-product-association-data-gpad-format/
url: ftp://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_sars-cov-2.gpa
local_name: uniprot_sars-cov-2.gpa
-
# Gene Product information for SARS-CoV-2 genes in GPI format
# http://geneontology.org/docs/gene-product-information-gpi-format/
url: ftp://ftp.ebi.ac.uk/pub/contrib/goa/uniprot_sars-cov-2.gpi
local_name: uniprot_sars-cov-2.gpi
5 changes: 4 additions & 1 deletion kg_covid_19/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from kg_covid_19.transform_utils.drug_central.drug_central import DrugCentralTransform
from kg_covid_19.transform_utils.hpo.hpo import HpoTransform
from kg_covid_19.transform_utils.\
sars_cov_2_gene_annot.sars_cov_2_gene_annot import SARSCoV2GeneAnnot
from kg_covid_19.transform_utils.scibite_cord import ScibiteCordTransform
from kg_covid_19.transform_utils.pharmgkb import PharmGKB
from kg_covid_19.transform_utils.string_ppi import StringTransform
Expand All @@ -19,7 +21,8 @@
'TTDTransform': TTDTransform,
'StringTransform': StringTransform,
'ScibiteCordTransform': ScibiteCordTransform,
'PharmGKB': PharmGKB
'PharmGKB': PharmGKB,
'SARSCoV2GeneAnnot': SARSCoV2GeneAnnot
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,17 @@
Input: any file in data/raw/ (that was downloaded by placing a URL in incoming.txt/yaml and running `run.py download`)
Output: transformed data in data/raw/[source name]:
Either TSV, output these two files:
Output these two files:
- nodes.tsv
- edges.tsv
Or JSON, all in one file: nodes_edges.json
"""


class YourTransform(Transform):

def __init__(self):
super().__init__(source_name="some_unique_name")
def __init__(self, input_dir: str = None, output_dir: str = None):
source_name = "some_unique_name"
super().__init__(source_name, input_dir, output_dir)

def run(self):
# replace with downloaded data for this source
Expand Down
5 changes: 5 additions & 0 deletions kg_covid_19/transform_utils/sars_cov_2_gene_annot/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .sars_cov_2_gene_annot import SARSCoV2GeneAnnot

__all__ = [
"SARSCoV2GeneAnnot"
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
from typing import Generator, TextIO, List

from kg_covid_19.utils.transform_utils import get_item_by_priority, ItemInDictNotFound

from kg_covid_19.transform_utils.transform import Transform
from kg_covid_19.utils import write_node_edge_item

"""Parse the GPA and GPI files for SARS-CoV-2 gene annotations, including GO annotations
and more. Make a node for each gene using GPI lines, and make an edge for
each gene -> annotation described in GPA
"""


class SARSCoV2GeneAnnot(Transform):
    """Turn the SARS-CoV-2 GPI and GPA files into node and edge TSV files.

    One node row is written for every record of the GPI file and one edge
    row for every record of the GPA file.
    """

    def __init__(self, input_dir: str = None, output_dir: str = None):
        """Set the source name and the column layout of the output files.

        :param input_dir: forwarded unchanged to ``Transform.__init__``
        :param output_dir: forwarded unchanged to ``Transform.__init__``
        """
        source_name = "sars_cov_2_gene_annot"
        super().__init__(source_name, input_dir, output_dir)

        self.node_header = ['id', 'name', 'category', 'synonym', 'taxon']
        self.edge_header = ['subject', 'edge_label', 'object', 'relation',
                            'DB_References', 'ECO_code', 'With', 'Interacting_taxon_ID',
                            'Date', 'Assigned_by', 'Annotation_Extension',
                            'Annotation_Properties']

        self.protein_node_type = "biolink:Protein"
        self.ncbi_taxon_prefix = "NCBITaxon:"

        # translate edge labels to RO term, for the 'relation' column in edge
        self.edge_label_prefix = "biolink:"  # prepend to edge label
        self.edge_label_to_RO_term: dict = {
            'enables': 'RO:0002327',
            'involved_in': 'RO:0002331',
            'part_of': 'BFO:0000050'
        }

    def run(self):
        """Parse the GPI and GPA input files and write the node and edge files.

        The output directory is created if needed. Both output files get a
        header line first; then every GPI record becomes a node row and
        every GPA record becomes an edge row.
        """
        # file housekeeping
        os.makedirs(self.output_dir, exist_ok=True)

        # input_base_dir / output_*_file are presumably set by Transform.__init__
        gpi_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpi")
        gpa_file = os.path.join(self.input_base_dir, "uniprot_sars-cov-2.gpa")

        with open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge:

            # write headers
            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            with open(gpi_file, 'r') as gpi_fh:
                for rec in _gpi12iterator(gpi_fh):
                    node_data = self.gpi_to_gene_node_data(rec)
                    write_node_edge_item(node, self.node_header, node_data)

            with open(gpa_file, 'r') as gpa_fh:
                for rec in _gpa11iterator(gpa_fh):
                    edge_data = self.gpa_to_edge_data(rec)
                    write_node_edge_item(edge, self.edge_header, edge_data)

    def gpa_to_edge_data(self, rec: dict) -> list:
        """Given a parsed GPA entry, return an edge row with its annotations.

        :param rec: record from the GPA iterator
        :return: list of edge items, one for each column in self.edge_header
        """
        subj: str = self._rec_to_id(rec)
        edge_label: str = get_item_by_priority(rec, ['Qualifier'])[0]
        obj: str = get_item_by_priority(rec, ['GO_ID'])
        # labels without a known RO/BFO term get an empty relation
        relation: str = self.edge_label_to_RO_term.get(edge_label, '')

        edge_data = [subj, self.edge_label_prefix + edge_label, obj, relation]

        # Remaining columns, in edge_header order. Each entry lists the record
        # keys to try for that column. The GPA parser in this file names the
        # extension column "Annotation Extension" (with a space), so both
        # spellings are offered.
        # NOTE(review): relies on get_item_by_priority falling through to the
        # next key when an earlier one is absent - confirm against its source.
        annotation_keys = [
            ['DB:Reference'],
            ['ECO_Evidence_code'],
            ['With'],
            ['Interacting_taxon_ID'],
            ['Date'],
            ['Assigned_by'],
            ['Annotation_Extension', 'Annotation Extension'],
            ['Annotation_Properties'],
        ]
        for keys in annotation_keys:
            try:
                item = get_item_by_priority(rec, keys)
                if isinstance(item, list):
                    # multi-valued column: only the first value is kept
                    item = item[0]
            except (ItemInDictNotFound, IndexError):
                item = ''
            edge_data.append(item)
        return edge_data

    def _rec_to_id(self, rec: dict) -> str:
        """Build a '<DB>:<DB_Object_ID>' identifier for a GPI or GPA record.

        :param rec: record from either iterator
        :return: the identifier, or '' (after logging an error) when a part
            cannot be found
        """
        try:
            this_id: str = get_item_by_priority(rec, ['DB']) + ":" + \
                get_item_by_priority(rec, ['DB_Object_ID'])
        except ItemInDictNotFound:
            # log the record itself; joining a dict would only show its keys
            logging.error("Can't make ID for record: %s", rec)
            this_id = ''
        return this_id

    @staticmethod
    def _first_value(value) -> str:
        """Return the first entry of a possibly multi-valued field, or ''.

        Accepts either an already split list or a raw '|'-delimited string,
        so the result does not depend on whether the parser split the column.
        """
        if value is None:
            return ''
        if isinstance(value, str):
            return value.split("|")[0]
        return value[0] if len(value) > 0 else ''

    def gpi_to_gene_node_data(self, rec: dict) -> list:
        """Given a parsed GPI entry, return a node row that can be passed to
        write_node_edge_item().

        :param rec: record from the GPI iterator
        :return: list of node items, one for each thing in self.node_header
        """
        # ['id', 'name', 'category', 'synonym', 'taxon']
        node_id: str = self._rec_to_id(rec)

        try:
            name = self._first_value(
                get_item_by_priority(rec, ['DB_Object_Name']))
        except (IndexError, ItemInDictNotFound):
            name = ''

        category = self.protein_node_type

        try:
            synonym = self._first_value(
                get_item_by_priority(rec, ['DB_Object_Synonym']))
        except (IndexError, ItemInDictNotFound):
            synonym = ''

        # NOTE(review): self.ncbi_taxon_prefix is defined in __init__ but is
        # not applied here; the taxon value is passed through exactly as it
        # appears in the record - confirm whether a prefix rewrite is intended.
        taxon = get_item_by_priority(rec, ['Taxon'])
        return [node_id, name, category, synonym, taxon]


def _gpi12iterator(handle: TextIO) -> Generator:
# Partly cribbed from Biopython GPI 1.1 parser
# There is no GPI 1.2 parser yet, so I made this
"""Read GPI 1.2 format files (PRIVATE).
This iterator is used to read a gp_information.goa_uniprot
file which is in the GPI 1.2 format.
"""
logging.getLogger().setLevel(logging.WARNING)
# GPI version 1.2
GPI11FIELDS = [
"DB",
"DB_Object_ID",
"DB_Object_Symbol",
"DB_Object_Name",
"DB_Object_Synonym",
"DB_Object_Type",
"Taxon",
"Parent_Object_ID",
"DB_Xref",
"Properties",
]
for inline in handle:
if inline[0] == "!":
continue
inrec: List[str] = inline.rstrip("\n").split("\t")
if len(inrec) == 1:
continue
# DB_Object_Name
inrec[2] = inrec[2].split("|") # type: ignore
# DB_Object_Synonym(s)
inrec[3] = inrec[3].split("|") # type: ignore
try:
# DB_Xref(s)
inrec[7] = inrec[7].split("|") # type: ignore
except IndexError:
logging.debug("No index for DB_Xref for this record")
try:
# Properties
inrec[8] = inrec[8].split("|") # type: ignore
except IndexError:
logging.debug("No index for Properties for this record")

yield dict(zip(GPI11FIELDS, inrec))


# from biopython, to avoid dependency
def _gpa11iterator(handle):
"""Read GPA 1.1 format files (PRIVATE).
This iterator is used to read a gp_association.goa_uniprot
file which is in the GPA 1.1 format. Do not call directly. Rather
use the gpa_iterator function
"""
# GPA version 1.1
GPA11FIELDS = [
"DB",
"DB_Object_ID",
"Qualifier",
"GO_ID",
"DB:Reference",
"ECO_Evidence_code",
"With",
"Interacting_taxon_ID",
"Date",
"Assigned_by",
"Annotation Extension",
"Annotation_Properties",
]
for inline in handle:
if inline[0] == "!":
continue
inrec = inline.rstrip("\n").split("\t")
if len(inrec) == 1:
continue
inrec[2] = inrec[2].split("|") # Qualifier
inrec[4] = inrec[4].split("|") # DB:Reference(s)
inrec[6] = inrec[6].split("|") # With
inrec[10] = inrec[10].split("|") # Annotation extension
yield dict(zip(GPA11FIELDS, inrec))
2 changes: 1 addition & 1 deletion kg_covid_19/utils/transform_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def write_node_edge_item(fh: Any, header: List, data: List, sep: str = '\t'):
raise Exception('Header and data are not the same length.')
try:
fh.write(sep.join(data) + "\n")
except:
except IOError:
logging.warning("Can't write data for {}".format(data))


Expand Down
6 changes: 6 additions & 0 deletions merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ target:
- data/transformed/SciBite-CORD-19/nodes.tsv
- data/transformed/SciBite-CORD-19/edges.tsv

sars-cov-2-gene-annot:
type: tsv
filename:
- data/transformed/sars_cov_2_gene_annot/nodes.tsv
- data/transformed/sars_cov_2_gene_annot/edges.tsv

destination:
type: tsv
filename: merged-kg
Expand Down
22 changes: 22 additions & 0 deletions tests/resources/uniprot_sars-cov-2_SNIPPET.gpa
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
!gpa-version: 1.1
!Columns:
!
! name required? cardinality GAF column #
! DB required 1 1
! DB_Object_ID required 1 2 / 17
! Qualifier required 1 or greater 4
! GO ID required 1 5
! DB:Reference(s) required 1 or greater 6
! ECO evidence code required 1 7 + 6 (GO evidence code + reference)
! With optional 0 or greater 8
! Interacting taxon ID optional 0 or 1 13
! Date required 1 14
! Assigned_by required 1 15
! Annotation Extension optional 0 or greater 16
! Annotation Properties optional 0 or 1 n/a
!
!Generated: 2020-03-26 11:33
!GO-version: http://purl.obolibrary.org/obo/go/releases/2020-03-24/extensions/go-plus.owl
!
UniProtKB P0DTC1 enables GO:0003723 GO_REF:0000043 ECO:0000322 UniProtKB-KW:KW-0694 20200321 UniProt go_evidence=IEA
UniProtKB P0DTC1 involved_in GO:0090305 GO_REF:0000108 ECO:0000366 GO:0004519 20200322 GOC go_evidence=IEA
23 changes: 23 additions & 0 deletions tests/resources/uniprot_sars-cov-2_SNIPPET.gpi
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
!gpi-version: 1.2
!
!This file contains additional information for proteins in the UniProt KnowledgeBase (UniProtKB).
!Protein accessions are represented in this file even if there is no associated GO annotation.
!
!Columns:
!
! name required? cardinality GAF column # Example content
! DB required 1 1 UniProtKB
! DB_Object_ID required 1 2/17 Q4VCS5-1
! DB_Object_Symbol required 1 3 AMOT
! DB_Object_Name optional 0 or greater 10 Angiomotin
! DB_Object_Synonym(s) optional 0 or greater 11 AMOT|KIAA1071
! DB_Object_Type required 1 12 protein
! Taxon required 1 13 taxon:9606
! Parent_Object_ID optional 0 or 1 - UniProtKB:Q4VCS5
! DB_Xref(s) optional 0 or greater - WB:WBGene00000035
! Properties optional 0 or greater - db_subset=Swiss-Prot|target_set=KRUK,BHFL
!
!Generated: 2020-03-24 15:52
!
UniProtKB P0DTD2 P0DTD2 Protein 9b protein taxon:2697049
UniProtKB A0A663DJA2 ORF10 ORF10 protein ORF10 protein taxon:2697049
Loading

0 comments on commit 3c90a08

Please sign in to comment.