In [1]:
import collections
import os

import pandas
import networkx

import utilities
from math import gcd


In [2]:
# Directories, one level above the working directory, that hold the
# downloaded inputs and receive the generated annotation files
download_dir = os.path.join(os.pardir, 'download')
annotation_dir = os.path.join(os.pardir, 'annotations')

In [3]:
# GO subsets whose terms are excluded from the exported annotations.
# NOTE: every entry needs its own trailing comma. Adjacent string literals are
# silently concatenated by Python, which previously merged the last two names
# into a single subset name that matches nothing.
remove_subsets = {
    'goantislim_grouping',  # Grouping classes that can be excluded
    'gocheck_do_not_annotate',  # Term not to be used for direct annotation
    'gocheck_do_not_manually_annotate',  # Term not to be used for direct manual annotation
}

# Edge keys (relationship types) that are kept in the graph; edges with any
# other key are removed before annotations are propagated
propagate_along = {'is_a', 'part_of'}

# Evidence codes selected for the 'expev' (experimental evidence) outputs:
#   EXP  Inferred from Experiment
#   IDA  Inferred from Direct Assay
#   IPI  Inferred from Physical Interaction
#   IMP  Inferred from Mutant Phenotype
#   IGI  Inferred from Genetic Interaction
#   IEP  Inferred from Expression Pattern
experimental_codes = set('EXP IDA IPI IMP IGI IEP'.split())

## Read Gene Ontology graph

In [4]:
# Read the Gene Ontology graph from the download directory
graph = utilities.read_go(download_dir)
# `networkx.info` was removed in NetworkX 3.0. Printing the graph itself gives
# the same one-line "<type> named ... with N nodes and M edges" summary.
print(graph)

MultiDiGraph named 'go' with 40665 nodes and 78698 edges


In [5]:
# Tabulate the GO terms of the graph as a dataframe and preview two rows
go_df = utilities.graph_to_dataframe(graph)
go_df.head(n=2)

Unnamed: 0,go_id,go_name,go_domain
0,GO:0000001,mitochondrion inheritance,biological_process
1,GO:0000002,mitochondrial genome maintenance,biological_process


In [6]:
# Record the nodes that should not be annotated: those belonging to at least
# one of the `remove_subsets`. The nodes are only recorded here, not deleted
# from the graph; they are skipped when the annotation table is extracted.
remove_nodes = {
    node
    for node, data in graph.nodes(data=True)
    if not remove_subsets.isdisjoint(data.get('subset', []))
}

# Remove edges that should not be propagated along (the edge key is the
# relationship type). The list is materialised first because the graph must
# not be mutated while its edges are being iterated.
remove_edges = [
    (u, v, key)
    for u, v, key in graph.edges(keys=True)
    if key not in propagate_along
]
graph.remove_edges_from(remove_edges)

# Propagation relies on a topological ordering, which requires a DAG
assert networkx.is_directed_acyclic_graph(graph)
# `networkx.info` was removed in NetworkX 3.0. Printing the graph itself gives
# the same one-line summary.
print(graph)

MultiDiGraph named 'go' with 40665 nodes and 70415 edges


## Read Entrez Gene and annotations

In [7]:
# Read Entrez Gene info, retaining four of its columns
gene_df = utilities.read_gene_info(download_dir)[['GeneID', 'Symbol', 'type_of_gene', 'tax_id']]
gene_df.head(n=2)

  return pandas.read_table(path, comment='#', names=column_names, na_values=['-'], dtype=dtype, index_col=False)


  return pandas.read_table(path, comment='#', names=column_names, na_values=['-'], dtype=dtype, index_col=False)


Unnamed: 0,GeneID,Symbol,type_of_gene,tax_id
0,5692769,NEWENTRY,other,7
1,2827857,NEWENTRY,other,9


In [8]:
# Read annotations

# The 45 taxonomy IDs offered by the web interface at https://git.dhimmel.com/gene-ontology/
taxon_ids_to_include = [
    9606, 10116, 10090, 3702, 7955, 7227, 6239, 559292, 9913, 4896,
    352472, 9031, 227321, 511145, 214684, 9823, 198094, 39947, 36329, 9615,
    211586, 223283, 243231, 195103, 227377, 39946, 4536, 4532, 4535, 4537,
    40148, 65491, 4534, 29690, 65489, 4528, 4529, 4538, 29689, 40149,
    52545, 63629, 83307, 83308, 83309,
]

# Load the gene-to-GO annotations, then keep only the rows whose taxon is
# one of `taxon_ids_to_include`
goa_df = utilities.read_gene2go(download_dir)
goa_df = goa_df.loc[goa_df['tax_id'].isin(taxon_ids_to_include)]

goa_df.head(n=2)

Unnamed: 0,tax_id,GeneID,GO_ID,Evidence,Qualifier,GO_term,PubMed,Category
1460596,3702,814629,GO:0003674,ND,enables,molecular_function,,Function
1460597,3702,814629,GO:0005634,ISM,located_in,nucleus,,Component


## Add and propagate annotations

In [9]:
def annotate_graph(graph, goa_df):
    """Return a copy of ``graph`` whose nodes carry direct gene annotations.

    The input graph is copied with ``graph.copy()`` before any attribute is
    written. Every node of the copy receives three fresh, empty sets under the
    keys ``direct_annotations``, ``direct_not_annotations`` and
    ``inferred_annotations``. Each row of ``goa_df`` whose ``GO_ID`` is a node
    of the graph then adds its ``GeneID`` to ``direct_not_annotations`` when
    ``utilities.is_NOT_qaulifier`` is truthy for the row's ``Qualifier``, and
    to ``direct_annotations`` otherwise. Rows whose ``GO_ID`` is not in the
    graph are skipped.

    Parameters
    ----------
    graph : graph whose nodes are looked up by the values of ``GO_ID``
    goa_df : DataFrame providing ``GO_ID``, ``Qualifier`` and ``GeneID`` columns

    Returns
    -------
    The annotated copy. ``inferred_annotations`` is left empty by this function.
    """
    graph = graph.copy()

    # Add dictionary items for storing annotations
    for node, data in graph.nodes.items():
        for key in 'direct_annotations', 'direct_not_annotations', 'inferred_annotations':
            data[key] = set()

    # Populate direct annotations. The three needed columns are walked in
    # lockstep instead of using DataFrame.iterrows(), which builds a Series
    # for every row and is very slow on a large annotation table.
    columns = goa_df['GO_ID'], goa_df['Qualifier'], goa_df['GeneID']
    for go_id, qualifier, gene in zip(*columns):
        if go_id not in graph:
            continue

        key = 'direct_not_annotations' if utilities.is_NOT_qaulifier(qualifier) else 'direct_annotations'
        graph.nodes[go_id][key].add(gene)

    return graph

In [10]:
def propagate_annotations(graph):
    """Fill every node's ``inferred_annotations`` set in place.

    Nodes are visited in the order given by ``networkx.topological_sort``, so
    a node is finalised before any of its successors. Finalising a node
    removes its ``direct_not_annotations`` from the genes it has received so
    far and then adds its ``direct_annotations``. The resulting set is
    unioned into the ``inferred_annotations`` of each successor.

    The graph is modified in place and nothing is returned.
    """
    attributes = graph.nodes
    for term in networkx.topological_sort(graph):
        term_data = attributes[term]
        genes = term_data['inferred_annotations']
        # Order matters: drop the negated genes first, then add direct ones
        genes.difference_update(term_data['direct_not_annotations'])
        genes.update(term_data['direct_annotations'])
        # Hand the finalised set on to every subsuming term
        for subsuming_term in graph.successors(term):
            attributes[subsuming_term]['inferred_annotations'].update(genes)

In [11]:
def joiner(x):
    """Return the items of ``x``, each converted with ``str``, joined by '|'."""
    return '|'.join(str(item) for item in x)

def extract_annotation_df(graph):
    """Build a dataframe with one row per taxon, GO term and annotation kind.

    Reads the notebook-level ``remove_nodes``, ``gene_df`` and ``go_df``.

    First a long table of ``(go_id, kind, GeneID)`` is collected from the
    ``direct_annotations`` and ``inferred_annotations`` sets of every node not
    listed in ``remove_nodes``, and merged with ``gene_df`` using the
    ``DataFrame.merge`` defaults. The long table is then collapsed per
    ``tax_id``, ``kind`` and ``go_id``. Genes are sorted by ``GeneID``, and
    their IDs and symbols are each joined with ``joiner``. Finally the
    result is merged onto ``go_df``, again with the merge defaults.

    Returns a dataframe holding the ``go_df`` columns plus ``tax_id``,
    ``annotation_type``, ``size``, ``gene_ids`` and ``gene_symbols``.
    """
    # Long format: one record per (term, kind, gene)
    long_records = [
        (go_id, kind, gene)
        for go_id, data in graph.nodes.items()
        if go_id not in remove_nodes
        for kind in ('direct', 'inferred')
        for gene in data['{}_annotations'.format(kind)]
    ]
    long_df = pandas.DataFrame(long_records, columns=['go_id', 'kind', 'GeneID']).merge(gene_df)

    # Wide format: one record per (taxon, term, kind) with pipe-joined genes
    wide_records = []
    for (tax_id, kind), taxon_df in long_df.groupby(['tax_id', 'kind']):
        for go_id, term_df in taxon_df.groupby('go_id'):
            ordered_df = term_df.sort_values('GeneID')
            wide_records.append((
                tax_id,
                go_id,
                kind,
                len(ordered_df),
                joiner(ordered_df['GeneID']),
                joiner(ordered_df['Symbol']),
            ))

    wide_columns = ['tax_id', 'go_id', 'annotation_type', 'size', 'gene_ids', 'gene_symbols']
    return go_df.merge(pandas.DataFrame(wide_records, columns=wide_columns))

## Extract and save annotations

In [12]:
for ev_type in ('allev', 'expev'):
    # 'allev' uses every annotation row; 'expev' keeps only the rows whose
    # evidence code is one of `experimental_codes`
    if ev_type == 'expev':
        goa_subset_df = goa_df[goa_df.Evidence.isin(experimental_codes)]
    else:
        goa_subset_df = goa_df

    # Annotate a copy of the graph, propagate, then tabulate
    graph_annot = annotate_graph(graph, goa_subset_df)
    propagate_annotations(graph_annot)
    annotation_df = extract_annotation_df(graph_annot)

    # Write one tab-separated file per (taxon, annotation type) pair
    for (tax_id, annotation_type), df in annotation_df.groupby(['tax_id', 'annotation_type']):
        path = utilities.get_annotation_path(
            annotation_dir, tax_id, annotation_type, ev_type, mkdir=True,
        )
        print(path)
        df.to_csv(path, sep='\t', index=False)

../annotations/taxid_3702/GO_annotations-3702-direct-allev.tsv
../annotations/taxid_3702/GO_annotations-3702-inferred-allev.tsv


../annotations/taxid_4538/GO_annotations-4538-direct-allev.tsv
../annotations/taxid_4538/GO_annotations-4538-inferred-allev.tsv


../annotations/taxid_4896/GO_annotations-4896-direct-allev.tsv
../annotations/taxid_4896/GO_annotations-4896-inferred-allev.tsv
../annotations/taxid_6239/GO_annotations-6239-direct-allev.tsv
../annotations/taxid_6239/GO_annotations-6239-inferred-allev.tsv


../annotations/taxid_7227/GO_annotations-7227-direct-allev.tsv
../annotations/taxid_7227/GO_annotations-7227-inferred-allev.tsv


../annotations/taxid_7955/GO_annotations-7955-direct-allev.tsv
../annotations/taxid_7955/GO_annotations-7955-inferred-allev.tsv


../annotations/taxid_9031/GO_annotations-9031-direct-allev.tsv
../annotations/taxid_9031/GO_annotations-9031-inferred-allev.tsv


../annotations/taxid_9606/GO_annotations-9606-direct-allev.tsv
../annotations/taxid_9606/GO_annotations-9606-inferred-allev.tsv


../annotations/taxid_9615/GO_annotations-9615-direct-allev.tsv
../annotations/taxid_9615/GO_annotations-9615-inferred-allev.tsv


../annotations/taxid_9823/GO_annotations-9823-direct-allev.tsv
../annotations/taxid_9823/GO_annotations-9823-inferred-allev.tsv


../annotations/taxid_9913/GO_annotations-9913-direct-allev.tsv
../annotations/taxid_9913/GO_annotations-9913-inferred-allev.tsv


../annotations/taxid_10090/GO_annotations-10090-direct-allev.tsv
../annotations/taxid_10090/GO_annotations-10090-inferred-allev.tsv


../annotations/taxid_10116/GO_annotations-10116-direct-allev.tsv
../annotations/taxid_10116/GO_annotations-10116-inferred-allev.tsv


../annotations/taxid_36329/GO_annotations-36329-direct-allev.tsv
../annotations/taxid_36329/GO_annotations-36329-inferred-allev.tsv
../annotations/taxid_39947/GO_annotations-39947-direct-allev.tsv
../annotations/taxid_39947/GO_annotations-39947-inferred-allev.tsv


../annotations/taxid_214684/GO_annotations-214684-direct-allev.tsv
../annotations/taxid_214684/GO_annotations-214684-inferred-allev.tsv
../annotations/taxid_227321/GO_annotations-227321-direct-allev.tsv
../annotations/taxid_227321/GO_annotations-227321-inferred-allev.tsv


../annotations/taxid_352472/GO_annotations-352472-direct-allev.tsv
../annotations/taxid_352472/GO_annotations-352472-inferred-allev.tsv
../annotations/taxid_511145/GO_annotations-511145-direct-allev.tsv
../annotations/taxid_511145/GO_annotations-511145-inferred-allev.tsv


../annotations/taxid_559292/GO_annotations-559292-direct-allev.tsv
../annotations/taxid_559292/GO_annotations-559292-inferred-allev.tsv


../annotations/taxid_3702/GO_annotations-3702-direct-expev.tsv
../annotations/taxid_3702/GO_annotations-3702-inferred-expev.tsv
../annotations/taxid_4896/GO_annotations-4896-direct-expev.tsv
../annotations/taxid_4896/GO_annotations-4896-inferred-expev.tsv
../annotations/taxid_6239/GO_annotations-6239-direct-expev.tsv


../annotations/taxid_6239/GO_annotations-6239-inferred-expev.tsv
../annotations/taxid_7227/GO_annotations-7227-direct-expev.tsv
../annotations/taxid_7227/GO_annotations-7227-inferred-expev.tsv
../annotations/taxid_7955/GO_annotations-7955-direct-expev.tsv
../annotations/taxid_7955/GO_annotations-7955-inferred-expev.tsv


../annotations/taxid_9606/GO_annotations-9606-direct-expev.tsv
../annotations/taxid_9606/GO_annotations-9606-inferred-expev.tsv


../annotations/taxid_10090/GO_annotations-10090-direct-expev.tsv
../annotations/taxid_10090/GO_annotations-10090-inferred-expev.tsv


../annotations/taxid_10116/GO_annotations-10116-direct-expev.tsv
../annotations/taxid_10116/GO_annotations-10116-inferred-expev.tsv
../annotations/taxid_36329/GO_annotations-36329-direct-expev.tsv
../annotations/taxid_36329/GO_annotations-36329-inferred-expev.tsv
../annotations/taxid_227321/GO_annotations-227321-direct-expev.tsv
../annotations/taxid_227321/GO_annotations-227321-inferred-expev.tsv
../annotations/taxid_352472/GO_annotations-352472-direct-expev.tsv
../annotations/taxid_352472/GO_annotations-352472-inferred-expev.tsv


../annotations/taxid_511145/GO_annotations-511145-direct-expev.tsv
../annotations/taxid_511145/GO_annotations-511145-inferred-expev.tsv
../annotations/taxid_559292/GO_annotations-559292-direct-expev.tsv
../annotations/taxid_559292/GO_annotations-559292-inferred-expev.tsv
