In [82]:
import json
import gzip
import copy


from scipy.stats import fisher_exact
import numpy as np
import pandas as pd

In [88]:
# Load the Gene Ontology graph from the gzip-compressed JSON dump
with gzip.open("data/go.json.gz") as f:
    ontology = json.load(f)

go_graph = ontology["graphs"][0]

# Direct "is_a" parents of every term; other edge predicates are ignored
parents = {}  # { term : list_of_parent_terms }
for edge in go_graph["edges"]:
    if edge["pred"] != "is_a":
        continue
    # Identifiers are split on "_" and only the part after the first "_" is kept
    child = edge["sub"].split("_")[1]
    parent = edge["obj"].split("_")[1]
    parents.setdefault(child, []).append(parent)

# Keep only GO nodes whose metadata does not carry a "deprecated" entry
nodes = []  # list of terms
labels = {}  # { term : label }
for node in go_graph["nodes"]:
    node_id = node["id"]
    if "GO_" not in node_id or "deprecated" in node["meta"]:
        continue
    term = node_id.split("_")[1]
    nodes.append(term)
    labels[term] = node["lbl"]

print("Total nodes {}\nparsed nodes {}\nnodes with parents {}\n".format(len(go_graph["nodes"]), len(nodes), len(parents)))

Total nodes 50038
parsed nodes 44650
nodes with parents 44647



In [89]:
# Root terms are the parsed terms that never appear as the child of an is_a edge
roots = set(nodes) - set(parents)
root_summary = [(r, labels[r]) for r in roots]
print("Roots: {}\n".format(root_summary))

Roots: [('0003674', 'molecular_function'), ('0005575', 'cellular_component'), ('0008150', 'biological_process')]



In [90]:
# Collect, for every term, all the terms reachable by walking up the is_a edges.
# The walk proceeds one level at a time; a term reachable through several paths
# is appended once per path, so the lists may contain duplicates.
ancestors = {}  # { term : list_of_ancestor_terms }
for node in nodes:
    lineage = []
    frontier = parents.get(node)
    while frontier:  # stops when a level has no parents (None or empty list)
        lineage += frontier
        # Move one level up: gather the parents of every term in the current level
        next_level = []
        for parent in frontier:
            next_level.extend(parents.get(parent, []))
        frontier = next_level
    ancestors[node] = lineage

In [91]:
# Minimum depth of each term: the number of is_a levels climbed before a root
# shows up among the terms of the current level (0 for terms without parents)
depth = {}  # { term : min_depth }
for node in nodes:
    level = 0
    frontier = parents.get(node)
    while frontier:
        level += 1
        if not roots.isdisjoint(frontier):  # a root was reached at this level
            break
        # Climb one level: parents of all the terms in the current level
        frontier = [grandparent for parent in frontier for grandparent in parents.get(parent, [])]
    depth[node] = level

In [93]:
depth["0003674"]

0

In [6]:
def gen_block(f):
    """
    Parse a GAF annotation stream and yield the GO terms of one protein at a time.

    The input must be sorted by target name, second column, so that all the
    lines of a protein are contiguous.

    UniProtKB       A0A024R1R8      hCG_2014768             GO:0002181      PMID:21873635   IBA     PANTHER:PTN002008372|SGD:S000007246     P       HCG2014768, isoform CRA_a       hCG_2014768     protein taxon:9606      20171102        GO_Central
    UniProtKB       A0A024RBG1      NUDT4B          GO:0003723      GO_REF:0000037  IEA     UniProtKB-KW:KW-0694    F       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20191109        UniProt
    UniProtKB       A0A024RBG1      NUDT4B          GO:0005829      GO_REF:0000052  IDA             C       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20161204        HPA

    Parameters
    ----------
    f : iterable of bytes or str
        Lines of the annotation file, e.g. a file opened with ``gzip.open``
        (binary) or ``open`` (text). Lines starting with "!" and blank lines
        are skipped.

    Yields
    ------
    (str, set of str)
        The protein identifier (second column) and the set of its GO term IDs
        (fifth column, with the leading "GO:" removed). A set is used because
        the same term can be repeated, e.g. with different evidence codes.

    Raises
    ------
    ValueError
        If a data line has fewer than 10 tab-separated columns.
    """
    old_name = None
    chunk = set()
    for line in f:
        # Accept both binary and text streams
        if isinstance(line, bytes):
            line = line.decode()
        # Skip header/comment lines and blank lines (a bare "\n" is truthy)
        if not line.strip() or line[0] == "!":
            continue
        fields = line.split("\t")
        if len(fields) < 10:
            raise ValueError(
                "expected at least 10 tab-separated columns, got {}: {!r}".format(len(fields), line)
            )
        name = fields[1]
        term = fields[4][3:]  # remove "GO:" from the term ID
        # A new identifier closes the block of the previous protein
        if name != old_name and old_name:
            yield (old_name, chunk)
            chunk = set()
        old_name = name
        chunk.add(term)
    # Flush the block of the last protein
    if old_name:
        yield (old_name, chunk)

## Create a dictionary with all the annotations

In [7]:
# Map each protein identifier to the set of GO terms it is directly annotated with
with gzip.open("data/goa_human.gaf.gz") as f:
    protein_to_go = dict(gen_block(f))  # { protein_id : (GO terms) }

In [17]:
# Target dataset: one entry per line of the file
with open("data/original.txt") as handle:
    dataset_text = handle.read()
dataset = dataset_text.splitlines()

In [18]:
# Background dataset: one entry per line of the file
with open("data/swiss-human-id.txt") as handle:
    background_text = handle.read()
background_dataset = background_text.splitlines()

## Let's check whether all the proteins in the dataset are annotated

In [19]:
# Number of lines read from data/swiss-human-id.txt
len(background_dataset)

20367

In [20]:
# Number of lines read from data/original.txt
len(dataset)

92

In [21]:
# Number of proteins with at least one annotation (dictionary keys are unique)
len(protein_to_go)

19473

In [22]:
# How many dataset entries also appear in the background
len(set(background_dataset) & set(dataset))

92

In [23]:
# How many dataset entries have annotations
len(set(protein_to_go) & set(dataset))

91

## One protein is missing

# Which one?

In [24]:
# Dataset entries that have no annotations
set(dataset) - set(protein_to_go)

{'Q8TC17'}

## This protein (Q8TC17) has no GO annotations

<br>
<br>
<br>
<br>




# Enrichment

In [55]:
def count_ancestors(protein_list):
    """
    Count, for every GO term, how many proteins of the list carry it.

    A protein carries a term when it is directly annotated with it or with
    one of its descendants (annotations are expanded with the module-level
    ``ancestors`` dictionary). Proteins missing from ``protein_to_go`` are
    ignored, and each protein contributes at most once to a term.

    Returns a dictionary { term : number_of_proteins }.
    """
    counts = {}
    annotated = set(protein_list).intersection(set(protein_to_go.keys()))

    for protein in annotated:
        direct_terms = protein_to_go[protein]

        # Work on a copy so the stored annotations are left untouched
        all_terms = copy.copy(direct_terms)
        for term in direct_terms:
            all_terms |= set(ancestors.get(term, []))

        for term in all_terms:
            counts[term] = counts.get(term, 0) + 1

    return counts

In [61]:
# Per-term protein counts for the target dataset and for the background
dataset_count, background_count = map(count_ancestors, (dataset, background_dataset))

In [62]:
# Terms seen in the dataset, in the background, and in both
shared_terms = set(dataset_count) & set(background_count)
print(len(dataset_count), len(background_count), len(shared_terms))

2591 22298 2591


In [133]:
def fisher_test(d_count, bg_count, depth, l):
    """
    Run a Fisher exact test for every GO term found in both count dictionaries.

    Parameters
    ----------
    d_count : dict
        { GO_term : count } for the target dataset.
    bg_count : dict
        { GO_term : count } for the background dataset.
    depth : dict
        { GO_term : depth }; must contain every term shared by the two counts.
    l : dict
        { GO_term : label }; must contain every term shared by the two counts.

    Returns
    -------
    pandas.DataFrame
        One row per shared term (indexed by term) with the columns
        'OddRatio', 'p-value', 'depth' and 'label'. The frame is built from
        records so each column keeps its natural dtype, and the columns are
        present even when no term is shared.

    Raises
    ------
    KeyError
        If a shared term is missing from ``depth`` or ``l``.
    """
    columns = ['OddRatio', 'p-value', 'depth', 'label']

    # Totals are the sums of all the per-term counts of each dictionary
    tot_d = sum(d_count.values())
    tot_bg = sum(bg_count.values())
    # NOTE(review): the totals count term occurrences, not proteins, and the
    # background counts are used as-is even if the dataset is a subset of the
    # background - confirm this is the intended contingency table.

    terms = []
    rows = []
    for key in set(d_count.keys()).intersection(set(bg_count.keys())):
        # Occurrences of this term in the dataset and in the background
        a = d_count[key]
        b = bg_count[key]
        # Second row: occurrences of all the other terms
        fisher_results = fisher_exact([[a, b], [tot_d - a, tot_bg - b]])
        terms.append(key)
        rows.append({'OddRatio': fisher_results[0], 'p-value': fisher_results[1],
                     'depth': depth[key], 'label': l[key]})

    return pd.DataFrame(rows, index=terms, columns=columns)

In [134]:
# Enrichment table for the dataset against the background
pesce = fisher_test(d_count=dataset_count, bg_count=background_count, depth=depth, l=labels)

In [135]:
# Highest odds ratio first
pesce = pesce.sort_values(by="OddRatio", ascending=False)

In [136]:
# First 50 rows of the table
pesce.iloc[:50]

Unnamed: 0,OddRatio,p-value,depth,label
35685,102.965,0.000548114,4,helper T cell diapedesis
33277,102.965,0.000548114,4,abortive mitotic cell cycle
16170,102.959,0.0191471,5,interleukin-15 receptor binding
97699,102.959,0.0191471,5,vascular endothelial cell response to fluid sh...
32752,102.959,0.0191471,6,positive regulation of interleukin-3 production
1990859,102.959,0.0191471,6,cellular response to endothelin
1904618,102.959,0.0191471,6,positive regulation of actin binding
45399,102.959,0.0191471,6,regulation of interleukin-3 biosynthetic process
8269,102.959,0.0191471,5,JAK pathway signal transduction adaptor activity
1903674,102.959,0.0191471,9,regulation of cap-dependent translational init...


In [118]:
# Boolean mask selecting the terms whose depth is at most 2
max_depth = pesce["depth"] <= 2

In [119]:
# Rows selected by the depth mask
pesce.loc[max_depth]

Unnamed: 0,OddRatio,p-value,depth
0001545,102.958708,1.914707e-02,2.0
0002339,34.319527,3.792755e-02,2.0
0035426,25.739630,4.718252e-02,2.0
0031294,22.894917,1.904113e-12,2.0
0030061,14.708333,7.441668e-02,2.0
...,...,...,...
0005815,0.155464,2.574858e-02,2.0
0051606,0.151347,1.779443e-02,2.0
0007275,0.138695,1.280081e-02,2.0
0038023,0.068944,1.464697e-05,2.0
