# Term Usage Summary for GO

In [35]:
from oaklib.datamodels.vocabulary import IS_A, PART_OF
from oaklib import get_adapter

# Adapter selectors, named up front so the data sources are easy to spot.
# NOTE(review): "sqlite:obo:go" presumably resolves to a SQLite build of GO and
# "amigo:" to the AmiGO annotation service -- confirm against the OAK docs.
GO_SELECTOR = "sqlite:obo:go"
AMIGO_SELECTOR = "amigo:"

# `go` is used below for ontology structure (labels, leafs, descendants);
# `amigo` is used below for association (annotation) counts.
go = get_adapter(GO_SELECTOR)
amigo = get_adapter(AMIGO_SELECTOR)

In [24]:
import pandas as pd
import numpy as np

In [40]:

# Build a long-format table of association counts per term.
#
# For each evidence code two counts are collected from `amigo`:
#   * "<code>"        -- counts computed with object_closure_predicates set to
#                        IS_A + PART_OF (only ids starting with "GO:" are kept)
#   * "<code>_direct" -- counts computed without any closure predicates
# The pseudo-code "All" applies no evidence restriction (empty property filter).
evidence_codes = ["IBA", "IEA", "All"]
# Loop-invariant, so defined once outside the loop.
preds = [IS_A, PART_OF]
rows = []
for evidence_code in evidence_codes:
    print(f"Summary for evidence code: {evidence_code}")
    # Named `property_filter` (not `filter`) so the Python builtin is not
    # shadowed for the rest of the kernel session.
    property_filter = {} if evidence_code == "All" else {"evidence_type": evidence_code}
    # NOTE(review): min_facet_count=1 / limit=-1 presumably mean "keep every term
    # with at least one association, no row limit" -- confirm against the OAK API.
    term_counts = amigo.association_counts(
        object_closure_predicates=preds,
        property_filter=property_filter,
        min_facet_count=1,
        limit=-1,
    )
    rows += [(term, evidence_code, count) for term, count in term_counts if term.startswith("GO:")]
    term_counts = amigo.association_counts(
        property_filter=property_filter,
        min_facet_count=1,
        limit=-1,
    )
    rows += [(term, evidence_code + "_direct", count) for term, count in term_counts]
melted_df = pd.DataFrame(rows, columns=["term", "code", "count"])
melted_df

Summary for evidence code: IBA
Summary for evidence code: IEA
Summary for evidence code: All


Unnamed: 0,term,code,count
0,GO:0000002,IBA,759
1,GO:0000003,IBA,16743
2,GO:0000006,IBA,5
3,GO:0000007,IBA,8
4,GO:0000009,IBA,178
...,...,...,...
129808,GO:2001307,All_direct,3
129809,GO:2001310,All_direct,12
129810,GO:2001311,All_direct,36
129811,GO:2001315,All_direct,1


In [41]:
# Reshape from long to wide: one row per term, one column per evidence code
# (plus its "_direct" variant). Term/code combinations with no count become 0,
# and the counts are cast back to integers after the NaN fill.
pivoted_df = (
    melted_df
    .pivot(index="term", columns="code", values="count")
    .fillna(0)
    .astype(int)
)
pivoted_df

code,All,All_direct,IBA,IBA_direct,IEA,IEA_direct
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GO:0000001,145,145,0,0,102,102
GO:0000002,1473,564,759,168,411,279
GO:0000003,46815,302,16743,158,9892,5
GO:0000006,16,16,5,5,6,6
GO:0000007,14,14,8,8,5,5
...,...,...,...,...,...,...
GO:2001311,36,36,21,21,3,3
GO:2001313,1,0,0,0,0,0
GO:2001315,1,1,0,0,0,0
GO:2001316,3,0,0,0,0,0


In [6]:

# The three GO root terms; each one is walked separately further down and its
# label (see the "Processing root" output) becomes the term's group.
roots = [
    "GO:0008150",  # biological_process
    "GO:0005575",  # cellular_component
    "GO:0003674",  # molecular_function
]


In [27]:
# Leaf terms when both IS_A and PART_OF edges are considered.
# filter_obsoletes=True is passed, presumably to exclude obsolete terms.
isa_partof_leafs = list(
    go.leafs(
        predicates=[IS_A, PART_OF],
        filter_obsoletes=True,
    )
)

In [8]:
# Leaf terms when only IS_A edges are considered.
# filter_obsoletes=True is passed, presumably to exclude obsolete terms.
isa_leafs = list(
    go.leafs(
        predicates=[IS_A],
        filter_obsoletes=True,
    )
)


In [28]:
from oaklib.datamodels.vocabulary import REGULATES, NEGATIVELY_REGULATES, POSITIVELY_REGULATES

# Leaf terms under the widest predicate set used in this notebook:
# IS_A, PART_OF and the three regulates relations.
all_leafs = list(
    go.leafs(
        predicates=[
            IS_A,
            PART_OF,
            REGULATES,
            NEGATIVELY_REGULATES,
            POSITIVELY_REGULATES,
        ],
        filter_obsoletes=True,
    )
)


In [29]:
# Build one record per term reachable from each GO root via IS_A, tagging it
# with its root's label and with three leaf flags (one per predicate set).
#
# The leaf lists are converted to sets once so that each membership test is
# O(1) instead of a linear scan of a list for every term.
isa_partof_leaf_set = set(isa_partof_leafs)
isa_leaf_set = set(isa_leafs)
all_leaf_set = set(all_leafs)

objs = []
for root in roots:
    root_label = go.label(root)
    print(f"Processing root: {root} {root_label}")
    for term in go.descendants(root, predicates=[IS_A]):
        obj = {
            "term": term,
            "label": go.label(term),
            # The trailing colon in "group:" is kept on purpose: the sort cell
            # further down refers to this exact column name.
            "group:": root_label,
            # Uses isa_partof_leafs; the previous name `leafs` is not defined
            # anywhere in this notebook and fails on a fresh kernel.
            "isa_partof_leaf": term in isa_partof_leaf_set,
            "isa_leaf": term in isa_leaf_set,
            "full_leaf": term in all_leaf_set,
        }
        objs.append(obj)
        

Processing root: GO:0008150 biological_process
Processing root: GO:0005575 cellular_component
Processing root: GO:0003674 molecular_function


In [30]:
# Turn the per-term records into a table: one row per dict in `objs`,
# with the dict keys as columns.
terms_df = pd.DataFrame(data=objs)
terms_df

Unnamed: 0,term,label,group:,isa_partof_leaf,isa_leaf,full_leaf
0,GO:0075032,negative regulation of formation of symbiont g...,biological_process,True,True,True
1,GO:0010032,meiotic chromosome condensation,biological_process,True,True,True
2,GO:0070458,cellular detoxification of nitrogen compound,biological_process,True,True,True
3,GO:0060235,lens induction in camera-type eye,biological_process,True,True,True
4,GO:2000261,"negative regulation of blood coagulation, comm...",biological_process,True,True,True
...,...,...,...,...,...,...
42893,GO:0052814,medium-chain-aldehyde dehydrogenase activity,molecular_function,True,True,True
42894,GO:0052587,diacetyl reductase ((R)-acetoin forming) activity,molecular_function,True,True,True
42895,GO:0102645,17(E)-cheilanthenediol synthase activity,molecular_function,True,True,True
42896,GO:0047999,hyponitrite reductase activity,molecular_function,True,True,True


In [42]:
# Attach the usage counts to every term. A left join keeps all rows of
# terms_df, including terms that have no counts at all.
df = pd.merge(
    terms_df,
    pivoted_df,
    how="left",
    left_on="term",
    right_on="term",
)
# Terms missing from pivoted_df come out of the join as NaN; set those counts
# to 0 and restore integer dtype. Only numeric columns are selected, so the
# boolean leaf flags are left as they are.
cols = df.select_dtypes(include=[np.number]).columns
df[cols] = df[cols].fillna(0).astype(int)
df

Unnamed: 0,term,label,group:,isa_partof_leaf,isa_leaf,full_leaf,All,All_direct,IBA,IBA_direct,IEA,IEA_direct
0,GO:0075032,negative regulation of formation of symbiont g...,biological_process,True,True,True,0,0,0,0,0,0
1,GO:0010032,meiotic chromosome condensation,biological_process,True,True,True,340,340,291,291,22,22
2,GO:0070458,cellular detoxification of nitrogen compound,biological_process,True,True,True,43,43,0,0,21,21
3,GO:0060235,lens induction in camera-type eye,biological_process,True,True,True,29,29,0,0,15,15
4,GO:2000261,"negative regulation of blood coagulation, comm...",biological_process,True,True,True,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
42893,GO:0052814,medium-chain-aldehyde dehydrogenase activity,molecular_function,True,True,True,10,10,0,0,2,2
42894,GO:0052587,diacetyl reductase ((R)-acetoin forming) activity,molecular_function,True,True,True,2,2,0,0,2,2
42895,GO:0102645,17(E)-cheilanthenediol synthase activity,molecular_function,True,True,True,0,0,0,0,0,0
42896,GO:0047999,hyponitrite reductase activity,molecular_function,True,True,True,0,0,0,0,0,0


In [43]:
# Order rows by the "group:" column (the root label) and then by term id.
sort_columns = ["group:", "term"]
df = df.sort_values(by=sort_columns)
df

Unnamed: 0,term,label,group:,isa_partof_leaf,isa_leaf,full_leaf,All,All_direct,IBA,IBA_direct,IEA,IEA_direct
5134,GO:0000001,mitochondrion inheritance,biological_process,True,True,True,145,145,0,0,102,102
12729,GO:0000002,mitochondrial genome maintenance,biological_process,False,True,False,1473,564,759,168,411,279
18901,GO:0000003,reproduction,biological_process,False,False,False,46815,302,16743,158,9892,5
24355,GO:0000011,vacuole inheritance,biological_process,True,True,True,203,203,22,22,141,141
17370,GO:0000012,single strand break repair,biological_process,False,False,False,189,189,97,97,44,44
...,...,...,...,...,...,...,...,...,...,...,...,...
39756,GO:2001083,alpha-D-glucan binding,molecular_function,True,True,True,0,0,0,0,0,0
31732,GO:2001084,L-arabinofuranose binding,molecular_function,True,True,True,0,0,0,0,0,0
38649,GO:2001085,arabinogalactan binding,molecular_function,True,True,True,0,0,0,0,0,0
33826,GO:2001147,camalexin binding,molecular_function,True,True,True,2,2,0,0,0,0


In [44]:
# Write the final per-term summary as a tab-separated file without the index.
from pathlib import Path

output_path = Path("output/go-term-usage-summary.tsv")
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, sep="\t")