In [1]:
from oaklib import get_adapter
from oaklib.utilities.associations.edge_information_util import associations_to_matrix


# Ontology adapter opened from the "sqlite:obo:go" selector; later cells use it
# to iterate relationships and to look up term labels.
ont_adapter = get_adapter("sqlite:obo:go")


In [30]:
# Taxon identifiers to compare. Each numeric entry is interpolated into an
# "amigo:NCBITaxon:<id>" selector below; "ALL" is a sentinel for the
# unrestricted "amigo:" selector.
taxa = ["9606", "6239", "7955", "3702", "559292", "4896", "7227", "10090", "ALL"]

In [31]:
# Build one {term: information content} lookup per entry in `taxa`.
ic_map_by_taxon = {}
for tax in taxa:
    # "ALL" carries no taxon restriction, so it maps to the bare "amigo:" selector.
    if tax == "ALL":
        n = "amigo:"
    else:
        n = f"amigo:NCBITaxon:{tax}"
    assoc_adapter = get_adapter(n)
    # information_content_scores yields (term, score) pairs; collect them into a dict.
    ic_map = dict(assoc_adapter.information_content_scores(use_associations=True))
    ic_map_by_taxon[tax] = ic_map

In [32]:
# Sanity check: number of IC maps built (one per key stored above).
len(ic_map_by_taxon)

9

In [33]:
from typing import Dict


def calculate_edge_information(child: str, parent: str, ic_map: Dict[str, float]):
    """Return the score difference ``ic_map[child] - ic_map[parent]`` for one edge.

    Args:
        child: Identifier of the term at the child end of the edge.
        parent: Identifier of the term at the parent end of the edge.
        ic_map: Mapping from term identifier to its score.

    Returns:
        The child's score minus the parent's score, or ``None`` when either
        term is missing from ``ic_map``.
    """
    both_scored = child in ic_map and parent in ic_map
    if not both_scored:
        return None
    return ic_map[child] - ic_map[parent]

In [34]:
# Example child (c) / parent (p) term pair used to spot-check
# calculate_edge_information against each taxon's IC map.
c="GO:0008527"
p="GO:0050912"


In [35]:
# Print the example edge's information for every taxon key; None means at
# least one of the two terms has no score in that taxon's map.
for tx, matrix in ic_map_by_taxon.items():
    edge_information = calculate_edge_information(c, p, matrix)
    print(tx, edge_information)

9606 0.6844981742720702
6239 None
7955 0.16992500144231393
3702 None
559292 None
4896 None
7227 0.15919859484925425
10090 0.405992359675837
ALL 0.360008134132741


In [36]:
# Collect one record per GO-to-GO relationship, holding the edge information
# (child score minus parent score) under each key of ic_map_by_taxon.
#
# The loop targets are deliberately distinct from the notebook-level names `c`
# and `p`: a `for` target stays bound after the loop ends, so reusing `p` here
# would silently replace the example parent term with the last predicate seen
# and change the result of re-running the spot-check cell.
rows = []
for subject_id, predicate_id, object_id in ont_adapter.relationships():
    # Keep only edges whose two endpoints are both GO terms.
    if not (subject_id.startswith("GO:") and object_id.startswith("GO:")):
        continue
    row = {"child": subject_id, "parent": object_id, "predicate": predicate_id}
    for taxon_key, taxon_ic_map in ic_map_by_taxon.items():
        # None when either endpoint has no score in this taxon's map.
        row[taxon_key] = calculate_edge_information(subject_id, object_id, taxon_ic_map)
    rows.append(row)
    # Progress indicator: echo every 10,000th collected record.
    if len(rows) % 10000 == 0:
        print(row)
        

{'child': 'GO:0007563', 'parent': 'GO:2000026', 'predicate': 'rdfs:subClassOf', '9606': None, '6239': None, '7955': None, '3702': None, '559292': None, '4896': None, '7227': 5.357552004618083, '10090': None, 'ALL': 10.822457022324627}
{'child': 'GO:0019356', 'parent': 'GO:0019357', 'predicate': 'rdfs:subClassOf', '9606': None, '6239': None, '7955': None, '3702': None, '559292': None, '4896': None, '7227': None, '10090': None, 'ALL': None}
{'child': 'GO:0034346', 'parent': 'GO:0034343', 'predicate': 'RO:0002213', '9606': 0.0, '6239': None, '7955': None, '3702': None, '559292': None, '4896': None, '7227': None, '10090': 0.0, 'ALL': 0.0}
{'child': 'GO:0045831', 'parent': 'GO:0010461', 'predicate': 'RO:0002212', '9606': None, '6239': None, '7955': None, '3702': None, '559292': None, '4896': None, '7227': None, '10090': None, 'ALL': None}
{'child': 'GO:0060295', 'parent': 'GO:0003352', 'predicate': 'rdfs:subClassOf', '9606': 0.4381211123918849, '6239': 0.0, '7955': 0.5405683813627036, '3702

In [37]:
import pandas as pd


In [38]:
def has_val(row: dict):
    """Tell whether a record carries at least one non-None value column.

    The keys "child", "parent" and "predicate" identify the edge and are
    ignored; every other key counts as a value column.

    Args:
        row: Record mapping column names to values.

    Returns:
        True if any non-identifying key maps to something other than None,
        otherwise False (including for an empty record).
    """
    edge_keys = ("child", "parent", "predicate")
    return any(
        value is not None
        for key, value in row.items()
        if key not in edge_keys
    )

In [39]:
# Drop records for which has_val finds no non-None value column.
filtered_rows = list(filter(has_val, rows))

In [40]:
# Attach the ontology label of both endpoints to every retained record
# (child_label is set before parent_label, mutating the records in place).
for row in filtered_rows:
    row.update(
        child_label=ont_adapter.label(row["child"]),
        parent_label=ont_adapter.label(row["parent"]),
    )

In [54]:
# Tabulate the retained edge records: one row per record, one column per key.
df = pd.DataFrame(filtered_rows)
df

Unnamed: 0,child,parent,predicate,9606,6239,7955,3702,559292,4896,7227,10090,ALL,child_label,parent_label
0,GO:0000001,GO:0005739,obo:GOREL_0002003,,,,,5.872605,8.577429,7.812177,,9.414945,mitochondrion inheritance,mitochondrion
1,GO:0000001,GO:0048308,rdfs:subClassOf,,,,,1.373458,0.584963,0.000000,,1.982152,mitochondrion inheritance,organelle inheritance
2,GO:0000001,GO:0048311,rdfs:subClassOf,,,,,0.241008,1.807355,1.459432,,1.942707,mitochondrion inheritance,mitochondrion distribution
3,GO:0000002,GO:0007005,rdfs:subClassOf,4.056831,4.38247,4.551796,3.658211,2.390915,2.929611,4.650254,3.865499,3.703148,mitochondrial genome maintenance,mitochondrion organization
4,GO:0000006,GO:0005385,rdfs:subClassOf,,,,,3.169925,2.584963,,,7.229558,high-affinity zinc transmembrane transporter a...,zinc ion transmembrane transporter activity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65831,GO:2001316,GO:0120254,rdfs:subClassOf,,,,,,,,,9.466926,kojic acid metabolic process,olefinic compound metabolic process
65832,GO:2001317,GO:0034309,rdfs:subClassOf,,,,,,,,,7.176589,kojic acid biosynthetic process,primary alcohol biosynthetic process
65833,GO:2001317,GO:0042181,rdfs:subClassOf,,,,,,,,,9.716534,kojic acid biosynthetic process,ketone biosynthetic process
65834,GO:2001317,GO:0120255,rdfs:subClassOf,,,,,,,,,6.481127,kojic acid biosynthetic process,olefinic compound biosynthetic process


In [55]:
# Create the output directory without shelling out, so the notebook also runs
# where `mkdir -p` is unavailable; a no-op when the directory already exists.
from pathlib import Path

Path("results").mkdir(parents=True, exist_ok=True)

In [56]:
# Persist the edge table as CSV; index=False leaves the row index out of the file.
df.to_csv("results/edge_information.csv", index=False)

In [57]:
# Select the per-taxon value columns by name instead of by position.
# Positional slices (columns[3:-2] / columns[3:-3]) only hold while the two
# label columns are the last ones; once summary columns are appended to `df`,
# re-running this cell would pick up label/summary columns instead.
# On a freshly built `df` this yields the same columns, in the same order.
taxa_columns = df.columns[df.columns.isin(taxa)]
# Species-only view: every taxon column except the "ALL" aggregate.
species_columns = taxa_columns[taxa_columns != "ALL"]

In [58]:
import numpy as np

# Replace negative ICdiff values with NaN so they are skipped by the row-wise
# mean/max below. `mask` blanks exactly the cells where the condition is True;
# existing NaNs compare False and are left untouched. This replaces the
# element-wise `applymap` call, which pandas has deprecated.
df[taxa_columns] = df[taxa_columns].mask(df[taxa_columns] < 0)



  df[taxa_columns] = df[taxa_columns].applymap(lambda x: np.nan if x < 0 else x)


In [59]:
# Row-wise mean ICdiff per edge, skipping missing values: once across the
# columns in taxa_columns and once across the columns in species_columns.
df['average_ICdiff'] = df.loc[:, taxa_columns].mean(axis="columns", skipna=True)
df['average_ICdiff_sp'] = df.loc[:, species_columns].mean(axis="columns", skipna=True)


In [60]:
# Row-wise maximum ICdiff per edge, skipping missing values: once across the
# columns in taxa_columns and once across the columns in species_columns.
df['max_ICdiff'] = df.loc[:, taxa_columns].max(axis="columns", skipna=True)
df['max_ICdiff_sp'] = df.loc[:, species_columns].max(axis="columns", skipna=True)

# Edges whose average ICdiff equals the smallest average in the table.
lowest_average_ICdiff_edges = df[df['average_ICdiff'].eq(df['average_ICdiff'].min())]

# Edges whose maximum ICdiff equals the smallest maximum in the table.
lowest_max_ICdiff_edges = df[df['max_ICdiff'].eq(df['max_ICdiff'].min())]

# Bundle the two edge selections with compact per-edge summary views.
low_icdiff_stats = dict(
    lowest_average_ICdiff_edges=lowest_average_ICdiff_edges,
    lowest_max_ICdiff_edges=lowest_max_ICdiff_edges,
    average_ICdiff_per_edge=df.loc[:, ['child', 'parent', 'predicate', 'average_ICdiff']],
    max_ICdiff_per_edge=df.loc[:, ['child', 'parent', 'predicate', 'max_ICdiff']],
)

In [61]:
# Display the dictionary of low-ICdiff selections and per-edge summaries.
low_icdiff_stats

{'lowest_average_ICdiff_edges':             child      parent        predicate  9606  6239  7955  3702  \
 49     GO:0000036  GO:0044620  rdfs:subClassOf   0.0   0.0   0.0   0.0   
 50     GO:0000036  GO:0140414  rdfs:subClassOf   0.0   0.0   0.0   0.0   
 88     GO:0000073  GO:1905047      BFO:0000050   NaN   NaN   NaN   NaN   
 89     GO:0000073  GO:0110100  rdfs:subClassOf   NaN   NaN   NaN   NaN   
 100    GO:0000080  GO:0051318  rdfs:subClassOf   0.0   NaN   NaN   0.0   
 ...           ...         ...              ...   ...   ...   ...   ...   
 65782  GO:2001299  GO:2001298  rdfs:subClassOf   0.0   NaN   NaN   NaN   
 65806  GO:2001307  GO:2001305  rdfs:subClassOf   NaN   NaN   NaN   NaN   
 65820  GO:2001310  GO:2001308  rdfs:subClassOf   NaN   NaN   NaN   NaN   
 65828  GO:2001315  GO:2001313  rdfs:subClassOf   NaN   NaN   NaN   NaN   
 65835  GO:2001317  GO:2001316  rdfs:subClassOf   NaN   NaN   NaN   NaN   
 
        559292  4896  7227  10090  ALL  \
 49        0.0   0.0   0.

In [62]:
# Display the edge table, now including the average/max ICdiff columns.
df

Unnamed: 0,child,parent,predicate,9606,6239,7955,3702,559292,4896,7227,10090,ALL,child_label,parent_label,average_ICdiff,average_ICdiff_sp,max_ICdiff,max_ICdiff_sp
0,GO:0000001,GO:0005739,obo:GOREL_0002003,,,,,5.872605,8.577429,7.812177,,9.414945,mitochondrion inheritance,mitochondrion,7.919289,7.420737,9.414945,8.577429
1,GO:0000001,GO:0048308,rdfs:subClassOf,,,,,1.373458,0.584963,0.000000,,1.982152,mitochondrion inheritance,organelle inheritance,0.985143,0.652807,1.982152,1.373458
2,GO:0000001,GO:0048311,rdfs:subClassOf,,,,,0.241008,1.807355,1.459432,,1.942707,mitochondrion inheritance,mitochondrion distribution,1.362625,1.169265,1.942707,1.807355
3,GO:0000002,GO:0007005,rdfs:subClassOf,4.056831,4.38247,4.551796,3.658211,2.390915,2.929611,4.650254,3.865499,3.703148,mitochondrial genome maintenance,mitochondrion organization,3.798748,3.810698,4.650254,4.650254
4,GO:0000006,GO:0005385,rdfs:subClassOf,,,,,3.169925,2.584963,,,7.229558,high-affinity zinc transmembrane transporter a...,zinc ion transmembrane transporter activity,4.328149,2.877444,7.229558,3.169925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65831,GO:2001316,GO:0120254,rdfs:subClassOf,,,,,,,,,9.466926,kojic acid metabolic process,olefinic compound metabolic process,9.466926,,9.466926,
65832,GO:2001317,GO:0034309,rdfs:subClassOf,,,,,,,,,7.176589,kojic acid biosynthetic process,primary alcohol biosynthetic process,7.176589,,7.176589,
65833,GO:2001317,GO:0042181,rdfs:subClassOf,,,,,,,,,9.716534,kojic acid biosynthetic process,ketone biosynthetic process,9.716534,,9.716534,
65834,GO:2001317,GO:0120255,rdfs:subClassOf,,,,,,,,,6.481127,kojic acid biosynthetic process,olefinic compound biosynthetic process,6.481127,,6.481127,


In [63]:
# Edges whose largest ICdiff over the taxa columns is exactly zero.
df[df["max_ICdiff"] == 0]

Unnamed: 0,child,parent,predicate,9606,6239,7955,3702,559292,4896,7227,10090,ALL,child_label,parent_label,average_ICdiff,average_ICdiff_sp,max_ICdiff,max_ICdiff_sp
49,GO:0000036,GO:0044620,rdfs:subClassOf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,acyl carrier activity,ACP phosphopantetheine attachment site binding,0.0,0.0,0.0,0.0
50,GO:0000036,GO:0140414,rdfs:subClassOf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,acyl carrier activity,phosphopantetheine-dependent carrier activity,0.0,0.0,0.0,0.0
88,GO:0000073,GO:1905047,BFO:0000050,,,,,0.0,0.0,,,0.0,initial mitotic spindle pole body separation,mitotic spindle pole body organization,0.0,0.0,0.0,0.0
89,GO:0000073,GO:0110100,rdfs:subClassOf,,,,,0.0,0.0,,,0.0,initial mitotic spindle pole body separation,spindle pole body separation,0.0,0.0,0.0,0.0
100,GO:0000080,GO:0051318,rdfs:subClassOf,0.0,,,0.0,0.0,,,0.0,0.0,mitotic G1 phase,G1 phase,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65782,GO:2001299,GO:2001298,rdfs:subClassOf,0.0,,,,,,,0.0,0.0,"N(omega),N(omega)-dimethyl-L-arginine cataboli...","N(omega),N(omega)-dimethyl-L-arginine metaboli...",0.0,0.0,0.0,0.0
65806,GO:2001307,GO:2001305,rdfs:subClassOf,,,,,,,,,0.0,xanthone-containing compound biosynthetic process,xanthone-containing compound metabolic process,0.0,,0.0,
65820,GO:2001310,GO:2001308,rdfs:subClassOf,,,,,,,,,0.0,gliotoxin biosynthetic process,gliotoxin metabolic process,0.0,,0.0,
65828,GO:2001315,GO:2001313,rdfs:subClassOf,,,,,,,,,0.0,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,UDP-4-deoxy-4-formamido-beta-L-arabinopyranose...,0.0,,0.0,


In [51]:
# Export the edges whose largest ICdiff is exactly zero; index=False leaves the
# row index out of the file.
df[df["max_ICdiff"] == 0].to_csv("results/edge_information_zero_ICdiff.csv", index=False)

In [69]:
# Edges with a zero maximum ICdiff across the species columns but a strictly
# positive ICdiff in the "ALL" column.
sp_only = df[df["max_ICdiff_sp"].eq(0) & df["ALL"].gt(0)]

In [70]:
# Display the edges that are zero in every species column yet positive in "ALL".
sp_only

Unnamed: 0,child,parent,predicate,9606,6239,7955,3702,559292,4896,7227,10090,ALL,child_label,parent_label,average_ICdiff,average_ICdiff_sp,max_ICdiff,max_ICdiff_sp
14,GO:0000015,GO:0004634,RO:0002215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048799,phosphopyruvate hydratase complex,phosphopyruvate hydratase activity,0.005422,0.0,0.048799,0.0
42,GO:0000032,GO:0006057,rdfs:subClassOf,,,,,0.0,0.0,,,0.078305,cell wall mannoprotein biosynthetic process,mannoprotein biosynthetic process,0.026102,0.0,0.078305,0.0
43,GO:0000032,GO:0031506,rdfs:subClassOf,,,,,0.0,0.0,,,0.011450,cell wall mannoprotein biosynthetic process,cell wall glycoprotein biosynthetic process,0.003817,0.0,0.011450,0.0
111,GO:0000088,GO:0000087,BFO:0000050,,,,0.0,,,,,3.954196,mitotic prophase,mitotic M phase,1.977098,0.0,3.954196,0.0
113,GO:0000089,GO:0000087,BFO:0000050,,,,,,,,0.0,1.632268,mitotic metaphase,mitotic M phase,0.816134,0.0,1.632268,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65629,GO:2001229,GO:2001228,rdfs:subClassOf,,,,,,,0.0,0.0,0.415037,negative regulation of response to gamma radia...,regulation of response to gamma radiation,0.138346,0.0,0.415037,0.0
65632,GO:2001230,GO:2001228,rdfs:subClassOf,,0.0,,,,,,,2.000000,positive regulation of response to gamma radia...,regulation of response to gamma radiation,1.000000,0.0,2.000000,0.0
65690,GO:2001250,GO:0045848,rdfs:subClassOf,0.0,,,,,,,0.0,0.459432,positive regulation of ammonia assimilation cycle,positive regulation of nitrogen utilization,0.153144,0.0,0.459432,0.0
65691,GO:2001250,GO:2000213,rdfs:subClassOf,0.0,,,,,,,0.0,1.584963,positive regulation of ammonia assimilation cycle,positive regulation of glutamate metabolic pro...,0.528321,0.0,1.584963,0.0


In [71]:
# Export the species-zero / ALL-positive edges; index=False leaves the row
# index out of the file.
sp_only.to_csv("results/edge_information_zero_ICdiff_sp.csv", index=False)

## Agreement between species

In [73]:
import pandas as pd
from scipy.spatial.distance import pdist, squareform


# Rebuild the edge table from filtered_rows and select the per-taxon columns by
# position (everything after the first three columns and before the last two).
# NOTE(review): this rebinds `df`, so the NaN-masking of negative values and the
# summary columns added to the previous `df` are not present here; confirm that
# comparing taxa on the unmasked values is intended.
df = pd.DataFrame(filtered_rows)
taxa_columns = df.columns[3:-2]

# Fill NaN values with column means for distance calculation
# NOTE(review): every missing edge in a taxon receives that taxon's mean value,
# which presumably affects the correlations between sparsely populated columns;
# verify this imputation is acceptable.
df_filled = df[taxa_columns].fillna(df[taxa_columns].mean())

# Calculate pairwise distances between taxa using the correlation distance
# (1 - Pearson correlation); the frame is transposed so that each taxon column
# becomes one observation vector.
pairwise_distances = pdist(df_filled.T, metric='correlation')

# Convert the condensed distance vector to a square, symmetric matrix
pairwise_distances_matrix = squareform(pairwise_distances)

# Create a DataFrame for better readability, labelled by taxon on both axes
taxa_pairwise_distances_df = pd.DataFrame(pairwise_distances_matrix, index=taxa_columns, columns=taxa_columns)

# Identify the taxa pairs with the highest disagreement (largest distance);
# idxmax on the stacked frame returns the (row label, column label) of that cell.
# NOTE(review): taxa_columns appears to include the "ALL" column, so the reported
# pair may involve the aggregate rather than two species; confirm.
max_disagreement = taxa_pairwise_distances_df.max().max()
max_disagreement_pair = taxa_pairwise_distances_df.stack().idxmax()

# Display the results
print("Pairwise Distance Matrix:")
print(taxa_pairwise_distances_df)

print(f"The taxa pair with the highest disagreement is: {max_disagreement_pair} with a distance of {max_disagreement}")


Pairwise Distance Matrix:
            9606      6239      7955      3702    559292      4896      7227  \
9606    0.000000  0.521991  0.384054  0.643245  0.615788  0.637006  0.489546   
6239    0.521991  0.000000  0.378123  0.495840  0.478491  0.499582  0.331505   
7955    0.384054  0.378123  0.000000  0.555535  0.549247  0.566367  0.382261   
3702    0.643245  0.495840  0.555535  0.000000  0.481308  0.488853  0.536048   
559292  0.615788  0.478491  0.549247  0.481308  0.000000  0.313815  0.512179   
4896    0.637006  0.499582  0.566367  0.488853  0.313815  0.000000  0.525220   
7227    0.489546  0.331505  0.382261  0.536048  0.512179  0.525220  0.000000   
10090   0.077110  0.530687  0.392722  0.651923  0.627839  0.651838  0.500718   
ALL     0.341423  0.655647  0.562346  0.613442  0.668773  0.679804  0.594839   

           10090       ALL  
9606    0.077110  0.341423  
6239    0.530687  0.655647  
7955    0.392722  0.562346  
3702    0.651923  0.613442  
559292  0.627839  0.668773  