In [20]:
#!pip install statsmodels

In [21]:
import os
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
import plotly.express as px

In [2]:
data_dir = '../data'
predictions_path = os.path.join(data_dir, 'predictions.tsv')
gos_path = os.path.join(data_dir, 'gos.tsv')

In [3]:
predictions = pd.read_csv(predictions_path, sep='\t')
predictions.head()

Unnamed: 0,taxid1,taxid1_label,source_color,source_shape,source,source_name,taxid2,taxid2_label,target_color,target_shape,target,target_name,experimental_evidence_score,databases_evidence_score,weight,group1,group2,edge_type
0,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000339740,CAMK2D,0.0,0.77,0.385,KOG0039,KOG0033,inter-species
1,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000362057,NOX1,0.0,0.77,0.385,KOG0039,KOG0033,inter-species
2,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000475084,DUOX2,0.0,0.77,0.385,KOG0039,KOG0033,inter-species
3,5671,Leishmania infantum,#e31a1c,diamond,5671.XP_001467017.1,XP_001467017.1,9606,Homo sapiens,#525252,dot,9606.ENSP00000339740,CAMK2D,0.994,0.0,0.497,KOG0078,KOG0033,inter-species
4,5671,Leishmania infantum,#e31a1c,diamond,5671.XP_001467017.1,XP_001467017.1,9606,Homo sapiens,#525252,dot,9606.ENSP00000300935,RAB8A,0.994,0.0,0.497,KOG0078,KOG0033,inter-species


In [4]:
gos = pd.read_csv(gos_path, sep='\t')
gos.head()

Unnamed: 0,#string_protein_id,description,taxid
0,9606.ENSP00000000233,Transport,9606
1,9606.ENSP00000000233,Intracellular protein transport,9606
2,9606.ENSP00000000233,"Retrograde vesicle-mediated transport, golgi t...",9606
3,9606.ENSP00000000233,Protein localization,9606
4,9606.ENSP00000000233,Cellular process,9606


In [5]:
net = predictions[(predictions['taxid1'].isin([5691, 9606])) & (predictions['weight'] >=0.4)]
net.head()

Unnamed: 0,taxid1,taxid1_label,source_color,source_shape,source,source_name,taxid2,taxid2_label,target_color,target_shape,target,target_name,experimental_evidence_score,databases_evidence_score,weight,group1,group2,edge_type
74,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000263025,MAPK3,0.354,0.866,0.61,KOG0039,KOG0660,inter-species
75,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000215832,MAPK1,0.354,0.866,0.61,KOG0039,KOG0660,inter-species
109,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000315768,STAT2,0.259,0.866,0.5625,KOG0039,KOG3667,inter-species
110,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000264657,STAT3,0.259,0.866,0.5625,KOG0039,KOG3667,inter-species
121,5691,Trypanosoma brucei,#bc80bd,diamond,5691.EAN79407,EAN79407,9606,Homo sapiens,#525252,dot,9606.ENSP00000360293,SORBS1,0.467,0.866,0.6665,KOG0039,KOG4225,inter-species


In [6]:
net_gos = gos[gos['taxid'].isin([5691, 9606])].groupby('description').filter(lambda x: len(x)< 500)
net_gos.head()

Unnamed: 0,#string_protein_id,description,taxid
2,9606.ENSP00000000233,"Retrograde vesicle-mediated transport, golgi t...",9606
13,9606.ENSP00000000233,Golgi vesicle transport,9606
21,9606.ENSP00000000412,Peptide secretion,9606
22,9606.ENSP00000000412,Protein targeting,9606
23,9606.ENSP00000000412,Protein targeting to lysosome,9606


In [7]:
nodes = net['source'].unique().tolist() + net['target'].unique().tolist()

In [8]:
total_nodes = len(nodes)

In [9]:
#C'
total_nodes

274

In [10]:
selected_gos = net_gos[net_gos['#string_protein_id'].isin(nodes)].groupby('description').filter(lambda x: len(x)> 10)['description'].unique().tolist()

In [11]:
len(selected_gos)

280

In [12]:
total_prots = len(net_gos['#string_protein_id'].unique().tolist())

In [13]:
#G
total_prots

17858

In [15]:
enrichment = []
for term in selected_gos:   
    members = net_gos[(net_gos['description'] == term)]['#string_protein_id']
    #E
    total_members = len(members)
    net_members = net_gos[(net_gos['description'] == term) & (net_gos['#string_protein_id'].isin(nodes))]['#string_protein_id']
    #A
    total_net_members = len(net_members)
    
    odd_ratio, p_value = stats.fisher_exact([[total_net_members, total_nodes - total_net_members],
                                             [total_members - total_net_members, total_prots - total_members - total_nodes - total_net_members]])
    enrichment.append([term, total_net_members, total_nodes - total_net_members, total_members - total_net_members,  total_prots - total_members - total_nodes - total_net_members, p_value, odd_ratio, ','.join(net_members)])

In [16]:
enrichment = pd.DataFrame(enrichment, columns=['go_term', 'A', 'B', 'C', 'D', 'p_value', 'odds', 'nodes'])
enrichment['fdr_bh'] = multipletests(enrichment['p_value'].tolist(), alpha=0.01, method='fdr_bh')[1]
enrichment.head()

Unnamed: 0,go_term,A,B,C,D,p_value,odds,nodes,fdr_bh
0,Eye development,15,259,350,17204,0.000553,2.846773,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.000711
1,Regulation of cellular component size,19,255,376,17170,1.2e-05,3.402482,"9606.ENSP00000005226,9606.ENSP00000229264,9606...",2.4e-05
2,Camera-type eye development,12,262,306,17254,0.003881,2.582548,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.004436
3,Embryonic organ development,13,261,437,17121,0.029363,1.951419,"9606.ENSP00000005226,9606.ENSP00000215832,9606...",0.030678
4,Sensory system development,15,259,356,17198,0.000654,2.797818,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.000825


In [17]:
enrichment.shape

(280, 9)

In [18]:
enrichment[enrichment['fdr_bh']<0.01]

Unnamed: 0,go_term,A,B,C,D,p_value,odds,nodes,fdr_bh
0,Eye development,15,259,350,17204,0.000553,2.846773,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.000711
1,Regulation of cellular component size,19,255,376,17170,0.000012,3.402482,"9606.ENSP00000005226,9606.ENSP00000229264,9606...",0.000024
2,Camera-type eye development,12,262,306,17254,0.003881,2.582548,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.004436
4,Sensory system development,15,259,356,17198,0.000654,2.797818,"9606.ENSP00000005226,9606.ENSP00000232461,9606...",0.000825
5,Sensory organ morphogenesis,11,263,253,17309,0.002658,2.861465,"9606.ENSP00000005226,9606.ENSP00000215832,9606...",0.003075
...,...,...,...,...,...,...,...,...,...
272,Regulation of reactive oxygen species metaboli...,12,262,176,17384,0.000035,4.523942,"9606.ENSP00000264657,9606.ENSP00000275493,9606...",0.000061
273,Response to insulin,11,263,221,17341,0.000956,3.281851,"9606.ENSP00000264710,9606.ENSP00000278568,9606...",0.001174
274,Cellular response to insulin stimulus,11,263,159,17403,0.000066,4.577875,"9606.ENSP00000264710,9606.ENSP00000278568,9606...",0.000107
278,Regulation of dendrite development,11,263,145,17417,0.000030,5.023915,"9606.ENSP00000286827,9606.ENSP00000292385,9606...",0.000053


In [23]:
px.scatter(enrichment[enrichment['fdr_bh']<0.01], x='fdr_bh', y='odds', size='odds', color='go_term')

In [28]:
fig = px.treemap(enrichment[enrichment['fdr_bh']<0.01], path=['go_term', 'nodes'], values='odds', width=2000, height=1200)

In [29]:
fig