In [82]:
import json
import gzip
import copy


from scipy.stats import fisher_exact
import numpy as np
import pandas as pd

In [88]:
# Parse the gene ontology
with gzip.open("data/go.json.gz") as f:
    ontology = json.load(f)

parents = {}  # { term : list_of_parent_terms }
for edge in ontology["graphs"][0]["edges"]:
    # select only is_a edges
    if edge["pred"] == "is_a":
        parents.setdefault(edge["sub"].split("_")[1], []).append(edge["obj"].split("_")[1])

nodes = []  # list of terms
labels = {}  # { term : definition }
for node in ontology["graphs"][0]["nodes"]:
    # exclude obsolete terms
    if "GO_" in node["id"] and "deprecated" not in node["meta"]:
        nodes.append(node["id"].split("_")[1])
        labels[node["id"].split("_")[1]] = node["lbl"]

print("Total nodes {}\nparsed nodes {}\nnodes with parents {}\n".format(len(ontology["graphs"][0]["nodes"]), len(nodes), len(parents)))

Total nodes 50038
parsed nodes 44650
nodes with parents 44647



In [89]:
roots = set(nodes) - set(parents.keys())
print("Roots: {}\n".format([(r, labels[r]) for r in roots]))

Roots: [('0003674', 'molecular_function'), ('0005575', 'cellular_component'), ('0008150', 'biological_process')]



In [90]:
# Build an ancestors dictionary
ancestors = {}  # { term : list_of_ancestor_terms }
for node in nodes:
    node_ancestors = []
    node_parents = parents.get(node)
    # Loop parent levels until no more parents
    while node_parents:
        node_ancestors.extend(node_parents)
        # Get the parents of current parents (1 level up)
        node_parents = [term for parent in node_parents for term in parents.get(parent, [])]
    ancestors[node] = node_ancestors

In [91]:
# *** Calculate the minimum depth (distance from the root) of each term
depth = {}  # { term : min_depth }
for node in nodes:
    c = 0  # Depth level
    node_parents = parents.get(node)
    while node_parents:
        c += 1
        if roots.intersection(set(node_parents)):  # break the loop if the root is among parents
            break
        # Get the parents of current parents (1 level up)
        node_parents = [term for parent in node_parents for term in parents.get(parent, [])]
    depth[node] = c

In [93]:
depth["0003674"]

0

In [6]:
def gen_block(f):
    """
    Parse and split the input.
    The input must be sorted by target name, second column.

    UniProtKB       A0A024R1R8      hCG_2014768             GO:0002181      PMID:21873635   IBA     PANTHER:PTN002008372|SGD:S000007246     P       HCG2014768, isoform CRA_a       hCG_2014768     protein taxon:9606      20171102        GO_Central
    UniProtKB       A0A024RBG1      NUDT4B          GO:0003723      GO_REF:0000037  IEA     UniProtKB-KW:KW-0694    F       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20191109        UniProt
    UniProtKB       A0A024RBG1      NUDT4B          GO:0005829      GO_REF:0000052  IDA             C       Diphosphoinositol polyphosphate phosphohydrolase NUDT4B NUDT4B  protein taxon:9606      20161204        HPA
    """
    name, old_name = None, None
    chunk = []
    for line in f:
        line = line.decode()
        if line and line[0] != "!":
            _, name, _, _, term, _, ec, _, namespace, protein_name = line.split("\t")[:10]
            term = term[3:]  # remove "GO:" from the term ID
            if name != old_name and old_name:
                yield (old_name, set(chunk))  # return a set as there can be repetitions, i.e. the same term with different evidence codes
                chunk = []
            old_name = name
            chunk.append(term)
    # Last line
    if old_name:
        yield (old_name, set(chunk))

## create a dictionary with all the annotations

In [7]:
protein_to_go = {}  # { protein_id : (GO terms) }
with gzip.open("data/goa_human.gaf.gz") as f:
    for acc, annotations in gen_block(f):
        protein_to_go[acc] = annotations

In [8]:
protein_to_go

{'A0A024R1R8': {'0002181'},
 'A0A024RBG1': {'0003723',
  '0005829',
  '0008486',
  '0046872',
  '0052840',
  '0052842'},
 'A0A075B6H5': {'0005886', '0007166'},
 'A0A075B6H7': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6H8': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6H9': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I0': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I1': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I3': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I4': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I6': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A0A075B6I7': {'0002250',
  '0002377',
  '0005615',
  '0005886',
  '0006955',
  '0019814'},
 'A

In [9]:
for k in protein_to_go:
    print(k)

A0A024R1R8
A0A024RBG1
A0A075B6H5
A0A075B6H7
A0A075B6H8
A0A075B6H9
A0A075B6I0
A0A075B6I1
A0A075B6I3
A0A075B6I4
A0A075B6I6
A0A075B6I7
A0A075B6I9
A0A075B6J1
A0A075B6J2
A0A075B6J6
A0A075B6J9
A0A075B6K0
A0A075B6K2
A0A075B6K4
A0A075B6K5
A0A075B6K6
A0A075B6L2
A0A075B6L6
A0A075B6N1
A0A075B6N2
A0A075B6N3
A0A075B6N4
A0A075B6P5
A0A075B6Q5
A0A075B6R0
A0A075B6R2
A0A075B6R9
A0A075B6S0
A0A075B6S2
A0A075B6S4
A0A075B6S5
A0A075B6S6
A0A075B6S9
A0A075B6T6
A0A075B6T7
A0A075B6T8
A0A075B6U4
A0A075B6V5
A0A075B6W5
A0A075B6X5
A0A075B6Y3
A0A075B6Y9
A0A075B706
A0A075B734
A0A075B759
A0A075B767
A0A075B7B6
A0A075B7B8
A0A075B7D0
A0A075B7D4
A0A075B7D8
A0A075B7E8
A0A075B7F0
A0A075B7F1
A0A087WSX0
A0A087WSY4
A0A087WSY6
A0A087WSZ0
A0A087WSZ9
A0A087WT01
A0A087WT02
A0A087WT03
A0A087WTH1
A0A087WTH5
A0A087WUL8
A0A087WUV0
A0A087WV62
A0A087WVF3
A0A087WVI0
A0A087WVP7
A0A087WW49
A0A087WW87
A0A087WWP6
A0A087WX78
A0A087WXM9
A0A087WXS9
A0A087WYE8
A0A087WZ39
A0A087X0K7
A0A087X0M5
A0A087X179
A0A087X1C1
A0A087X1C5
A0A087X1G2
A0A087X1L8

O14810
O14813
O14815
O14817
O14818
O14827
O14828
O14829
O14830
O14832
O14836
O14841
O14842
O14843
O14862
O14863
O14867
O14874
O14879
O14880
O14893
O14894
O14896
O14901
O14904
O14905
O14907
O14908
O14910
O14917
O14920
O14921
O14924
O14925
O14926
O14929
O14931
O14933
O14936
O14939
O14944
O14948
O14949
O14950
O14957
O14958
O14960
O14964
O14965
O14966
O14967
O14972
O14974
O14975
O14976
O14977
O14978
O14979
O14980
O14981
O14983
O14986
O14990
O14994
O15013
O15014
O15015
O15016
O15018
O15020
O15021
O15027
O15031
O15033
O15034
O15037
O15040
O15041
O15042
O15047
O15049
O15054
O15055
O15056
O15060
O15061
O15062
O15066
O15067
O15068
O15069
O15072
O15075
O15078
O15079
O15083
O15084
O15085
O15090
O15091
O15105
O15111
O15116
O15117
O15118
O15119
O15120
O15121
O15123
O15126
O15127
O15130
O15131
O15143
O15144
O15145
O15146
O15151
O15155
O15156
O15160
O15162
O15164
O15165
O15169
O15173
O15178
O15182
O15194
O15195
O15197
O15198
O15204
O15205
O15209
O15211
O15212
O15213
O15217
O15218
O15225
O15226
O15228

O96009
O96011
O96013
O96014
O96015
O96017
O96018
O96019
O96020
O96024
O96028
O97980
P00156
P00167
P00325
P00326
P00338
P00352
P00367
P00374
P00387
P00390
P00395
P00403
P00414
P00439
P00441
P00450
P00451
P00480
P00488
P00491
P00492
P00505
P00519
P00533
P00540
P00558
P00568
P00709
P00734
P00736
P00738
P00739
P00740
P00742
P00746
P00747
P00748
P00749
P00750
P00751
P00797
P00813
P00846
P00915
P00918
P00966
P00973
P00995
P01008
P01009
P01011
P01019
P01023
P01024
P01031
P01033
P01034
P01036
P01037
P01040
P01042
P01100
P01106
P01111
P01112
P01116
P01127
P01130
P01133
P01135
P01137
P01138
P01148
P01160
P01178
P01185
P01189
P01210
P01213
P01215
P01222
P01225
P01229
P01236
P01241
P01242
P01258
P01266
P01270
P01275
P01282
P01286
P01298
P01303
P01308
P01344
P01350
P01374
P01375
P01562
P01563
P01566
P01567
P01568
P01569
P01570
P01571
P01574
P01579
P01583
P01584
P01588
P01589
P01591
P01593
P01594
P01597
P01599
P01601
P01602
P01611
P01614
P01615
P01619
P01624
P01699
P01700
P01701
P01703
P01704
P01705

P41597
P41732
P41743
P41968
P41970
P42025
P42081
P42126
P42127
P42166
P42224
P42226
P42229
P42261
P42262
P42263
P42285
P42330
P42331
P42336
P42338
P42345
P42356
P42357
P42566
P42568
P42574
P42575
P42658
P42677
P42679
P42680
P42681
P42684
P42685
P42694
P42695
P42696
P42701
P42702
P42704
P42765
P42766
P42768
P42771
P42772
P42773
P42785
P42830
P42857
P42858
P42892
P42898
P43003
P43004
P43005
P43007
P43026
P43034
P43080
P43088
P43115
P43116
P43119
P43121
P43146
P43155
P43166
P43220
P43234
P43235
P43243
P43246
P43250
P43251
P43268
P43304
P43307
P43308
P43320
P43351
P43353
P43354
P43355
P43356
P43357
P43358
P43359
P43360
P43361
P43362
P43363
P43364
P43365
P43378
P43403
P43405
P43487
P43489
P43490
P43626
P43627
P43628
P43629
P43630
P43631
P43632
P43652
P43657
P43681
P43686
P43694
P43699
P43897
P45378
P45379
P45381
P45452
P45844
P45877
P45880
P45954
P45973
P45974
P45983
P45984
P45985
P46013
P46019
P46020
P46059
P46060
P46063
P46087
P46089
P46091
P46092
P46093
P46094
P46095
P46098
P46100
P46108

Q13275
Q13277
Q13278
Q13283
Q13285
Q13286
Q13287
Q13291
Q13296
Q13303
Q13304
Q13308
Q13309
Q13310
Q13315
Q13316
Q13319
Q13322
Q13323
Q13324
Q13325
Q13326
Q13330
Q13332
Q13336
Q13342
Q13347
Q13349
Q13351
Q13352
Q13356
Q13360
Q13361
Q13362
Q13363
Q13367
Q13368
Q13370
Q13371
Q13387
Q13393
Q13394
Q13395
Q13398
Q13401
Q13402
Q13404
Q13405
Q13409
Q13410
Q13415
Q13416
Q13418
Q13421
Q13422
Q13423
Q13424
Q13425
Q13426
Q13427
Q13428
Q13432
Q13433
Q13434
Q13435
Q13438
Q13439
Q13442
Q13443
Q13444
Q13445
Q13449
Q13451
Q13454
Q13459
Q13461
Q13464
Q13467
Q13469
Q13470
Q13472
Q13474
Q13477
Q13478
Q13480
Q13485
Q13487
Q13488
Q13489
Q13490
Q13491
Q13492
Q13495
Q13496
Q13501
Q13503
Q13505
Q13506
Q13507
Q13508
Q13509
Q13510
Q13515
Q13516
Q13519
Q13520
Q13522
Q13523
Q13526
Q13530
Q13535
Q13536
Q13541
Q13542
Q13546
Q13547
Q13554
Q13555
Q13557
Q13561
Q13562
Q13563
Q13564
Q13568
Q13569
Q13571
Q13572
Q13573
Q13574
Q13576
Q13585
Q13586
Q13588
Q13591
Q13595
Q13596
Q13601
Q13606
Q13607
Q13608
Q13609
Q13610
Q13613

Q6DN72
Q6DN90
Q6DRA6
Q6DT37
Q6DWJ6
Q6E0U4
Q6E213
Q6EBC2
Q6ECI4
Q6EIG7
Q6EKJ0
Q6EMB2
Q6EMK4
Q6F5E8
Q6FHJ7
Q6FI13
Q6FI81
Q6FIF0
Q6GMR7
Q6GMV1
Q6GMV2
Q6GMV3
Q6GPH4
Q6GPH6
Q6GPI1
Q6GQQ9
Q6GTS8
Q6GTX8
Q6GV28
Q6GYQ0
Q6H3X3
Q6H8Q1
Q6H9L7
Q6HA08
Q6I9Y2
Q6IA17
Q6IA69
Q6IA86
Q6IAA8
Q6IAN0
Q6IB77
Q6IBS0
Q6IBW4
Q6IC98
Q6ICB0
Q6ICB4
Q6ICG6
Q6ICG8
Q6ICH7
Q6ICI0
Q6ICL3
Q6ICL7
Q6IE36
Q6IE37
Q6IE38
Q6IE81
Q6IED9
Q6IEE7
Q6IEE8
Q6IEG0
Q6IEU7
Q6IEV9
Q6IEY1
Q6IEZ7
Q6IF00
Q6IF36
Q6IF42
Q6IF63
Q6IF82
Q6IF99
Q6IFG1
Q6IFH4
Q6IFN5
Q6IMI4
Q6IMI6
Q6IMN6
Q6IN84
Q6IN85
Q6IN97
Q6IPM2
Q6IPR1
Q6IPR3
Q6IPT4
Q6IPU0
Q6IPX1
Q6IPX3
Q6IQ16
Q6IQ19
Q6IQ20
Q6IQ21
Q6IQ22
Q6IQ23
Q6IQ26
Q6IQ32
Q6IQ49
Q6IQ55
Q6IS14
Q6IS24
Q6ISB3
Q6ISS4
Q6ISU1
Q6IV72
Q6IWH7
Q6J272
Q6J4K2
Q6J9G0
Q6JBY9
Q6JEL2
Q6JQN1
Q6JVE5
Q6JVE6
Q6JVE9
Q6K0P9
Q6KB66
Q6KC79
Q6KCM7
Q6KF10
Q6L8G4
Q6L8G5
Q6L8G8
Q6L8G9
Q6L8H1
Q6L8H2
Q6L8H4
Q6L8Q7
Q6L9T8
Q6L9W6
Q6MZM0
Q6MZM9
Q6MZP7
Q6MZQ0
Q6MZT1
Q6MZW2
Q6MZZ7
Q6N021
Q6N022
Q6N043
Q6N063
Q6N069
Q6N075
Q6NS38
Q6NSI1
Q6NSI3

Q86SG2
Q86SG3
Q86SG5
Q86SG6
Q86SG7
Q86SH2
Q86SH4
Q86SI9
Q86SJ2
Q86SJ6
Q86SK9
Q86SM5
Q86SM8
Q86SP6
Q86SQ0
Q86SQ3
Q86SQ4
Q86SQ6
Q86SQ7
Q86SQ9
Q86SR1
Q86SS6
Q86SU0
Q86SX3
Q86SX6
Q86SZ2
Q86T03
Q86T13
Q86T20
Q86T23
Q86T24
Q86T26
Q86T29
Q86T65
Q86T75
Q86T82
Q86T90
Q86T96
Q86TA1
Q86TB3
Q86TB9
Q86TC9
Q86TD4
Q86TE4
Q86TG1
Q86TG7
Q86TH1
Q86TI0
Q86TI2
Q86TJ2
Q86TJ5
Q86TL0
Q86TL2
Q86TM3
Q86TM6
Q86TN4
Q86TP1
Q86TS9
Q86TU7
Q86TV6
Q86TW2
Q86TX2
Q86TY3
Q86U02
Q86U06
Q86U10
Q86U17
Q86U28
Q86U38
Q86U42
Q86U44
Q86U70
Q86U86
Q86U90
Q86UA1
Q86UA6
Q86UB2
Q86UB9
Q86UC2
Q86UD0
Q86UD3
Q86UD4
Q86UD5
Q86UD7
Q86UE3
Q86UE4
Q86UE6
Q86UE8
Q86UF1
Q86UF2
Q86UG4
Q86UK0
Q86UK5
Q86UK7
Q86UL3
Q86UL8
Q86UN2
Q86UN3
Q86UN6
Q86UP0
Q86UP2
Q86UP3
Q86UP6
Q86UP8
Q86UP9
Q86UQ0
Q86UQ4
Q86UQ5
Q86UQ8
Q86UR1
Q86UR5
Q86US8
Q86UT5
Q86UT6
Q86UT8
Q86UU0
Q86UU1
Q86UU5
Q86UU9
Q86UV5
Q86UV6
Q86UV7
Q86UW1
Q86UW2
Q86UW6
Q86UW7
Q86UW8
Q86UW9
Q86UX2
Q86UX6
Q86UX7
Q86UY5
Q86UY6
Q86UY8
Q86UZ6
Q86V15
Q86V20
Q86V21
Q86V24
Q86V25
Q86V35
Q86V40
Q86V42

Q8NC56
Q8NC60
Q8NC67
Q8NC69
Q8NC74
Q8NC96
Q8NCA5
Q8NCA9
Q8NCB2
Q8NCC3
Q8NCC5
Q8NCD3
Q8NCE0
Q8NCE2
Q8NCF0
Q8NCF5
Q8NCG5
Q8NCG7
Q8NCH0
Q8NCI6
Q8NCK3
Q8NCK7
Q8NCL4
Q8NCL8
Q8NCL9
Q8NCM2
Q8NCM8
Q8NCN2
Q8NCN4
Q8NCN5
Q8NCP5
Q8NCQ3
Q8NCQ5
Q8NCQ7
Q8NCR0
Q8NCR3
Q8NCR6
Q8NCR9
Q8NCS4
Q8NCS7
Q8NCT1
Q8NCU1
Q8NCU7
Q8NCU8
Q8NCV1
Q8NCW0
Q8NCW5
Q8NCW6
Q8NCX0
Q8NCY6
Q8ND04
Q8ND07
Q8ND23
Q8ND24
Q8ND25
Q8ND30
Q8ND56
Q8ND61
Q8ND71
Q8ND76
Q8ND82
Q8ND83
Q8ND90
Q8ND94
Q8NDA2
Q8NDB2
Q8NDB6
Q8NDC0
Q8NDC4
Q8NDD1
Q8NDF8
Q8NDG6
Q8NDH3
Q8NDH6
Q8NDI1
Q8NDL9
Q8NDM7
Q8NDN9
Q8NDP4
Q8NDQ6
Q8NDT2
Q8NDV1
Q8NDV2
Q8NDV3
Q8NDV7
Q8NDW4
Q8NDW8
Q8NDX1
Q8NDX2
Q8NDX5
Q8NDX6
Q8NDX9
Q8NDY3
Q8NDY4
Q8NDY6
Q8NDY8
Q8NDZ2
Q8NDZ4
Q8NDZ6
Q8NE00
Q8NE01
Q8NE09
Q8NE18
Q8NE22
Q8NE28
Q8NE31
Q8NE35
Q8NE62
Q8NE63
Q8NE65
Q8NE71
Q8NE79
Q8NE86
Q8NEA5
Q8NEA6
Q8NEA9
Q8NEB5
Q8NEB7
Q8NEB9
Q8NEC5
Q8NEC7
Q8NEE6
Q8NEF3
Q8NEF9
Q8NEG0
Q8NEG5
Q8NEG7
Q8NEH6
Q8NEJ0
Q8NEJ9
Q8NEK5
Q8NEK8
Q8NEL9
Q8NEM0
Q8NEM1
Q8NEM2
Q8NEM7
Q8NEM8
Q8NEN0
Q8NEN9
Q8NEP3
Q8NEP7
Q8NEP9

Q92879
Q92882
Q92886
Q92887
Q92888
Q92889
Q92890
Q92896
Q92900
Q92901
Q92902
Q92903
Q92904
Q92905
Q92908
Q92911
Q92913
Q92914
Q92915
Q92917
Q92918
Q92922
Q92925
Q92928
Q92930
Q92932
Q92934
Q92935
Q92945
Q92947
Q92949
Q92952
Q92953
Q92954
Q92956
Q92959
Q92963
Q92966
Q92968
Q92973
Q92974
Q92979
Q92982
Q92985
Q92988
Q92989
Q92990
Q92993
Q92994
Q92995
Q92997
Q93008
Q93009
Q93015
Q93033
Q93034
Q93038
Q93045
Q93050
Q93052
Q93062
Q93063
Q93070
Q93073
Q93074
Q93075
Q93077
Q93079
Q93083
Q93084
Q93086
Q93088
Q93091
Q93096
Q93097
Q93098
Q93099
Q93100
Q95460
Q969D9
Q969E1
Q969E2
Q969E3
Q969E4
Q969E8
Q969F0
Q969F1
Q969F2
Q969F8
Q969F9
Q969G2
Q969G3
Q969G5
Q969G6
Q969G9
Q969H0
Q969H4
Q969H6
Q969H8
Q969I3
Q969I6
Q969J2
Q969J3
Q969J5
Q969K3
Q969K4
Q969K7
Q969L2
Q969L4
Q969M1
Q969M2
Q969M3
Q969M7
Q969N2
Q969N4
Q969P0
Q969P5
Q969P6
Q969Q0
Q969Q1
Q969Q4
Q969Q5
Q969Q6
Q969R2
Q969R5
Q969R8
Q969S0
Q969S2
Q969S3
Q969S6
Q969S8
Q969S9
Q969T3
Q969T4
Q969T7
Q969T9
Q969U6
Q969U7
Q969V1
Q969V3
Q969V4
Q969V5
Q969V6

Q96RJ0
Q96RJ3
Q96RJ6
Q96RK0
Q96RK1
Q96RK4
Q96RL1
Q96RL6
Q96RL7
Q96RM1
Q96RN1
Q96RN5
Q96RP3
Q96RP7
Q96RP8
Q96RP9
Q96RQ1
Q96RQ3
Q96RQ9
Q96RR1
Q96RR4
Q96RS0
Q96RS6
Q96RT1
Q96RT6
Q96RT7
Q96RT8
Q96RU2
Q96RU3
Q96RU7
Q96RU8
Q96RV3
Q96RW7
Q96RY5
Q96RY7
Q96S06
Q96S15
Q96S16
Q96S21
Q96S37
Q96S38
Q96S42
Q96S44
Q96S52
Q96S53
Q96S55
Q96S59
Q96S65
Q96S66
Q96S79
Q96S82
Q96S86
Q96S90
Q96S94
Q96S95
Q96S96
Q96S97
Q96S99
Q96SA4
Q96SB3
Q96SB4
Q96SB8
Q96SC8
Q96SD1
Q96SE0
Q96SE7
Q96SF2
Q96SF7
Q96SI1
Q96SI9
Q96SJ8
Q96SK2
Q96SK3
Q96SL1
Q96SL4
Q96SL8
Q96SM3
Q96SN7
Q96SN8
Q96SQ5
Q96SQ7
Q96SQ9
Q96SR6
Q96ST2
Q96ST3
Q96ST8
Q96SU4
Q96SW2
Q96SY0
Q96SZ4
Q96SZ5
Q96SZ6
Q96T17
Q96T21
Q96T23
Q96T25
Q96T37
Q96T49
Q96T51
Q96T52
Q96T53
Q96T54
Q96T55
Q96T58
Q96T60
Q96T66
Q96T68
Q96T75
Q96T76
Q96T83
Q96T88
Q96T91
Q96T92
Q96TA0
Q96TA1
Q96TA2
Q96TC7
Q99062
Q99075
Q99081
Q99102
Q99217
Q99218
Q99250
Q99259
Q99417
Q99418
Q99424
Q99426
Q99435
Q99436
Q99437
Q99439
Q99440
Q99442
Q99445
Q99447
Q99453
Q99456
Q99457
Q99459
Q99460
Q99463

Q9H3S4
Q9H3S5
Q9H3S7
Q9H3T2
Q9H3T3
Q9H3U1
Q9H3U5
Q9H3U7
Q9H3V2
Q9H3W5
Q9H3Y0
Q9H3Y6
Q9H3Y8
Q9H3Z4
Q9H3Z7
Q9H400
Q9H410
Q9H422
Q9H425
Q9H426
Q9H427
Q9H444
Q9H446
Q9H461
Q9H467
Q9H469
Q9H477
Q9H478
Q9H479
Q9H488
Q9H489
Q9H490
Q9H492
Q9H497
Q9H4A3
Q9H4A4
Q9H4A5
Q9H4A6
Q9H4A9
Q9H4B0
Q9H4B4
Q9H4B6
Q9H4B7
Q9H4B8
Q9H4D0
Q9H4D5
Q9H4E5
Q9H4E7
Q9H4F1
Q9H4F8
Q9H4G0
Q9H4G1
Q9H4G4
Q9H4G8
Q9H4H8
Q9H4I0
Q9H4I2
Q9H4I8
Q9H4I9
Q9H4K1
Q9H4K7
Q9H4L4
Q9H4L5
Q9H4L7
Q9H4M3
Q9H4M7
Q9H4M9
Q9H4P4
Q9H4Q3
Q9H4Q4
Q9H4R4
Q9H4S2
Q9H4T2
Q9H4W6
Q9H4X1
Q9H4Y5
Q9H4Z2
Q9H4Z3
Q9H501
Q9H503
Q9H511
Q9H521
Q9H553
Q9H568
Q9H582
Q9H583
Q9H596
Q9H598
Q9H5F2
Q9H5H4
Q9H5I1
Q9H5I5
Q9H5J0
Q9H5J4
Q9H5J8
Q9H5K3
Q9H5L6
Q9H5N1
Q9H5P4
Q9H5Q4
Q9H5U6
Q9H5V7
Q9H5V8
Q9H5V9
Q9H5X1
Q9H5Y7
Q9H5Z1
Q9H5Z6
Q9H609
Q9H611
Q9H628
Q9H633
Q9H649
Q9H665
Q9H668
Q9H672
Q9H694
Q9H6A0
Q9H6A9
Q9H6B1
Q9H6B4
Q9H6B9
Q9H6D3
Q9H6D7
Q9H6D8
Q9H6E4
Q9H6E5
Q9H6F2
Q9H6F5
Q9H6H4
Q9H6I2
Q9H6J7
Q9H6K1
Q9H6K4
Q9H6L2
Q9H6L4
Q9H6L5
Q9H6N6
Q9H6P5
Q9H6Q3
Q9H6Q4
Q9H6R0
Q9H6R3

Q9UKI8
Q9UKI9
Q9UKJ0
Q9UKJ1
Q9UKJ3
Q9UKJ5
Q9UKJ8
Q9UKK3
Q9UKK6
Q9UKK9
Q9UKL0
Q9UKL2
Q9UKL3
Q9UKL4
Q9UKL6
Q9UKM7
Q9UKM9
Q9UKN1
Q9UKN5
Q9UKN7
Q9UKN8
Q9UKP3
Q9UKP4
Q9UKP5
Q9UKP6
Q9UKQ2
Q9UKQ9
Q9UKR0
Q9UKR3
Q9UKR5
Q9UKR8
Q9UKS6
Q9UKS7
Q9UKT4
Q9UKT5
Q9UKT6
Q9UKT7
Q9UKT8
Q9UKT9
Q9UKU0
Q9UKU6
Q9UKU7
Q9UKU9
Q9UKV0
Q9UKV3
Q9UKV5
Q9UKV8
Q9UKW4
Q9UKW6
Q9UKX2
Q9UKX3
Q9UKX5
Q9UKX7
Q9UKY0
Q9UKY1
Q9UKY3
Q9UKY4
Q9UKY7
Q9UKZ1
Q9UKZ4
Q9UKZ9
Q9UL01
Q9UL03
Q9UL12
Q9UL15
Q9UL16
Q9UL17
Q9UL18
Q9UL19
Q9UL25
Q9UL26
Q9UL33
Q9UL36
Q9UL40
Q9UL41
Q9UL42
Q9UL45
Q9UL46
Q9UL49
Q9UL51
Q9UL52
Q9UL54
Q9UL58
Q9UL59
Q9UL62
Q9UL63
Q9UL68
Q9ULA0
Q9ULB1
Q9ULB4
Q9ULB5
Q9ULC0
Q9ULC3
Q9ULC4
Q9ULC5
Q9ULC6
Q9ULC8
Q9ULD0
Q9ULD2
Q9ULD4
Q9ULD5
Q9ULD6
Q9ULD8
Q9ULD9
Q9ULE0
Q9ULE3
Q9ULE6
Q9ULF5
Q9ULG1
Q9ULG6
Q9ULH0
Q9ULH1
Q9ULH4
Q9ULH7
Q9ULI0
Q9ULI2
Q9ULI3
Q9ULI4
Q9ULJ1
Q9ULJ3
Q9ULJ6
Q9ULJ7
Q9ULJ8
Q9ULK0
Q9ULK2
Q9ULK4
Q9ULK5
Q9ULK6
Q9ULL0
Q9ULL1
Q9ULL4
Q9ULL5
Q9ULL8
Q9ULM0
Q9ULM2
Q9ULM3
Q9ULM6
Q9ULP0
Q9ULP9
Q9ULQ0
Q9ULQ1
Q9ULR0
Q9ULR3

In [17]:
with open("data/original.txt") as f:
    dataset = f.read().splitlines() 

In [18]:
with open("data/swiss-human-id.txt") as f:
    background_dataset = f.read().splitlines() 

## lets check if all the proteins in the dataset are annotated

In [19]:
len(background_dataset)

20367

In [20]:
len(dataset)

92

In [21]:
len(set(protein_to_go.keys()))

19473

In [22]:
len(set(background_dataset).intersection(set(dataset)))

92

In [23]:
len(set(protein_to_go.keys()).intersection(set(dataset)))

91

## orko can one is missing 

# WHO?

In [24]:
set(dataset) - set(protein_to_go.keys()).intersection(set(dataset))

{'Q8TC17'}

## thiz fucker

<br>
<br>
<br>
<br>




# Enrichment

In [55]:
def count_ancestors(protein_list):
    counts = {}

    for protein in set(protein_list).intersection(set(protein_to_go.keys())):
        annotations = protein_to_go[protein]

        terms_ancestors = copy.copy(annotations)  # annotations + ancestor terms
        for term in annotations:  # directly annotated terms
            terms_ancestors.update(set(ancestors.get(term, [])))  # add ancestors
        for term in terms_ancestors:
            counts.setdefault(term, 0)
            counts[term] += 1
        
    return counts

In [61]:
dataset_count = count_ancestors(dataset)
background_count = count_ancestors(background_dataset)

In [62]:
print(len(dataset_count), len(background_count), len(set(dataset_count.keys()).intersection(set(background_count.keys()))))

2591 22298 2591


In [73]:
test = {"a":1, "b":10, "c":5}

In [74]:
np.sum(list(test.values()))

16

In [107]:
# imput: two dict of the type {GO_term: count} and a dict of depth
def fisher_test(d_count, bg_count, depth):
    
    # Init result dict
    results = {}
    
    # Get the tot number of counts
    tot_d = np.sum(list(d_count.values()))
    tot_bg = np.sum(list(bg_count.values()))
    
    
    key_intersection = set(d_count.keys()).intersection(set(bg_count.keys()))
    
    for key in key_intersection:
        ### 1. Set frequencies
        # Number of occurrences of the specific GO term in d_count   
        a = d_count[key]
        # Number of occurrences of the specific GO term in bg_count
        b = bg_count[key]
        # Number of GO terms that are different from the specific one in d_count
        not_a = tot_d - a
        # Number of GO terms that are different from the specific one in bg_count
        not_b = tot_bg - b
        # 2. Perform Fisher Exact Test
        fisher_results = fisher_exact([[a, b],[not_a, not_b]])
        # 3. Save results
        results.setdefault(key, {'OddRatio': fisher_results[0], 'p-value': fisher_results[1],
                                'depth': depth[key]})
    
    # Return the DataFrame
    return pd.DataFrame(results).transpose()

In [108]:
pesce = fisher_test(dataset_count, background_count, depth)

In [109]:
pesce.sort_values(by = "OddRatio", ascending=False, inplace=True)

In [122]:
pesce.head(50)

Unnamed: 0,OddRatio,p-value,depth
35685,102.965166,0.0005481139,4.0
33277,102.965166,0.0005481139,4.0
16170,102.958708,0.01914707,5.0
97699,102.958708,0.01914707,5.0
32752,102.958708,0.01914707,6.0
1990859,102.958708,0.01914707,6.0
1904618,102.958708,0.01914707,6.0
45399,102.958708,0.01914707,6.0
8269,102.958708,0.01914707,5.0
1903674,102.958708,0.01914707,9.0


In [118]:
max_depth = pesce.depth <= 2

In [119]:
pesce[max_depth]

Unnamed: 0,OddRatio,p-value,depth
0001545,102.958708,1.914707e-02,2.0
0002339,34.319527,3.792755e-02,2.0
0035426,25.739630,4.718252e-02,2.0
0031294,22.894917,1.904113e-12,2.0
0030061,14.708333,7.441668e-02,2.0
...,...,...,...
0005815,0.155464,2.574858e-02,2.0
0051606,0.151347,1.779443e-02,2.0
0007275,0.138695,1.280081e-02,2.0
0038023,0.068944,1.464697e-05,2.0
