In [12]:
#import the module
import EdgePrediction, json
import pandas as pd

In [2]:
from scipy.stats import hypergeom

In [3]:
def setup(files, optimisation, require_all):
    """Create, load and preprocess an ``EdgePrediction`` instance.

    Parameters
    ----------
    files : iterable of str
        CSV paths; each is printed and passed to ``CSV_to_graph(fname=...)``.
    optimisation
        Value assigned to ``ep.optimisation_method``.
    require_all
        Value assigned to ``ep.require_all_predictors``.

    Returns
    -------
    The configured object, after ``preprocess()`` has been called on it.
    """
    ep = EdgePrediction.EdgePrediction()

    ep.to_predict = 'risk_gene'
    ep.optimisation_method = optimisation
    ep.require_all_predictors = require_all

    for to_load in files:
        print(to_load)
        ep.CSV_to_graph(fname=to_load)

    # list.sort() sorts in place and returns None, so the previous
    # `list(...).sort()` always stored None here. sorted() returns the
    # ordered list of graph names that was intended.
    ep.network_order = sorted(ep.graphs.keys())
    ep.preprocess()

    return ep

In [4]:
def hyper_prob_at_least(pop_n, pop_true, draw_n, draw_true):
    """Return the hypergeometric upper-tail probability P(X >= draw_true).

    Parameters
    ----------
    pop_n : int
        Population size.
    pop_true : int
        Number of "success" items in the population.
    draw_n : int
        Number of items drawn without replacement.
    draw_true : int
        Minimum number of successes among the drawn items.

    Returns
    -------
    float
        Probability of observing at least ``draw_true`` successes.
    """
    # P(X >= h) = 1 - cdf(h - 1), which is exactly the survival function at
    # h - 1. Calling sf() directly avoids the catastrophic cancellation of
    # `1 - cdf` when the tail is tiny (it would otherwise collapse to 0.0 or
    # to rounding noise).
    return hypergeom.sf(draw_true - 1, pop_n, pop_true, draw_n)

In [5]:
def make_row(result, target_name, predictable, n_new_genes_total, verbose=True):
    """Build one summary row for a trained model and its validation outcome.

    Parameters
    ----------
    result : dict
        Prediction result. This function reads ``result['new_hits']``,
        ``result['contingency']`` (keys 'tp', 'tn', 'fp', 'fn'),
        ``result['weights']`` (a mapping of name -> weight) and every key
        listed in ``keep`` below.
    target_name
        Identifier stored under ``'target'`` and used in the printed summary.
    predictable : sized collection
        New associations that could in principle be predicted; the new hits
        are intersected with it to count validated predictions.
    n_new_genes_total : int
        Total number of new associations for the target (stored and printed
        as given).
    verbose : bool, default True
        When true, print a human-readable summary.

    Returns
    -------
    dict
        The copied metrics, contingency counts, ``weight_<name>`` entries and
        the validation statistics computed here.
    """
    contingency = result['contingency']
    new_hits = set(result['new_hits'])

    n_validated = len(new_hits.intersection(predictable))
    n_predicted = len(new_hits)
    n_can_validate = len(predictable)

    # Both ratios fall back to 0.0 when their denominator is zero instead of
    # raising ZeroDivisionError.
    validation_precision = n_validated / n_predicted if n_predicted > 0 else 0.0
    validation_recall = n_validated / n_can_validate if n_can_validate > 0 else 0.0

    n_genes_total = sum(contingency.values())
    n_genes_predicted_total = contingency['tp'] + contingency['fp']
    n_genes_train = contingency['tp'] + contingency['fn']

    keep = ['F1', 'F05', 'F2', 'ACC', 'J', 'PREC', 'REC', 'FDR', 'FPR', 'hits_total', 'hits_new', 'hits_known', 'model_built']
    row = {x: result[x] for x in keep}
    for x in ['tp', 'tn', 'fp', 'fn']:
        row[x] = contingency[x]
    for name, weight in result['weights'].items():
        row['weight_' + name] = weight
    row['target'] = target_name

    row['validation_precision'] = validation_precision
    row['validation_recall'] = validation_recall
    row['n_genes_total'] = n_genes_total
    row['n_new_genes_total'] = n_new_genes_total
    row['n_new_genes_predictable'] = n_can_validate
    row['model_built'] = True
    row['n_validated'] = n_validated

    # Enrichment test, hyper_prob_at_least(pop_n, pop_true, draw_n, draw_true):
    # the population is every gene counted as tn or fp, the successes are the
    # predictable new associations, and the draw is the set of new hits.
    pop_n = contingency['tn'] + contingency['fp']
    pop_true = n_can_validate
    draw_n = n_predicted  # new hits only, not tp + fp
    draw_true = n_validated
    prob = hyper_prob_at_least(pop_n, pop_true, draw_n, draw_true)
    row['prob'] = prob

    if verbose:
        association_rate = n_genes_predicted_total / n_genes_total if n_genes_total > 0 else 0.0
        print("model for {} achieved {:.2f} validation rate at {:.2f} validation recall (found {}/{} with pred={})".format(target_name, validation_precision, validation_recall, n_validated, n_can_validate, n_predicted))
        print("predicted {}/{} genes associated with {} training genes".format(n_genes_predicted_total, n_genes_total, n_genes_train))
        print("predicted association rate {:.3f}".format(association_rate))
        print("{}/{} new associations could have been predicted and were tested".format(n_can_validate, n_new_genes_total))
        print("p hyper {:0.3f}".format(prob))
        print()

    return row
    

In [6]:
def run(ep, to_train, valid):
    """Train one model per target and return a list of summary rows.

    For each entry of ``to_train`` (in order) a progress line ``index/total``
    is printed. Targets whose new associations (``valid[target]``) share no
    gene with the source nodes of the 'risk_gene' graph are recorded without
    training. Otherwise ``ep.predict`` is called; a built model is summarised
    with ``make_row``, an unbuilt one gets a minimal row with
    ``'model_built': False``.
    """
    gene_universe = set(ep.graphs['risk_gene']['sourcenodes'])
    total = len(to_train)
    collected = []

    for position, target_name in enumerate(to_train):
        print("{}/{}".format(position, total))

        # Only new associations whose gene exists in the network can be predicted.
        fresh_genes = set(valid[target_name])
        predictable = fresh_genes.intersection(gene_universe)
        if not predictable:
            print("**** no predictable genes for", target_name)
            collected.append({
                'target': target_name,
                'n_new_genes_predictable': 0,
                'n_new_genes_total': len(fresh_genes),
            })
            continue

        result = ep.predict(target=target_name, calculate_auc=True)
        if result['model_built'] == True:
            summary = make_row(result, target_name, predictable, len(fresh_genes))
        else:
            summary = {
                'target': target_name,
                'n_new_genes_predictable': len(predictable),
                'n_new_genes_total': len(fresh_genes),
            }
            summary['model_built'] = False
        collected.append(summary)

    return collected

In [7]:
# Network CSVs handed to setup(); each one is loaded with CSV_to_graph.
files = [
    "train/disgenet_v7.0_train_2018.csv",
    "train/Gene Ontology v2020-11-17.csv",
    "train/IntAct v2020-11-06.csv",
]

In [8]:
# Load the held-out associations as JSON. `valid` is later indexed by target
# identifier (valid[target_name]) to obtain that target's new genes.
with open('test/disgenet_new_since_2018.txt') as f:
    valid = json.load(f)

In [9]:
# Build the preprocessed predictor from the training networks.
ep = setup(
    files,
    optimisation='graph_sparse',
    require_all=False,
)

train/disgenet_v7.0_train_2018.csv
train/Gene Ontology v2020-11-17.csv
train/IntAct v2020-11-06.csv
8107 source nodes are common to all 3 input graphs
deleting 1002 nodes that don't overlap between networks
deleting another 271 nodes that now have degree zero
deleting 11604 nodes that don't overlap between networks
deleting another 1942 nodes that now have degree zero
deleting 13358 nodes that don't overlap between networks
deleting another 2201 nodes that now have degree zero


In [10]:
# Target nodes of the 'risk_gene' graph as held by `ep` after setup();
# used below to restrict which validation targets are trained.
possible_targets = set(ep.graphs['risk_gene']['targetnodes'])
print(len(possible_targets), 'diseases in network')

10329 diseases in network


In [11]:
# Candidate targets are the keys of the validation set; keep only those that
# are also target nodes in the network, preserving their original order.
to_train_tmp = list(valid)
to_train = [target for target in to_train_tmp if target in possible_targets]

Train all models. This can take ~ 2 hours.

In [68]:
# One dict per entry of `to_train`, in order; targets with no predictable
# genes or no built model get a reduced row (see run()).
rows = run(ep, to_train, valid)

0/1131
**** no predictable genes for C0036341
1/1131
ignoring C0002395 as a predictor in network risk_gene
model for C0002395 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=1095)
predicted 1182.0/8107.0 genes associated with 88 training genes
predicted association rate 0.146
1/1 new associations could have been predicted and were tested
p hyper 0.137

2/1131
ignoring C0007102 as a predictor in network risk_gene
model for C0007102 achieved 0.00 validation rate at 0.33 validation recall (found 1/3 with pred=1160)
predicted 1274.0/8107.0 genes associated with 147 training genes
predicted association rate 0.157
3/5 new associations could have been predicted and were tested
p hyper 0.377

3/1131
ignoring C0009375 as a predictor in network risk_gene
model for C0009375 achieved 0.00 validation rate at 0.50 validation recall (found 2/4 with pred=1009)
predicted 1113.0/8107.0 genes associated with 140 training genes
predicted association rate 0.137
4/5 new associat

ignoring C0006142 as a predictor in network risk_gene
model for C0006142 achieved 0.01 validation rate at 0.13 validation recall (found 19/144 with pred=1522)
predicted 2048.0/8107.0 genes associated with 667 training genes
predicted association rate 0.253
144/344 new associations could have been predicted and were tested
p hyper 0.992

30/1131
ignoring C0011999 as a predictor in network risk_gene
model for C0011999 achieved 0.03 validation rate at 0.33 validation recall (found 1/3 with pred=29)
predicted 52.0/8107.0 genes associated with 23 training genes
predicted association rate 0.006
3/3 new associations could have been predicted and were tested
p hyper 0.011

31/1131
ignoring C0019193 as a predictor in network risk_gene
model for C0019193 achieved 0.00 validation rate at 0.75 validation recall (found 3/4 with pred=1563)
predicted 1833.0/8107.0 genes associated with 270 training genes
predicted association rate 0.226
4/4 new associations could have been predicted and were tested
p

ignoring C0393770 as a predictor in network risk_gene
model for C0393770 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

55/1131
ignoring C0494410 as a predictor in network risk_gene
model for C0494410 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

56/1131
ignoring C0751758 as a predictor in network risk_gene
model for C0751758 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

57/1131
igno

ignoring C4721507 as a predictor in network risk_gene
model for C4721507 achieved 0.01 validation rate at 0.67 validation recall (found 2/3 with pred=374)
predicted 433.0/8107.0 genes associated with 67 training genes
predicted association rate 0.053
3/4 new associations could have been predicted and were tested
p hyper 0.006

81/1131
**** no predictable genes for C0004134
82/1131
ignoring C0032300 as a predictor in network risk_gene
model for C0032300 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=362)
predicted 413.0/8107.0 genes associated with 52 training genes
predicted association rate 0.051
1/1 new associations could have been predicted and were tested
p hyper 1.000

83/1131
ignoring C0235874 as a predictor in network risk_gene
model for C0235874 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=1382)
predicted 1513.0/8107.0 genes associated with 151 training genes
predicted association rate 0.187
2/2 new associations coul

ignoring C0028754 as a predictor in network risk_gene
model for C0028754 achieved 0.00 validation rate at 0.14 validation recall (found 1/7 with pred=1114)
predicted 1247.0/8107.0 genes associated with 187 training genes
predicted association rate 0.154
7/7 new associations could have been predicted and were tested
p hyper 0.654

109/1131
ignoring C0043094 as a predictor in network risk_gene
model for C0043094 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=1879)
predicted 1939.0/8107.0 genes associated with 97 training genes
predicted association rate 0.239
1/1 new associations could have been predicted and were tested
p hyper 1.000

110/1131
ignoring C0005586 as a predictor in network risk_gene
model for C0005586 achieved 0.00 validation rate at 0.50 validation recall (found 5/10 with pred=2369)
predicted 2725.0/8107.0 genes associated with 422 training genes
predicted association rate 0.336
10/18 new associations could have been predicted and were tested

model for C0546264 achieved 0.04 validation rate at 1.00 validation recall (found 4/4 with pred=106)
predicted 118.0/8107.0 genes associated with 14 training genes
predicted association rate 0.015
4/4 new associations could have been predicted and were tested
p hyper 0.000

134/1131
ignoring C0014175 as a predictor in network risk_gene
model for C0014175 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=273)
predicted 424.0/8107.0 genes associated with 154 training genes
predicted association rate 0.052
2/2 new associations could have been predicted and were tested
p hyper 1.000

135/1131
ignoring C0026654 as a predictor in network risk_gene
model for C0026654 achieved 0.00 validation rate at 0.00 validation recall (found 0/4 with pred=15)
predicted 20.0/8107.0 genes associated with 11 training genes
predicted association rate 0.002
4/5 new associations could have been predicted and were tested
p hyper 1.000

136/1131
ignoring C0036421 as a predictor in netwo

ignoring C0007621 as a predictor in network risk_gene
model for C0007621 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=1252)
predicted 1343.0/8107.0 genes associated with 107 training genes
predicted association rate 0.166
1/3 new associations could have been predicted and were tested
p hyper 0.156

160/1131
ignoring C0014544 as a predictor in network risk_gene
model for C0014544 achieved 0.00 validation rate at 0.00 validation recall (found 0/4 with pred=614)
predicted 700.0/8107.0 genes associated with 102 training genes
predicted association rate 0.086
4/4 new associations could have been predicted and were tested
p hyper 1.000

161/1131
ignoring C0029456 as a predictor in network risk_gene
model for C0029456 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=544)
predicted 601.0/8107.0 genes associated with 61 training genes
predicted association rate 0.074
1/1 new associations could have been predicted and were tested
p hype

ignoring C0030567 as a predictor in network risk_gene
model for C0030567 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=825)
predicted 887.0/8107.0 genes associated with 80 training genes
predicted association rate 0.109
2/2 new associations could have been predicted and were tested
p hyper 1.000

188/1131
ignoring C0149925 as a predictor in network risk_gene
model for C0149925 achieved 0.00 validation rate at 0.00 validation recall (found 0/5 with pred=491)
predicted 513.0/8107.0 genes associated with 44 training genes
predicted association rate 0.063
5/9 new associations could have been predicted and were tested
p hyper 1.000

189/1131
ignoring C0011860 as a predictor in network risk_gene
model for C0011860 achieved 0.00 validation rate at 0.17 validation recall (found 1/6 with pred=1009)
predicted 1103.0/8107.0 genes associated with 145 training genes
predicted association rate 0.136
6/10 new associations could have been predicted and were tested
p hype

ignoring C0740391 as a predictor in network risk_gene
model for C0740391 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=748)
predicted 781.0/8107.0 genes associated with 33 training genes
predicted association rate 0.096
1/1 new associations could have been predicted and were tested
p hyper 1.000

214/1131
ignoring C0740392 as a predictor in network risk_gene
model for C0740392 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=748)
predicted 781.0/8107.0 genes associated with 33 training genes
predicted association rate 0.096
1/1 new associations could have been predicted and were tested
p hyper 1.000

215/1131
ignoring C0751845 as a predictor in network risk_gene
model for C0751845 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=748)
predicted 781.0/8107.0 genes associated with 33 training genes
predicted association rate 0.096
1/1 new associations could have been predicted and were tested
p hyper 1.

ignoring C0338831 as a predictor in network risk_gene
model for C0338831 achieved 0.01 validation rate at 0.19 validation recall (found 3/16 with pred=593)
predicted 643.0/8107.0 genes associated with 53 training genes
predicted association rate 0.079
16/25 new associations could have been predicted and were tested
p hyper 0.109

240/1131
ignoring C0543888 as a predictor in network risk_gene
model for C0543888 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=634)
predicted 657.0/8107.0 genes associated with 28 training genes
predicted association rate 0.081
1/2 new associations could have been predicted and were tested
p hyper 1.000

241/1131
ignoring C0019207 as a predictor in network risk_gene
model for C0019207 achieved 0.00 validation rate at 0.50 validation recall (found 1/2 with pred=1116)
predicted 1172.0/8107.0 genes associated with 61 training genes
predicted association rate 0.145
2/2 new associations could have been predicted and were tested
p hyp

ignoring C2936170 as a predictor in network risk_gene
model for C2936170 achieved 0.01 validation rate at 1.00 validation recall (found 1/1 with pred=108)
predicted 113.0/8107.0 genes associated with 5 training genes
predicted association rate 0.014
1/1 new associations could have been predicted and were tested
p hyper 0.013

266/1131
ignoring C2931498 as a predictor in network risk_gene
model for C2931498 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=16)
predicted 32.0/8107.0 genes associated with 29 training genes
predicted association rate 0.004
1/2 new associations could have been predicted and were tested
p hyper 1.000

267/1131
ignoring C3501611 as a predictor in network risk_gene
model for C3501611 achieved 0.03 validation rate at 1.00 validation recall (found 1/1 with pred=34)
predicted 44.0/8107.0 genes associated with 11 training genes
predicted association rate 0.005
1/1 new associations could have been predicted and were tested
p hyper 0.004



ignoring C2713443 as a predictor in network risk_gene
model for C2713443 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=879)
predicted 893.0/8107.0 genes associated with 14 training genes
predicted association rate 0.110
1/1 new associations could have been predicted and were tested
p hyper 0.109

291/1131
ignoring C0235974 as a predictor in network risk_gene
model for C0235974 achieved 0.01 validation rate at 0.67 validation recall (found 2/3 with pred=229)
predicted 244.0/8107.0 genes associated with 15 training genes
predicted association rate 0.030
3/4 new associations could have been predicted and were tested
p hyper 0.002

292/1131
ignoring C1720859 as a predictor in network risk_gene
model for C1720859 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
2/3 new associations could have been predicted and were tested
p hyper 1.000



ignoring C0410207 as a predictor in network risk_gene
model for C0410207 achieved 0.00 validation rate at 0.00 validation recall (found 0/7 with pred=0)
predicted 7.0/8107.0 genes associated with 7 training genes
predicted association rate 0.001
7/7 new associations could have been predicted and were tested
p hyper 1.000

323/1131
ignoring C0752282 as a predictor in network risk_gene
model for C0752282 achieved 0.00 validation rate at 0.00 validation recall (found 0/8 with pred=0)
predicted 5.0/8107.0 genes associated with 5 training genes
predicted association rate 0.001
8/8 new associations could have been predicted and were tested
p hyper 1.000

324/1131
ignoring C1834558 as a predictor in network risk_gene
model for C1834558 achieved 0.00 validation rate at 0.00 validation recall (found 0/9 with pred=0)
predicted 5.0/8107.0 genes associated with 5 training genes
predicted association rate 0.001
9/9 new associations could have been predicted and were tested
p hyper 1.000

325/1131
i

ignoring C0023434 as a predictor in network risk_gene
model for C0023434 achieved 0.00 validation rate at 0.25 validation recall (found 1/4 with pred=1516)
predicted 1553.0/8107.0 genes associated with 46 training genes
predicted association rate 0.192
4/4 new associations could have been predicted and were tested
p hyper 0.565

349/1131
**** no predictable genes for C0024291
350/1131
ignoring C1862939 as a predictor in network risk_gene
model for C1862939 achieved 0.03 validation rate at 0.50 validation recall (found 1/2 with pred=40)
predicted 83.0/8107.0 genes associated with 45 training genes
predicted association rate 0.010
2/2 new associations could have been predicted and were tested
p hyper 0.010

351/1131
ignoring C1862941 as a predictor in network risk_gene
model for C1862941 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=305)
predicted 349.0/8107.0 genes associated with 44 training genes
predicted association rate 0.043
1/1 new associations coul

ignoring C0041671 as a predictor in network risk_gene
model for C0041671 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=664)
predicted 682.0/8107.0 genes associated with 18 training genes
predicted association rate 0.084
2/2 new associations could have been predicted and were tested
p hyper 0.007

377/1131
ignoring C1321905 as a predictor in network risk_gene
model for C1321905 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=664)
predicted 682.0/8107.0 genes associated with 18 training genes
predicted association rate 0.084
2/2 new associations could have been predicted and were tested
p hyper 0.007

378/1131
**** no predictable genes for C3888239
379/1131
ignoring C0020305 as a predictor in network risk_gene
model for C0020305 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=10)
predicted 15.0/8107.0 genes associated with 13 training genes
predicted association rate 0.002
3/3 new associations could 

ignoring C0152423 as a predictor in network risk_gene
model for C0152423 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=285)
predicted 315.0/8107.0 genes associated with 36 training genes
predicted association rate 0.039
3/4 new associations could have been predicted and were tested
p hyper 1.000

403/1131
ignoring C0152427 as a predictor in network risk_gene
model for C0152427 achieved 0.09 validation rate at 0.83 validation recall (found 35/42 with pred=385)
predicted 444.0/8107.0 genes associated with 71 training genes
predicted association rate 0.055
42/43 new associations could have been predicted and were tested
p hyper 0.000

404/1131
ignoring C2936862 as a predictor in network risk_gene
model for C2936862 achieved 0.38 validation rate at 0.71 validation recall (found 5/7 with pred=13)
predicted 22.0/8107.0 genes associated with 9 training genes
predicted association rate 0.003
7/8 new associations could have been predicted and were tested
p hyper 0

ignoring C4551479 as a predictor in network risk_gene
model for C4551479 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 13.0/8107.0 genes associated with 13 training genes
predicted association rate 0.002
2/2 new associations could have been predicted and were tested
p hyper 1.000

428/1131
ignoring C4721532 as a predictor in network risk_gene
model for C4721532 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=104)
predicted 121.0/8107.0 genes associated with 19 training genes
predicted association rate 0.015
3/3 new associations could have been predicted and were tested
p hyper 1.000

429/1131
ignoring C0751651 as a predictor in network risk_gene
model for C0751651 achieved 0.02 validation rate at 1.00 validation recall (found 1/1 with pred=61)
predicted 106.0/8107.0 genes associated with 46 training genes
predicted association rate 0.013
1/3 new associations could have been predicted and were tested
p hyper 0.008


ignoring C1969342 as a predictor in network risk_gene
model for C1969342 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
3/3 new associations could have been predicted and were tested
p hyper 1.000

453/1131
ignoring C1969343 as a predictor in network risk_gene
model for C1969343 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
3/3 new associations could have been predicted and were tested
p hyper 1.000

454/1131
ignoring C3714844 as a predictor in network risk_gene
model for C3714844 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
3/3 new associations could have been predicted and were tested
p hyper 1.000

455/1131
i

ignoring C0543859 as a predictor in network risk_gene
model for C0543859 achieved 0.00 validation rate at 0.50 validation recall (found 1/2 with pred=1088)
predicted 1116.0/8107.0 genes associated with 28 training genes
predicted association rate 0.138
2/2 new associations could have been predicted and were tested
p hyper 0.251

481/1131
ignoring C0001890 as a predictor in network risk_gene
model for C0001890 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=13)
predicted 17.0/8107.0 genes associated with 4 training genes
predicted association rate 0.002
2/2 new associations could have been predicted and were tested
p hyper 1.000

482/1131
ignoring C0087012 as a predictor in network risk_gene
model for C0087012 achieved 0.00 validation rate at 0.00 validation recall (found 0/11 with pred=57)
predicted 74.0/8107.0 genes associated with 21 training genes
predicted association rate 0.009
11/12 new associations could have been predicted and were tested
p hyper 1.

ignoring C4552768 as a predictor in network risk_gene
model for C4552768 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

506/1131
ignoring C4553087 as a predictor in network risk_gene
model for C4553087 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

507/1131
ignoring C4553298 as a predictor in network risk_gene
model for C4553298 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

508/1131
i

ignoring C0027019 as a predictor in network risk_gene
model for C0027019 achieved 0.00 validation rate at 1.00 validation recall (found 4/4 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
4/4 new associations could have been predicted and were tested
p hyper 1.000

531/1131
ignoring C0085996 as a predictor in network risk_gene
model for C0085996 achieved 0.00 validation rate at 0.33 validation recall (found 1/3 with pred=357)
predicted 380.0/8107.0 genes associated with 23 training genes
predicted association rate 0.047
3/4 new associations could have been predicted and were tested
p hyper 0.127

532/1131
ignoring C0085997 as a predictor in network risk_gene
model for C0085997 achieved 0.00 validation rate at 0.33 validation recall (found 1/3 with pred=357)
predicted 380.0/8107.0 genes associated with 23 training genes
predicted association rate 0.047
3/4 new associations could have been predicted and were tested
p hyper 0

ignoring C1292778 as a predictor in network risk_gene
model for C1292778 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=16)
predicted 29.0/8107.0 genes associated with 16 training genes
predicted association rate 0.004
3/3 new associations could have been predicted and were tested
p hyper 1.000

559/1131
ignoring C0079487 as a predictor in network risk_gene
model for C0079487 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=16)
predicted 20.0/8107.0 genes associated with 6 training genes
predicted association rate 0.002
1/1 new associations could have been predicted and were tested
p hyper 1.000

560/1131
**** no predictable genes for C0744356
561/1131
ignoring C1842581 as a predictor in network risk_gene
model for C1842581 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=60)
predicted 64.0/8107.0 genes associated with 4 training genes
predicted association rate 0.008
1/2 new associations could have b

model for C0851578 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=18)
predicted 24.0/8107.0 genes associated with 6 training genes
predicted association rate 0.003
1/1 new associations could have been predicted and were tested
p hyper 1.000

585/1131
ignoring C4042891 as a predictor in network risk_gene
model for C4042891 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=18)
predicted 24.0/8107.0 genes associated with 6 training genes
predicted association rate 0.003
1/1 new associations could have been predicted and were tested
p hyper 1.000

586/1131
ignoring C2677613 as a predictor in network risk_gene
model for C2677613 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

587/1131
ignoring C0020672 as a predictor in network r

model for C0268731 achieved 0.12 validation rate at 0.08 validation recall (found 1/12 with pred=8)
predicted 16.0/8107.0 genes associated with 8 training genes
predicted association rate 0.002
12/15 new associations could have been predicted and were tested
p hyper 0.012

612/1131
ignoring C2931253 as a predictor in network risk_gene
model for C2931253 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

613/1131
ignoring C4746986 as a predictor in network risk_gene
model for C4746986 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

614/1131
ignoring C2930471 as a predictor in net

ignoring C0178416 as a predictor in network risk_gene
model for C0178416 achieved 0.00 validation rate at 0.00 validation recall (found 0/4 with pred=20)
predicted 23.0/8107.0 genes associated with 3 training genes
predicted association rate 0.003
4/4 new associations could have been predicted and were tested
p hyper 1.000

640/1131
ignoring C2931245 as a predictor in network risk_gene
model for C2931245 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

641/1131
ignoring C3808553 as a predictor in network risk_gene
model for C3808553 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

642/1131

ignoring C0233514 as a predictor in network risk_gene
model for C0233514 achieved 0.00 validation rate at 0.20 validation recall (found 1/5 with pred=403)
predicted 410.0/8107.0 genes associated with 8 training genes
predicted association rate 0.051
5/8 new associations could have been predicted and were tested
p hyper 0.225

666/1131
ignoring C0852413 as a predictor in network risk_gene
model for C0852413 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=60)
predicted 64.0/8107.0 genes associated with 4 training genes
predicted association rate 0.008
1/2 new associations could have been predicted and were tested
p hyper 1.000

667/1131
**** no predictable genes for C4021817
668/1131
ignoring C0265965 as a predictor in network risk_gene
model for C0265965 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=3)
predicted 16.0/8107.0 genes associated with 14 training genes
predicted association rate 0.002
1/1 new associations could have 

ignoring C0752199 as a predictor in network risk_gene
model for C0752199 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 6.0/8107.0 genes associated with 6 training genes
predicted association rate 0.001
1/1 new associations could have been predicted and were tested
p hyper 1.000

693/1131
ignoring C0752200 as a predictor in network risk_gene
model for C0752200 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 6.0/8107.0 genes associated with 6 training genes
predicted association rate 0.001
1/1 new associations could have been predicted and were tested
p hyper 1.000

694/1131
ignoring C0752201 as a predictor in network risk_gene
model for C0752201 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 6.0/8107.0 genes associated with 6 training genes
predicted association rate 0.001
1/1 new associations could have been predicted and were tested
p hyper 1.000

695/1131
i

ignoring C4551637 as a predictor in network risk_gene
model for C4551637 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=71)
predicted 78.0/8107.0 genes associated with 8 training genes
predicted association rate 0.010
1/1 new associations could have been predicted and were tested
p hyper 1.000

720/1131
ignoring C2750850 as a predictor in network risk_gene
model for C2750850 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=4)
predicted 7.0/8107.0 genes associated with 5 training genes
predicted association rate 0.001
1/1 new associations could have been predicted and were tested
p hyper 1.000

721/1131
ignoring C0007117 as a predictor in network risk_gene
model for C0007117 achieved 0.00 validation rate at 0.00 validation recall (found 0/7 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
7/7 new associations could have been predicted and were tested
p hyper 1.000

722/1131

ignoring C4048268 as a predictor in network risk_gene
model for C4048268 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
2/4 new associations could have been predicted and were tested
p hyper 1.000

745/1131
ignoring C0042063 as a predictor in network risk_gene
model for C0042063 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8103)
predicted 8107.0/8107.0 genes associated with 4 training genes
predicted association rate 1.000
2/3 new associations could have been predicted and were tested
p hyper 1.000

746/1131
ignoring C3489628 as a predictor in network risk_gene
model for C3489628 achieved 0.00 validation rate at 0.00 validation recall (found 0/3 with pred=28)
predicted 34.0/8107.0 genes associated with 6 training genes
predicted association rate 0.004
3/3 new associations could have been predicted and were tested
p hyper 1.0

ignoring C0158981 as a predictor in network risk_gene
model for C0158981 achieved 0.01 validation rate at 0.50 validation recall (found 1/2 with pred=144)
predicted 161.0/8107.0 genes associated with 21 training genes
predicted association rate 0.020
2/2 new associations could have been predicted and were tested
p hyper 0.035

774/1131
ignoring C1389016 as a predictor in network risk_gene
model for C1389016 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

775/1131
ignoring C1389018 as a predictor in network risk_gene
model for C1389018 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
2/2 new associations could have been predicted and were tested
p hyper 1.000


ignoring C1865869 as a predictor in network risk_gene
model for C1865869 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

800/1131
ignoring C3711645 as a predictor in network risk_gene
model for C3711645 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

801/1131
**** no predictable genes for C1568248
802/1131
ignoring C0014550 as a predictor in network risk_gene
model for C0014550 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 7.0/8107.0 genes associated with 8 training genes
predicted association rate 0.001
1/1 new associations could h

ignoring C1858558 as a predictor in network risk_gene
model for C1858558 achieved 0.50 validation rate at 1.00 validation recall (found 1/1 with pred=2)
predicted 4.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 0.000

829/1131
ignoring C0280313 as a predictor in network risk_gene
model for C0280313 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 3.0/8107.0 genes associated with 3 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

830/1131
ignoring C0585362 as a predictor in network risk_gene
model for C0585362 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 3.0/8107.0 genes associated with 3 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

831/1131
i

model for C0231686 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

856/1131
ignoring C0796117 as a predictor in network risk_gene
model for C0796117 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=1)
predicted 5.0/8107.0 genes associated with 4 training genes
predicted association rate 0.001
2/3 new associations could have been predicted and were tested
p hyper 1.000

857/1131
ignoring C1956097 as a predictor in network risk_gene
model for C1956097 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 4.0/8107.0 genes associated with 5 training genes
predicted association rate 0.000
2/3 new associations could have been predicted and were tested
p hyper 1.000

858/1131
ignoring C1853296 as a predictor in network risk_gene
m

ignoring C2316212 as a predictor in network risk_gene
model for C2316212 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

881/1131
ignoring C4551895 as a predictor in network risk_gene
model for C4551895 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 2.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

882/1131
ignoring C0232466 as a predictor in network risk_gene
model for C0232466 achieved 0.00 validation rate at 0.50 validation recall (found 2/4 with pred=564)
predicted 571.0/8107.0 genes associated with 7 training genes
predicted association rate 0.070
4/11 new associations could have been predicted and were tested
p hyper 0.026

883/1

ignoring C0018036 as a predictor in network risk_gene
model for C0018036 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

909/1131
ignoring C0036439 as a predictor in network risk_gene
model for C0036439 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8104)
predicted 8107.0/8107.0 genes associated with 3 training genes
predicted association rate 1.000
2/4 new associations could have been predicted and were tested
p hyper 1.000

910/1131
ignoring C2931779 as a predictor in network risk_gene
model for C2931779 achieved 0.00 validation rate at 1.00 validation recall (found 5/5 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
5/5 new associations could have been predicted and were tested
p hyper

ignoring C3540852 as a predictor in network risk_gene
model for C3540852 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

935/1131
**** no predictable genes for C1845151
936/1131
ignoring C2751643 as a predictor in network risk_gene
model for C2751643 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

937/1131
ignoring C4012727 as a predictor in network risk_gene
model for C4012727 achieved 0.50 validation rate at 1.00 validation recall (found 1/1 with pred=2)
predicted 4.0/8107.0 genes associated with 2 training genes
predicted association rate 0.000
1/1 new associations could h

ignoring C4551957 as a predictor in network risk_gene
model for C4551957 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

963/1131
ignoring C1847200 as a predictor in network risk_gene
model for C1847200 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

964/1131
ignoring C2751544 as a predictor in network risk_gene
model for C2751544 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper

ignoring C1847319 as a predictor in network risk_gene
model for C1847319 achieved 0.00 validation rate at 0.00 validation recall (found 0/2 with pred=0)
predicted 3.0/8107.0 genes associated with 3 training genes
predicted association rate 0.000
2/3 new associations could have been predicted and were tested
p hyper 1.000

990/1131
ignoring C1861848 as a predictor in network risk_gene
model for C1861848 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

991/1131
ignoring C1864827 as a predictor in network risk_gene
model for C1864827 achieved 0.08 validation rate at 1.00 validation recall (found 1/1 with pred=12)
predicted 14.0/8107.0 genes associated with 2 training genes
predicted association rate 0.002
1/1 new associations could have been predicted and were tested
p hyper 0.001

99

ignoring C1838244 as a predictor in network risk_gene
model for C1838244 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

1019/1131
ignoring C1720958 as a predictor in network risk_gene
model for C1720958 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
2/2 new associations could have been predicted and were tested
p hyper 1.000

1020/1131
ignoring C1852372 as a predictor in network risk_gene
model for C1852372 achieved 0.67 validation rate at 0.40 validation recall (found 2/5 with pred=3)
predicted 5.0/8107.0 genes associated with 3 training genes
predicted association rate 0.001
5/5 new associations could have been predicted and were tested
p hyper 0.0

ignoring C0751602 as a predictor in network risk_gene
model for C0751602 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=0)
predicted 4.0/8107.0 genes associated with 4 training genes
predicted association rate 0.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

1049/1131
ignoring C0271934 as a predictor in network risk_gene
model for C0271934 achieved 0.00 validation rate at 1.00 validation recall (found 2/2 with pred=8105)
predicted 8107.0/8107.0 genes associated with 2 training genes
predicted association rate 1.000
2/3 new associations could have been predicted and were tested
p hyper 1.000

1050/1131
ignoring C0266617 as a predictor in network risk_gene
model for C0266617 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.0

ignoring C1865872 as a predictor in network risk_gene
model for C1865872 achieved 0.00 validation rate at 0.00 validation recall (found 0/1 with pred=44)
predicted 50.0/8107.0 genes associated with 7 training genes
predicted association rate 0.006
1/1 new associations could have been predicted and were tested
p hyper 1.000

1078/1131
ignoring C0265215 as a predictor in network risk_gene
model for C0265215 achieved 0.05 validation rate at 1.00 validation recall (found 2/2 with pred=42)
predicted 54.0/8107.0 genes associated with 12 training genes
predicted association rate 0.007
2/2 new associations could have been predicted and were tested
p hyper 0.000

1079/1131
ignoring C3714506 as a predictor in network risk_gene
model for C3714506 achieved 0.01 validation rate at 0.33 validation recall (found 1/3 with pred=98)
predicted 112.0/8107.0 genes associated with 14 training genes
predicted association rate 0.014
3/3 new associations could have been predicted and were tested
p hyper 0.036


ignoring C2931208 as a predictor in network risk_gene
model for C2931208 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

1105/1131
ignoring C1859353 as a predictor in network risk_gene
model for C1859353 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyper 1.000

1106/1131
ignoring C1861355 as a predictor in network risk_gene
model for C1861355 achieved 0.00 validation rate at 1.00 validation recall (found 1/1 with pred=8106)
predicted 8107.0/8107.0 genes associated with 1 training genes
predicted association rate 1.000
1/1 new associations could have been predicted and were tested
p hyp

In [69]:
# Turn the collected `rows` into a single results table.
# NOTE(review): `rows` is built earlier in the notebook — presumably one dict
# per target as produced by make_row(); confirm against the loop cell.
df = pd.DataFrame(data=rows)

In [70]:
# Preview the first five rows of the results table (rich display as the cell's last expression).
df.head(n=5)

Unnamed: 0,target,n_new_genes_predictable,n_new_genes_total,F1,F05,F2,ACC,J,PREC,REC,...,fp,fn,weight_risk_gene,weight_go_annotation,weight_encoded_PPI,validation_precision,validation_recall,n_genes_total,n_validated,prob
0,C0036341,0,1,,,,,,,,...,,,,,,,,,,
1,C0002395,1,1,0.137008,0.090324,0.283572,0.864808,0.852086,0.073604,0.988636,...,1095.0,1.0,0.1,0.0,0.0,0.000913,1.0,8107.0,1.0,0.136551
2,C0007102,3,5,0.16045,0.108716,0.306122,0.852843,0.629782,0.089482,0.77551,...,1160.0,33.0,0.4,0.0,0.6,0.000862,0.333333,8107.0,1.0,0.37661
3,C0009375,4,5,0.166002,0.11324,0.310819,0.871099,0.61621,0.093441,0.742857,...,1009.0,36.0,0.8,0.0,0.9,0.001982,0.5,8107.0,2.0,0.080713
4,C0011265,3,4,0.132597,0.087209,0.276498,0.864438,0.863019,0.071006,1.0,...,1099.0,0.0,0.1,0.0,0.0,0.00273,1.0,8107.0,3.0,0.002564


In [71]:
# Write the results table to CSV in the current working directory.
# Configuration for this run: graph_sparse, not require all (the "gs_nra" suffix).
df.to_csv(path_or_buf="temporal_validation_gs_nra.csv")