-
Notifications
You must be signed in to change notification settings - Fork 27
/
cat_pval.py
98 lines (65 loc) · 2.38 KB
/
cat_pval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import pandas as pd
from copy import deepcopy
def main(net):
    '''
    Calculate a closeness p-value for every category on both axes.

    For each of 'row' and 'col' the node names are put into clustered order
    and laid out on a 1-D lattice. For every category found under a
    'dict_cat_*' key, the median pairwise lattice distance of the category's
    members is compared against a null histogram built from random node
    subsets of the same size. Results are written in place to
    net.dat['node_info'][axis]['pval_cat_*'][category name].
    '''
    for axis in ['row', 'col']:

        node_names = deepcopy(net.dat['nodes'][axis])
        clust_order = deepcopy(net.dat['node_info'][axis]['clust'])

        # put the node names into clustered order
        ordered_nodes = [node_names[pos] for pos in clust_order]

        # lattice distance matrix labelled by node name
        dm = dist_matrix_lattice(ordered_nodes)

        # snapshot the category keys now, since pval_* keys are added below
        cat_keys = [
            key for key in list(net.dat['node_info'][axis].keys())
            if 'dict_cat_' in key
        ]

        for cat_key in cat_keys:

            members_by_cat = net.dat['node_info'][axis][cat_key]
            pval_key = cat_key.replace('dict_','pval_')
            net.dat['node_info'][axis][pval_key] = {}

            for cat_name in members_by_cat:

                members = members_by_cat[cat_name]

                observed = calc_median_dist_subset(dm, members)
                null_hist = calc_hist_distances(dm, members, ordered_nodes)

                # The first bin always contributes; each later bin contributes
                # only when its left edge does not exceed the observed median.
                pval = 0
                for idx, prob in enumerate(null_hist['prob']):
                    if idx == 0:
                        pval = prob
                    elif observed >= null_hist['bins'][idx]:
                        pval = pval + prob

                net.dat['node_info'][axis][pval_key][cat_name] = pval
def dist_matrix_lattice(names):
    '''
    Build the pairwise distance matrix of points on a 1-D integer lattice.

    The i-th entry of `names` sits at lattice position i, and the distance
    between two entries is the absolute difference of their positions (the
    Euclidean distance in one dimension).

    Parameters
    ----------
    names : sequence
        Labels in lattice order; used as both the index and the columns.

    Returns
    -------
    pandas.DataFrame
        Square float64 matrix of shape (len(names), len(names)) holding
        |i - j| at row i, column j. An empty `names` yields an empty
        0 x 0 frame instead of raising.
    '''
    positions = np.arange(len(names), dtype=float)

    # Broadcasting a column vector against a row vector yields every pairwise
    # difference at once; abs() makes it a distance, so it is never negative.
    inst_dm = np.abs(positions[:, np.newaxis] - positions[np.newaxis, :])

    return pd.DataFrame(data=inst_dm, columns=names, index=names)
def calc_median_dist_subset(dm, subset):
    '''
    Return the median of all pairwise distances among the nodes in `subset`.

    Parameters
    ----------
    dm : pandas.DataFrame
        Square distance matrix whose index and columns are node labels.
    subset : list-like
        Labels of the nodes to compare; each must be present in `dm`.

    Returns
    -------
    float
        Median over every entry of the subset-by-subset sub-matrix. The
        zero diagonal and both symmetric halves are included.
    '''
    # select columns, then rows, by label (.ix was removed in pandas 1.0)
    return np.median(dm[subset].loc[subset].values)
def calc_hist_distances(dm, subset, inst_nodes, num_null=1000, num_bins=30,
                        seed=100):
    '''
    Build the null distribution of median within-subset distances.

    Draws `num_null` random subsets of `inst_nodes` (without replacement),
    each the same size as `subset`, takes the median pairwise distance of
    each draw from `dm`, and histograms those medians.

    Parameters
    ----------
    dm : pandas.DataFrame
        Square distance matrix whose index and columns are node labels.
    subset : sized collection
        Only its length is used: it sets the size of every random draw.
    inst_nodes : 1-D sequence
        Labels to sample from; each must be present in `dm`.
    num_null : int, optional
        Number of random draws (default 1000).
    num_bins : int, optional
        Number of histogram bins (default 30).
    seed : int, optional
        Seed passed to np.random.seed (default 100).

    Returns
    -------
    dict
        'prob': array of length `num_bins`, the fraction of draws per bin.
        'bins': array of length `num_bins` + 1, the bin edges.
    '''
    # NOTE: this reseeds NumPy's *global* RNG on every call so that the null
    # distribution is reproducible; kept as-is for backward compatibility.
    np.random.seed(seed)

    num_points = len(subset)

    median_dist = []
    for _ in range(num_null):
        tmp = np.random.choice(inst_nodes, num_points, replace=False)
        # select columns, then rows, by label (.ix was removed in pandas 1.0)
        median_dist.append(np.median(dm[tmp].loc[tmp].values))

    counts, bin_edges = np.histogram(np.asarray(median_dist), bins=num_bins)

    H = {}
    # builtin float replaces the np.float alias removed in NumPy 1.24
    H['prob'] = counts / float(num_null)
    H['bins'] = bin_edges

    return H