-
Notifications
You must be signed in to change notification settings - Fork 1
/
CV2K_main.py
305 lines (262 loc) · 11.8 KB
/
CV2K_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import sys
import collections
from multiprocessing import Pool
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import argparse
import scipy.stats
import scipy.optimize.nnls
from CV2K_cv import *
def rollback(errors):
    """
    Finds K after rolling backwards from the k with minimum median value, using Wilcoxon rank-sum test.

    Starting from the k whose runs have the smallest median error, scan the
    smaller ks in increasing order and stop at the first one whose error
    distribution is NOT significantly different (p > 0.05) from the best k's —
    i.e. prefer the smallest statistically-equivalent model.

    :param errors: numpy array of errors (Ks x repetitions); may contain NaNs
    :return: int - row index of K after rollback
    """
    best_k_arg = np.argmin(np.nanmedian(errors, axis=1))
    best_k_runs = errors[best_k_arg]  # hoisted out of the loop: loop-invariant
    # rollback: walk from the smallest k toward the best one
    for k in range(best_k_arg):
        u, p = scipy.stats.ranksums(errors[k], best_k_runs)
        if p > 0.05:  # cannot distinguish k from the best k at the 5% level
            best_k_arg = k
            break
    return best_k_arg
def NMF(V, k):
    """
    V ~ WH using CV2K_cv.py methods
    :param V: input matrix (n x m)
    :param k: rank of factorization
    :return: W (n x k) and H (k x m) matrices
    """
    # fully-observed mask (every entry visible); a fresh array is built for
    # each call, matching the original behavior exactly
    W_init, H_init = init_factor_matrices(V, k, O=np.ones(V.shape, dtype=bool),
                                          eps=1e-6, obj='euc', reg=0)
    W, H, _ = optimize(V, W_init, H_init, O=np.ones(V.shape, dtype=bool),
                       maxiter=args.maxiter, eps=1e-6, obj='euc', reg=0)
    return W, H
def nnls(V, H):
    """
    Given V and H, computes W matrix: V ~ WH using SciPy's NNLS.

    Solves, independently for each sample (row of V), the non-negative
    least-squares problem: min ||H.T w - V[j]|| subject to w >= 0.

    :param V: input matrix (n x m)
    :param H: signature matrix (k x m)
    :return: exposure matrix (n x k)
    """
    # H.T is loop-invariant: hoist it.  NOTE(review): the top-of-file
    # `import scipy.optimize.nnls` relies on scipy's old module layout
    # (removed in recent scipy releases); `import scipy.optimize` suffices.
    Ht = H.T
    return np.array([scipy.optimize.nnls(Ht, row)[0] for row in V])
def CV2K_x_compute_error(V, k, sample_fold, category_folds):
    """
    CV2K_x helper - computes overall error for a given k and sample_fold, by accumulating all category_fold errors.
    using parameters from main.

    Relies on module-level globals set in __main__: ``fold_sample_indices``
    and ``fold_category_indices`` (the precomputed index splits), plus
    ``args`` indirectly through NMF().

    :param V: input matrix (n x m)
    :param k: rank of factorization
    :param sample_fold: index of group of samples that correspond to the current fold
    :param category_folds: total number of category folds
    :return: float - total error for the given sample fold
    """
    # re-seed so each pool worker gets an independent random stream
    np.random.seed()
    errors = 0
    # splitting samples to train and test
    train_samples_idx = np.concatenate([indices for i, indices in enumerate(fold_sample_indices) if i != sample_fold])
    test_samples_idx = fold_sample_indices[sample_fold]
    V_train = V[train_samples_idx]
    V_test = V[test_samples_idx]
    # producing H matrix from V using NMF
    _, H = NMF(V_train, k)
    # normalizing H (each signature row sums to 1)
    H /= H.sum(1, keepdims=True)
    for category_fold in range(category_folds):
        # splitting categories to train and test
        train_categories_idx = np.concatenate([indices for i, indices in enumerate(fold_category_indices) if
                                               i != category_fold])
        test_categories_idx = fold_category_indices[category_fold]
        # the .T[idx].T pattern selects COLUMNS (categories) while keeping
        # rows as samples
        V_for_nnls = V_test.T[train_categories_idx].T
        # creating normalizing H for NNLS (copy so the full H stays intact)
        H_for_nnls = H.T[train_categories_idx].T.copy()
        H_for_nnls /= H_for_nnls.sum(1, keepdims=True)
        # getting final W using NNLS
        final_W = nnls(V_for_nnls, H_for_nnls)
        # NOTE(review): an all-zero NNLS row would make this divide by zero
        # and propagate NaN into the accumulated error — confirm inputs
        final_W = final_W / final_W.sum(1, keepdims=True)
        # getting final V, H (restricted to the held-out categories)
        final_H = H.T[test_categories_idx].T
        final_V_from_WH = final_W.dot(final_H)
        normalized_V_test = V_test / V_test.sum(1, keepdims=True)
        final_V = normalized_V_test.T[test_categories_idx].T
        # compute error and add to overall: mean absolute deviation on the
        # held-out entries, accumulated over all category folds
        errors += np.sum(np.abs(final_V_from_WH - final_V)) / final_V_from_WH.size
    print("k = %d, fold: %d - completed." % (k, sample_fold))
    return errors
def CV2K_x():
    """
    runs CV2K_x method. using parameters from main.

    Fans out one task per (k, sample_fold) pair to a worker pool; tasks are
    submitted with k as the outer loop, so the flat result list reshapes
    cleanly to (len(Ks), sample_folds).

    Relies on module-level globals: V, Ks, sample_folds, category_folds, args.

    :return: numpy array - errors for each k and sample fold - error matrix (Ks x sample_folds)
    """
    # run method with pool
    with Pool(args.workers) as pool:
        res = [pool.apply_async(CV2K_x_compute_error, (V, k, sample_fold, category_folds)) for
               k in Ks for sample_fold in range(sample_folds)]
        pool.close()
        pool.join()
    # get results from pool (join() above guarantees they are all ready);
    # the original built a redundant `[x for x in res]` copy before reshaping
    errors = np.reshape([x.get() for x in res], (len(Ks), sample_folds))
    return errors
def CV2K_standard():
    """
    runs CV2K standard method. using parameters from main.

    Runs crossval_nmf args.reps times for every rank in Ks on a worker pool.

    Relies on module-level globals: V, Ks, eps, args.

    :return: numpy array - errors for each k and repetition - error matrix (Ks x repetitions)
    """
    # each rank repeated args.reps times: [k1]*reps + [k2]*reps + ...
    rank_cycle = np.array([[i] * args.reps for i in Ks]).flatten()
    # create pool and run nmf
    with Pool(args.workers) as pool:
        res = [pool.apply_async(crossval_nmf, (V, rank, args.maxiter, eps, args.obj, args.reg, args.fraction))
               for rank in rank_cycle]
        pool.close()
        pool.join()
    # get results from pool; the W and H factors (x[0], x[1]) were collected
    # into unused lists in the original — only the validation error x[2] matters
    res = [x.get() for x in res]
    errors = np.reshape(np.array([x[2] for x in res]), (len(np.unique(rank_cycle)), args.reps))
    return errors
def CV2K_auto_binary_search(variant='standard'):
    """
    Automatic search for K via a coarse-to-fine, binary-search-like scan.

    Starts from a few spread-out Ks, evaluates them with the chosen CV2K
    variant, then repeatedly narrows toward the current best K (after
    rollback) by probing midpoints between it and its nearest visited
    neighbors; finishes with a dense window [best_k-3, best_k+3].

    NOTE: mutates the module-level global ``Ks`` (read by CV2K_standard /
    CV2K_x) and leaves it set to all visited Ks, sorted, on return.

    :param variant: 'standard' or 'x' - which CV2K method evaluates each K
    :return: numpy array - error rows for every visited K, sorted by K
    """
    Ks_and_errors = {}
    global Ks
    Ks = [5, 10, 20]  # predetermined starting Ks
    low_bound, up_bound = 1, np.min([80, n, m])
    stop = False
    flag = 0
    while not stop:
        if flag == 1: stop = True  # flag marks the final (dense-window) pass
        if variant == 'standard':
            errors = CV2K_standard()
        else:  # variant == 'x'
            errors = CV2K_x()
        # store errors in dictionary and sort it by key
        for i in range(errors.shape[0]):
            Ks_and_errors[Ks[i]] = errors[i]
        Ks_and_errors = collections.OrderedDict(sorted(Ks_and_errors.items()))
        all_errors = np.vstack(list(Ks_and_errors.values()))
        # load next Ks: best K so far (after rollback) and its position
        sorted_visited_Ks = np.array(list(Ks_and_errors.keys()))
        best_k_idx = rollback(all_errors)
        best_k = sorted_visited_Ks[best_k_idx]
        best_k_global_arg = np.nonzero(sorted_visited_Ks == best_k)[0][0]
        # next candidates: midpoints toward the nearest visited neighbors,
        # or halving/doubling when best_k sits at the edge of the visited range
        if best_k_global_arg == 0: new_Ks = [max(low_bound, best_k//2), best_k + (sorted_visited_Ks[1]-best_k)//2]
        elif best_k_global_arg == len(sorted_visited_Ks)-1: new_Ks = [best_k - (best_k-sorted_visited_Ks[-2])//2,
                                                                      min(best_k*2, up_bound)]
        else: new_Ks = [best_k - (best_k-sorted_visited_Ks[best_k_global_arg-1])//2,
                        best_k + (sorted_visited_Ks[best_k_global_arg+1]-best_k)//2]
        Ks = np.setdiff1d(new_Ks, sorted_visited_Ks)
        if len(Ks) == 0:
            flag = 1  # indicates final loop -- 7 size window [k-3, k+3]
            Ks = np.setdiff1d(np.arange(max(low_bound, best_k-3), min(best_k+4, up_bound+1)), sorted_visited_Ks)
            if len(Ks) == 0: stop = True
    # update global Ks and final errors
    Ks = np.array(list(Ks_and_errors.keys()))
    final_errors = np.vstack(list(Ks_and_errors.values()))
    return final_errors
def produce_figure(rollback=True):
    """
    produces a figure that shows the results and saves it to run_dir/figure.pdf

    Relies on module-level globals: Ks, errors, best_k, best_k_after_rollback,
    args, n, m, run_dir.

    :param rollback: bool - if True, highlight the K chosen after rolling
        backwards from the K with minimum median error (note: this parameter
        shadows the module-level rollback() function inside this body)
    """
    print("Tested Ks: ", Ks, flush=True)
    medians = np.nanmedian(errors, axis=1)
    print("Median values: ", np.log(medians), flush=True)
    print("Best K by median value: ", best_k, flush=True)
    title_str = args.data + "\n%dx%d, reps/folds: %d" % (n, m, args.reps)
    plt.title(title_str)
    plt.xlabel('Rank')
    plt.ylabel('log(Imputation Error)')
    rank_cycle = np.array([[i] * args.reps for i in Ks]).flatten()
    # horizontal jitter so repeated runs at the same rank stay visible;
    # was a bare `randn` (resolved only via the CV2K_cv star import) —
    # now fully qualified for robustness
    plt.plot(rank_cycle + np.random.randn(rank_cycle.size) * .1, np.log(errors.flatten()), '.k', markersize=1)  # scatter
    plt.plot(Ks, np.log(medians), 'b--', zorder=-1, label='Validation median (best: %d)' % best_k)
    plt.legend(loc='best')
    plt.xticks(Ks.astype(int))
    if rollback:
        # narrow vertical band marking the rolled-back K
        plt.axvspan(best_k_after_rollback - 0.05, best_k_after_rollback + 0.05, color='gray', alpha=0.5)
    plt.savefig(run_dir + '/figure.pdf')
    plt.clf()
if __name__ == '__main__':
    # parse script parameters
    parser = argparse.ArgumentParser(description='CV2K')
    parser.add_argument('--version', type=str, default='standard', choices=['standard', 'x'], help='standard / x')
    parser.add_argument('--auto', type=str, default='n', choices=['y', 'n'], help='automatic search vs given range')
    parser.add_argument('--data', type=str, help='file_name.npy - n x m catalog: n rows are samples, '
                                                 'm columns are mutation types')
    parser.add_argument('--fraction', type=float, default=0.1, help='validation fraction')
    parser.add_argument('--reps', type=int, default=30, help='number of repeats per rank')
    parser.add_argument('--maxiter', type=int, default=2000, help='max number of iterations')
    parser.add_argument('--bottom_k', type=int, default=1, help='lower boundary')
    parser.add_argument('--top_k', type=int, default=10, help='upper boundary')
    parser.add_argument('--stride', type=int, default=1, help='stride')
    parser.add_argument('--workers', type=int, default=20, help='number of workers')
    parser.add_argument('--obj', type=str, default='kl', choices=['kl', 'euc', 'is'],
                        help='euclidean vs kl-divergence vs IS')
    parser.add_argument('--reg', type=float, default=0, help='regularization rate')
    args = parser.parse_args()
    # load the n x m mutation catalog (samples x mutation types)
    V = np.load(args.data)
    n, m = V.shape
    # define convergence criterion
    eps = 1e-6
    # create rank cycle (the Ks to evaluate; overwritten in auto mode)
    Ks = np.arange(args.bottom_k, args.top_k+1, args.stride)
    # NOTE(review): `time` and `datetime` are not imported in this file's
    # visible import block — presumably re-exported by CV2K_cv's star
    # import; confirm
    start = time.time()
    timestamp = datetime.datetime.now().strftime('%d-%m_%H-%M-%S')
    # create output directory (named after data, dimensions and parameters)
    run_dir = args.data + "_%dx%d_reps-%d_obj-%s_frac-%.4f-reg-%.2f_" % (n, m, args.reps, args.obj, args.fraction,
                                                                         args.reg) + timestamp
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    # main procedure
    if args.version == 'standard':
        if args.auto == 'n':
            errors = CV2K_standard()
        else:
            errors = CV2K_auto_binary_search(args.version)
    else:  # args.version == 'x'
        # define default number of category and sample folds
        category_folds = V.shape[1]  # m category folds
        sample_folds = 10
        # shuffling indices - shouldn't be necessary most of the time
        indices = np.arange(n)
        np.random.shuffle(indices)
        V = V[indices].copy()
        indices = np.arange(m)
        np.random.shuffle(indices)
        V = V.T[indices].T.copy()
        # splitting indices (globals read by CV2K_x_compute_error workers)
        indices = np.arange(n)
        fold_sample_indices = np.array_split(indices, sample_folds)
        indices = np.arange(m)
        fold_category_indices = np.split(indices, category_folds)
        # get errors using our method
        if args.auto == 'n':
            errors = CV2K_x()
        else:
            errors = CV2K_auto_binary_search(args.version)
    end = time.time()
    print("Completed!")
    # save errors to output directory
    np.save(run_dir + "/errors.npy", errors)
    # redirect print to output file
    # NOTE(review): the file is closed manually at the end and stdout is
    # restored without a try/finally — an exception below would leave
    # stdout redirected; consider a `with` block
    orig_stdout = sys.stdout
    f = open(run_dir + '/output.txt', 'w')
    sys.stdout = f
    # print parameters and runtime to output file
    print(args, flush=True)
    print("runtime: " + str(datetime.timedelta(seconds=end - start)))
    # best K by raw median error
    best_k_arg = np.argmin(np.nanmedian(errors, axis=1))
    best_k = Ks[best_k_arg]
    # rollback (smallest statistically-equivalent K)
    best_k_after_rollback_idx = rollback(errors)
    best_k_after_rollback = Ks[best_k_after_rollback_idx]
    # results figure
    produce_figure(rollback=True)
    print("Rollback from median: %d\n" % best_k_after_rollback)
    sys.stdout = orig_stdout
    f.close()