-
Notifications
You must be signed in to change notification settings - Fork 0
/
rankpruning.py
718 lines (558 loc) · 25.4 KB
/
rankpruning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
# coding: utf-8
# In[ ]:
from __future__ import print_function
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.model_selection import StratifiedKFold
import numpy as np
import math
# In[ ]:
MIN_NUM_PER_CLASS = 10
# In[ ]:
# Relevant helper functions exposed to rankpruning module.
def assert_inputs_are_valid(X, s, prob_s_eq_1 = None):
    '''Validate the types and values of X, s, and (optionally) prob_s_eq_1.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.
    s : np.array
        A binary vector of labels; every entry must be exactly 0 or 1.
    prob_s_eq_1 : np.array or None
        Per-example probabilities P(s==1|x); every entry must be in [0, 1].

    Raises
    ------
    TypeError
        If X, s, or prob_s_eq_1 is not a numpy array.
    ValueError
        If lengths mismatch, probabilities fall outside [0, 1], or s
        contains values other than 0 and 1.
    '''
    if prob_s_eq_1 is not None:
        if not isinstance(prob_s_eq_1, (np.ndarray, np.generic)):
            raise TypeError("prob_s_eq_1 should be a numpy array.")
        if len(prob_s_eq_1) != len(s):
            raise ValueError("prob_s_eq_1 and s must have same length.")
        # Check for valid probabilities. Vectorized: the original looped in
        # Python over every element, which is O(N) interpreter overhead.
        if np.any((prob_s_eq_1 < 0) | (prob_s_eq_1 > 1)):
            raise ValueError("Values in prob_s_eq_1 must be between 0 and 1.")
    if not isinstance(s, (np.ndarray, np.generic)):
        raise TypeError("s should be a numpy array.")
    if not isinstance(X, (np.ndarray, np.generic)):
        raise TypeError("X should be a numpy array.")
    # The original per-element test `i < 0 or i > 1 or (0 < i < 1)` reduces
    # to "i is neither 0 nor 1"; checked in one vectorized pass.
    if np.any((s != 0) & (s != 1)):
        raise ValueError("s should only contain 0 or 1 values.")
def compute_conf_counts_noise_rates_from_probabilities(
    s,
    prob_s_eq_1,
    positive_lb_threshold = None,
    negative_ub_threshold = None,
    verbose = False,
):
    '''Estimate the noise rates rh1 = P(s=0|y=1) and rh0 = P(s=1|y=0)
    from noisy labels s and out-of-sample probabilities prob_s_eq_1 using
    confident counts.

    Important: prob_s_eq_1 must be out-of-sample (e.g. cross-validated)
    holdout probabilities, otherwise overfitting may occur. 'rh' stands
    for rho hat (estimated noise rate).

    Parameters
    ----------
    s : np.array
        A binary vector of labels, s, which may contain mislabeling.
    prob_s_eq_1 : iterable (list or np.array)
        The probability, for each example, that it is s==1: P(s==1|x).
    positive_lb_threshold : float
        P(s^=1|s=1). Examples with predicted probability at or above this
        threshold are counted as having hidden label y = 1. Used only for
        noise-rate estimation, never for pruning. Defaults to the mean
        predicted probability of the s==1 examples.
    negative_ub_threshold : float
        P(s^=1|s=0). Examples with predicted probability at or below this
        threshold are counted as having hidden label y = 0. Used only for
        noise-rate estimation, never for pruning. Defaults to the mean
        predicted probability of the s==0 examples.
    verbose : bool
        Set to True to print additional information while running.
    '''
    # Fall back to per-class mean probabilities as confidence thresholds.
    if positive_lb_threshold is None:
        positive_lb_threshold = np.mean(prob_s_eq_1[s == 1])  # P(s^=1|s=1)
    if negative_ub_threshold is None:
        negative_ub_threshold = np.mean(prob_s_eq_1[s == 0])  # P(s^=1|s=0)
    looks_positive = prob_s_eq_1 >= positive_lb_threshold
    looks_negative = prob_s_eq_1 <= negative_ub_threshold
    # Confident counts in each (observed label, inferred label) cell.
    n_s0_y1 = sum(looks_positive & (s == 0))  # confident s=0, y=1
    n_s1_y1 = sum(looks_positive & (s == 1))  # confident s=1, y=1
    n_s0_y0 = sum(looks_negative & (s == 0))  # confident s=0, y=0
    n_s1_y0 = sum(looks_negative & (s == 1))  # confident s=1, y=0
    if verbose:
        print("N_most_positive_size", n_s0_y1)
        print("P_most_positive_size", n_s1_y1)
        print("N_least_positive_size", n_s0_y0)
        print("P_least_positive_size", n_s1_y0)
    # Confident counts estimator for p(s=0|y=1) ~ |s=0 and y=1| / |y=1|.
    # A zero denominator deliberately yields np.NaN (numpy scalar division).
    rh1_conf = n_s0_y1 / float(n_s0_y1 + n_s1_y1)
    # Confident counts estimator for p(s=1|y=0) ~ |s=1 and y=0| / |y=0|.
    # A zero denominator deliberately yields np.NaN (numpy scalar division).
    rh0_conf = n_s1_y0 / float(n_s0_y0 + n_s1_y0)
    # Clamp both estimates into the valid range [0, 1).
    rh0_conf = min(max(rh0_conf, 0.0), 0.9999)
    rh1_conf = min(max(rh1_conf, 0.0), 0.9999)
    if verbose:
        print("Est count of s = 1 and y = 1:", n_s1_y1)
        print("Est count of s = 0 and y = 1:", n_s0_y1)
        print("Est count of s = 1 and y = 0:", n_s1_y0)
        print("Est count of s = 0 and y = 0:", n_s0_y0)
        print("rh1_conf:", rh1_conf)
        print("rh0_conf:", rh0_conf)
    return rh1_conf, rh0_conf
def compute_noise_rates_and_cv_pred_proba(
    X,
    s,
    clf = None,
    cv_n_folds = 3,
    positive_lb_threshold = None,
    negative_ub_threshold = None,
    verbose = False,
):
    '''Compute the out-of-sample predicted probabilities P(s=k|x) for every
    example in X via stratified cross-validation, together with the
    confident-counts noise-rate estimates rh1 = P(s=0|y=1) and
    rh0 = P(s=1|y=0) averaged over the cv folds (NaN/inf fold estimates
    are omitted from the average).

    'rh' stands for rho hat (estimated noise rate).

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.
    s : np.array
        A binary vector of labels, s, which may contain mislabeling.
    clf : sklearn.classifier or equivalent
        Classifier with fit() and predict_proba() defined. If None
        (default), a fresh LogisticRegression is created per call.
        Fix: the previous default `clf = logreg()` was a single instance
        created at import time and shared (and mutated by fit) across
        every call that relied on the default.
    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.
    positive_lb_threshold : float
        P(s^=1|s=1). Examples with predicted probability at or above this
        threshold are counted as having hidden label y = 1. Used only for
        noise-rate estimation, not pruning. Default is None (per-fold mean).
    negative_ub_threshold : float
        P(s^=1|s=0). Examples with predicted probability at or below this
        threshold are counted as having hidden label y = 0. Used only for
        noise-rate estimation, not pruning. Default is None (per-fold mean).
    verbose : bool
        Set to True to print additional information while running.

    Returns
    -------
    (rh1, rh0, prob_s) where prob_s is an (N, K) array of out-of-sample
    predicted probabilities.
    '''
    if clf is None:
        clf = logreg()
    # Number of classes and number of training examples.
    K = len(np.unique(s))
    N = len(s)
    # Stratified CV folds preserve the fraction of noisy positive and
    # noisy negative examples in each fold.
    # NOTE(review): shuffle=True without a random_state makes the folds,
    # and therefore the returned estimates, nondeterministic across calls
    # — confirm this is intended.
    kf = StratifiedKFold(n_splits = cv_n_folds, shuffle = True)
    # Initialize per-fold estimate storage and the final prob_s array.
    rh1_per_cv_fold = []
    rh0_per_cv_fold = []
    prob_s = np.zeros((N, K))
    # Split X and s into "cv_n_folds" stratified folds.
    for cv_train_idx, cv_holdout_idx in kf.split(X, s):
        # Select the training and holdout cross-validated sets.
        X_train_cv, X_holdout_cv = X[cv_train_idx], X[cv_holdout_idx]
        s_train_cv, s_holdout_cv = s[cv_train_idx], s[cv_holdout_idx]
        # Fit on the training split; predict out-of-sample on the holdout.
        clf.fit(X_train_cv, s_train_cv)
        prob_s_cv = clf.predict_proba(X_holdout_cv)  # P(s = k|x)
        prob_s[cv_holdout_idx] = prob_s_cv
        # Confident-counts noise-rate estimates for this holdout fold.
        rh1_cv, rh0_cv = compute_conf_counts_noise_rates_from_probabilities(
            s = s_holdout_cv,
            prob_s_eq_1 = prob_s_cv[:, 1],  # P(s = 1|x)
            positive_lb_threshold = positive_lb_threshold,
            negative_ub_threshold = negative_ub_threshold,
            verbose = verbose,
        )
        rh1_per_cv_fold.append(rh1_cv)
        rh0_per_cv_fold.append(rh0_cv)
    # Return the mean rh estimates (omitting NaN/inf values) and prob_s.
    return (
        _mean_without_nan_inf(rh1_per_cv_fold),
        _mean_without_nan_inf(rh0_per_cv_fold),
        prob_s,
    )
def compute_cv_predicted_probabilities(
    X,
    y, # labels, can be noisy (s) or not noisy (y).
    clf = None,
    cv_n_folds = 3,
    verbose = False,
):
    '''Compute the out-of-sample predicted probabilities [P(s=k|x)] for
    every example in X using cross-validation. Output is a np.array of
    shape (N, K) where N is the number of training examples and K is the
    number of classes.

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.
    y : np.array
        A binary vector of labels, which may or may not contain mislabeling.
    clf : sklearn.classifier or equivalent
        Classifier with fit() and predict_proba() defined. If None
        (default), a fresh LogisticRegression is created per call.
        Fix: the previous default `clf = logreg()` was one shared,
        import-time instance mutated by every call using the default.
    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.
    verbose : bool
        Set to True to print additional information while running.
    '''
    # Instantiate here (not in the signature) so each call gets its own
    # untouched classifier.
    if clf is None:
        clf = logreg()
    # Delegate to the combined routine and keep only the probabilities.
    return compute_noise_rates_and_cv_pred_proba(
        X = X,
        s = y,
        clf = clf,
        cv_n_folds = cv_n_folds,
        verbose = verbose,
    )[-1]
def compute_conf_counts_noise_rates(
    X,
    s,
    clf = None,
    cv_n_folds = 3,
    positive_lb_threshold = None,
    negative_ub_threshold = None,
    verbose = False,
):
    '''Compute the rho hat (rh) confident-counts estimate of the noise
    rates from X and s: rh1 = P(s=0|y=1) (fraction of positive examples
    mislabeled as negative) and rh0 = P(s=1|y=0) (fraction of negative
    examples mislabeled as positive).

    Parameters
    ----------
    X : np.array
        Input feature matrix (N, D), 2D numpy array.
    s : np.array
        A binary vector of labels, s, which may contain mislabeling.
    clf : sklearn.classifier or equivalent
        Classifier with fit() and predict_proba() defined. If None
        (default), a fresh LogisticRegression is created per call.
        Fix: the previous default `clf = logreg()` was one shared,
        import-time instance mutated by every call using the default.
    cv_n_folds : int
        The number of cross-validation folds used to compute
        out-of-sample probabilities for each example in X.
    positive_lb_threshold : float
        P(s^=1|s=1). Examples with predicted probability at or above this
        threshold are counted as having hidden label y = 1. Used only for
        noise-rate estimation, not pruning. Default is None.
    negative_ub_threshold : float
        P(s^=1|s=0). Examples with predicted probability at or below this
        threshold are counted as having hidden label y = 0. Used only for
        noise-rate estimation, not pruning. Default is None.
    verbose : bool
        Set to True to print additional information while running.
    '''
    # Instantiate here (not in the signature) so each call gets its own
    # untouched classifier.
    if clf is None:
        clf = logreg()
    # Delegate to the combined routine and drop the probabilities.
    return compute_noise_rates_and_cv_pred_proba(
        X = X,
        s = s,
        clf = clf,
        cv_n_folds = cv_n_folds,
        positive_lb_threshold = positive_lb_threshold,
        negative_ub_threshold = negative_ub_threshold,
        verbose = verbose,
    )[:-1]
def compute_ps1_py1_pi1_pi0(s, rh1, rh0):
    '''Derive ps1 := P(s=1), py1 := P(y=1), and the inverse noise rates
    pi1 := P(y=0|s=1) and pi0 := P(y=1|s=0) from the observed noisy labels
    and the (estimated or known) noise rates.

    Parameters
    ----------
    s : np.array
        A binary vector of labels, s, which may contain mislabeling.
    rh1 : float
        P(s=0|y=1). Fraction of positive examples mislabeled as negative
        examples (frac_pos2neg).
    rh0 : float
        P(s=1|y=0). Fraction of negative examples mislabeled as positive
        examples (frac_neg2pos).
    '''
    # Observed positive fraction.
    ps1 = sum(s) / float(len(s))
    # Invert the label-noise process: ps1 = py1*(1 - rh1) + (1 - py1)*rh0.
    py1 = (ps1 - rh0) / float(1 - rh1 - rh0)
    # Bayes rule gives the inverse noise rates from rh0, rh1, py1, ps1.
    pi1 = rh0 * (1 - py1) / float(ps1)
    pi0 = (rh1 * py1) / float(1 - ps1)
    # Clamp the inverse noise rates into the valid range [0, 1).
    pi1 = min(max(pi1, 0.0), 0.9999)
    pi0 = min(max(pi0, 0.0), 0.9999)
    return ps1, py1, pi1, pi0
def get_noise_indices(
    s,
    prob_s_eq_1,
    frac_of_noise = 1.0,
    pi1 = None,
    pi0 = None,
    num_to_remove_per_class = None,
    verbose = False,
):
    '''Returns a boolean mask over s marking the most likely (confident)
    label errors. The number of marked examples is controlled by
    frac_of_noise; when frac_of_noise = 1.0, all "confidently" estimated
    noise indices are marked.

    Parameters
    ----------
    s : np.array
        A binary vector of labels, s, which may contain mislabeling.
    prob_s_eq_1 : iterable (list or np.array)
        The probability, for each example, that it is s==1: P(s==1|x).
    frac_of_noise : float
        Value in range (0, 1] that determines the fraction of noisy example
        indices to mark, per class k:
        frac_of_noise * inverse_noise_rate_class_k * num_examples_with_s_equal_k.
    pi1 : float
        P(y=0|s=1). Fraction of observed positive examples that are
        mislabeled. If None, computed from prob_s_eq_1 and s.
    pi0 : float
        P(y=1|s=0). Fraction of observed negative examples that are
        mislabeled. If None, computed from prob_s_eq_1 and s.
    num_to_remove_per_class : list of int of length K (# of classes)
        e.g. num_to_remove_per_class = [5, 10] marks the 5 most likely
        mislabeled examples in class s = 0 and the 10 most likely in
        class s = 1. Overrides the pi1/pi0-based counts.
    verbose : bool
        Set to True to print additional information while running.

    Returns
    -------
    np.array of bool
        True where the example is flagged as likely mislabeled.
    '''
    size_P_noisy = sum(s == 1)  # number of noisy-positive examples
    size_N_noisy = sum(s == 0)  # number of noisy-negative examples
    # Estimate the inverse noise rates if either was not provided.
    if pi1 is None or pi0 is None:
        rh1, rh0 = compute_conf_counts_noise_rates_from_probabilities(s, prob_s_eq_1)
        _, _, pi1, pi0 = compute_ps1_py1_pi1_pi0(s, rh1, rh0)
    if num_to_remove_per_class is None:
        # k1 and k0 are the expected number of mislabeled examples in the
        # noisy positive and noisy negative sets (scaled by frac_of_noise).
        k1 = size_P_noisy * pi1 * frac_of_noise
        k0 = size_N_noisy * pi0 * frac_of_noise
    else:
        k1 = num_to_remove_per_class[1]
        k0 = num_to_remove_per_class[0]
    # Cap the prune counts so at least MIN_NUM_PER_CLASS examples remain
    # in each class, and never go negative.
    k1 = max(min(int(k1), size_P_noisy - MIN_NUM_PER_CLASS), 0)
    k0 = max(min(int(k0), size_N_noisy - MIN_NUM_PER_CLASS), 0)
    if verbose:
        print('k1: ', k1, ', k0: ', k0)
    # Selection via np.partition (introselect) finds the k-th order
    # statistic in O(n) without a full sort.
    # Don't prune if pi1 = 0 or there are not MIN_NUM_PER_CLASS in P_noisy.
    if (pi1 > 0 and size_P_noisy > MIN_NUM_PER_CLASS) or num_to_remove_per_class is not None:
        # (k1+1)-th smallest P(s=1|x) among s==1; examples strictly below
        # this threshold are flagged.
        kth_smallest = np.partition(prob_s_eq_1[s == 1], k1)[k1]
    else:
        kth_smallest = -1.0  # sentinel below any probability: prune nothing
    # Don't prune if pi0 = 0 or there are not MIN_NUM_PER_CLASS in N_noisy.
    if (pi0 > 0 and size_N_noisy > MIN_NUM_PER_CLASS) or num_to_remove_per_class is not None:
        # (k0+1)-th largest P(s=1|x) among s==0; examples strictly above
        # this threshold are flagged.
        kth_largest = -np.partition(-prob_s_eq_1[s == 0], k0)[k0]
    else:
        kth_largest = 2.0  # sentinel above any probability: prune nothing
    if verbose:
        print('kth_smallest: ', kth_smallest, ', kth_largest: ', kth_largest)
    # Flag negatives that look too positive and positives that look too negative.
    noise_mask = ((prob_s_eq_1 > kth_largest) & (s == 0)) | ((prob_s_eq_1 < kth_smallest) & (s == 1))
    return noise_mask
# In[ ]:
def _mean_without_nan_inf(arr, replacement = None):
'''Private helper method for computing the mean
of a numpy array or iterable by replacing NaN and inf
values with a replacement value or ignore those values
if replacement = None.
Parameters
----------
arr : iterable (list or np.array)
Any iterable that may contain NaN or inf values.
replacement : float
Replace NaN and inf values in arr with this value.
'''
if replacement is not None:
return np.mean(
[replacement if math.isnan(x) or math.isinf(x) else x for x in arr]
)
x_real = [x for x in arr if not math.isnan(x) and not math.isinf(x)]
if len(x_real) == 0:
raise ValueError("All rho_conf estimates are NaN. Check that" "positive_lb_threshold and negative_ub_threshold values are not" "too extreme (near 1 or 0), resulting in division by zero.")
else:
return np.mean(x_real)
# In[ ]:
class RankPruning(object):
    '''
    Rank Pruning is a state-of-the-art algorithm (2017) for
    binary semi-supervised classification P̃Ñ learning with signficant mislabeling in
    both the noisy Negative (N) and noisy Positive (P) sets.
    Rank Pruning also achieves state-of-the-art performance for positive-unlabeled
    learning (PU learning) where a subset of positive examples is given and
    all other examples are unlabeled and assumed to be negative examples.
    Rank Pruning works by "learning from confident examples." Confident examples are
    identified as those examples with predicted probability near their training label.
    Given any classifier having the predict_proba() method, an input feature matrix, X,
    and a binary vector of labels, s, which may contain mislabeling, Rank Pruning
    estimates the classifications that would be obtained if the hidden, true labels, y,
    had instead been provided to the classifier during training.

    Parameters
    ----------
    clf : sklearn.classifier or equivalent
        Stores the classifier used in Rank Pruning.
        Default classifier used is logistic regression.
    frac_pos2neg : float
        P(s=0|y=1). Fraction of positive examples mislabeled as negative examples.
        Typically, leave this set to its default value of None. Only provide this
        value if you know the fraction of mislabeling already. This value is
        called rho1 in the literature.
    frac_neg2pos : float
        P(s=1|y=0). Fraction of negative examples mislabeled as positive examples.
        Typically, leave this set to its default value of None. Only provide this
        value if you know the fraction of mislabeling already. This value is
        called rho0 in the literature.
    '''
    def __init__(self,
        frac_pos2neg = None,
        frac_neg2pos = None,
        clf = None,
    ):
        # rh1 + rh0 < 1 is a necessary condition for Rank Pruning; it is
        # checked only when both rates are user-supplied (otherwise they
        # are estimated later in fit()).
        if frac_pos2neg is not None and frac_neg2pos is not None:
            # Verify that rh1 + rh0 < 1 and pi0 + pi1 < 1.
            if frac_pos2neg + frac_neg2pos >= 1:
                raise Exception("frac_pos2neg + frac_neg2pos < 1 is " + "a necessary condition for Rank Pruning.")
        self.rh1 = frac_pos2neg
        self.rh0 = frac_neg2pos
        self.clf = logreg() if clf is None else clf
    def get_fraction_of_positives_mislabeled_as_negative(self):
        '''Accessor for rh1 = P(s=0|y=1), the positive->negative noise rate.'''
        return self.rh1
    def get_fraction_of_negatives_mislabeled_as_positive(self):
        '''Accessor for rh0 = P(s=1|y=0), the negative->positive noise rate.'''
        return self.rh0
    def get_fraction_mislabeling_in_positive_set(self):
        '''Accessor for pi1 = P(y=0|s=1).
        NOTE(review): self.pi1 is only set inside fit(); calling this
        before fit() raises AttributeError — confirm intended.'''
        return self.pi1
    def get_fraction_mislabeling_in_negative_set(self):
        '''Accessor for pi0 = P(y=1|s=0).
        NOTE(review): self.pi0 is only set inside fit(); calling this
        before fit() raises AttributeError — confirm intended.'''
        return self.pi0
    def fit(
        self,
        X,
        s,
        cv_n_folds = 3,
        pulearning = None,
        prob_s_eq_1 = None,
        positive_lb_threshold = None,
        negative_ub_threshold = None,
        get_pruned_ind = False,
        verbose = False,
    ):
        '''This method implements the Rank Pruning mantra 'learning with
        confident examples.' It fits the classifier (self.clf) to (X, s),
        accounting for the noise in both the positive and negative sets.

        Parameters
        ----------
        X : np.array
            Input feature matrix (N, D), 2D numpy array.
        s : np.array
            A binary vector of labels, s, which may contain mislabeling.
        cv_n_folds : int
            The number of cross-validation folds used to compute
            out-of-sample probabilities for each example in X.
        pulearning : bool
            Set to True for PU learning, which assumes positive examples
            are perfectly labeled and therefore frac_neg2pos = 0 (rh0 = 0).
            If you are not sure, leave pulearning = None (default).
        prob_s_eq_1 : iterable (list or np.array)
            The probability, for each example, that it is s==1: P(s==1|x).
            If None (default), it is computed via cross-validation.
        positive_lb_threshold : float
            P(s^=1|s=1). Confident-count threshold for hidden label y = 1.
            Used for noise-rate estimation only, not pruning. Default None.
        negative_ub_threshold : float
            P(s^=1|s=0). Confident-count threshold for hidden label y = 0.
            Used for noise-rate estimation only, not pruning. Default None.
        get_pruned_ind : bool
            If True, return the tuple (X_pruned, s_pruned) of confident
            examples instead of fitting self.clf.
        verbose : bool
            Set to True to print additional information while running.
        '''
        # Check if we are in the PU learning setting.
        if pulearning is None:
            pulearning = (self.rh0 == 0)
        assert_inputs_are_valid(X, s, prob_s_eq_1)
        # Compute noise rates (fraction of mislabeling) for the positive
        # and negative sets. Also compute P(s=1|x) if it was not given.
        if prob_s_eq_1 is None or self.rh1 is None or self.rh0 is None:
            if prob_s_eq_1 is None:
                rh1, rh0, prob_s = compute_noise_rates_and_cv_pred_proba(
                    X = X,
                    s = s,
                    clf = self.clf,
                    cv_n_folds = cv_n_folds,
                    positive_lb_threshold = positive_lb_threshold,
                    negative_ub_threshold = negative_ub_threshold,
                    verbose = verbose,
                )
                # Only P(s=1|x) is needed for the binary case.
                prob_s_eq_1 = prob_s[:,1]
                del prob_s  # free the (N, K) array; only column 1 is kept
            else:
                rh1, rh0 = compute_conf_counts_noise_rates_from_probabilities(
                    s = s,
                    prob_s_eq_1 = prob_s_eq_1,
                    positive_lb_threshold = positive_lb_threshold,
                    negative_ub_threshold = negative_ub_threshold,
                    verbose = verbose,
                )
            # User-provided noise rates take precedence over estimates.
            self.rh1 = self.rh1 if self.rh1 is not None else rh1
            self.rh0 = self.rh0 if self.rh0 is not None else rh0
        # In the PU learning setting positives are assumed perfectly labeled.
        self.rh0 = 0.0 if pulearning else self.rh0
        # Compute ps1 := P(s=1), py1 := P(y=1), and inverse noise rates pi1, pi0.
        self.ps1, self.py1, self.pi1, self.pi0 = compute_ps1_py1_pi1_pi0(s, self.rh1, self.rh0)
        # Get the boolean mask of the examples we wish to prune.
        prune_mask = get_noise_indices(s, prob_s_eq_1, pi1 = self.pi1, pi0 = self.pi0)
        # Keep only the confident (unpruned) training set X_conf, s_conf.
        X_mask = ~prune_mask
        X_pruned = X[X_mask]
        s_pruned = s[X_mask]
        if get_pruned_ind:
            # NOTE(review): despite the name, this returns the pruned
            # (X, s) data, not indices — confirm intended.
            return (X_pruned, s_pruned)
        # Re-weight examples in the loss function for the final fitting
        # s.t. the "apparent" original number of examples in P and N
        # is preserved, even though the pruned set may differ.
        sample_weight = np.ones(np.shape(s_pruned)) / float(1 - self.rh1)
        sample_weight[s_pruned == 0] = 1.0 / float(1 - self.rh0)
        self.clf.fit(X_pruned, s_pruned, sample_weight = sample_weight)
    def predict(self, X):
        '''
        Returns a binary vector of predictions from the fitted classifier.
        '''
        return self.clf.predict(X)
    def predict_proba(self, X):
        '''
        Returns a vector of probabilities, P(y=1) only, for each example in X.
        '''
        return self.clf.predict_proba(X)[:,1]