classification_metrics.py
import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
def entropy_based_uncertainty_decomposition(y_prob_samples):
    """ Entropy-based decomposition [2]_ of predictive uncertainty into aleatoric and epistemic components.

    References:
        .. [2] Depeweg, S., Hernandez-Lobato, J. M., Doshi-Velez, F., & Udluft, S. (2018, July). Decomposition of
            uncertainty in Bayesian deep learning for efficient and risk-sensitive learning. In International
            Conference on Machine Learning (pp. 1184-1193). PMLR.

    Args:
        y_prob_samples: list of array-like of shape (n_samples, n_classes) containing class prediction probabilities
            corresponding to samples from the model posterior.

    Returns:
        tuple:
            - total_uncertainty: entropy of the mean predictive distribution.
            - aleatoric_uncertainty: aleatoric component of the total uncertainty (mean entropy of the per-draw
                predictive distributions).
            - epistemic_uncertainty: epistemic component of the total uncertainty.
    """
    y_preds_samples_stacked = np.stack(y_prob_samples)
    preds_mean = np.mean(y_preds_samples_stacked, 0)
    total_uncertainty = entropy(preds_mean, axis=1)
    aleatoric_uncertainty = np.mean(
        np.concatenate([entropy(y_pred, axis=1).reshape(-1, 1) for y_pred in y_prob_samples], axis=1),
        axis=1)
    epistemic_uncertainty = total_uncertainty - aleatoric_uncertainty
    return total_uncertainty, aleatoric_uncertainty, epistemic_uncertainty
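

# Illustrative sketch (not part of the original module): exercise the
# decomposition on synthetic posterior draws. The generator, sizes, and
# Dirichlet prior below are made-up assumptions for demonstration only.
def _demo_entropy_decomposition(n_draws=20, n_samples=100, n_classes=3, seed=0):
    rng = np.random.default_rng(seed)
    # Each element is one set of class probabilities, as if drawn from a posterior.
    y_prob_samples = [rng.dirichlet(np.ones(n_classes), size=n_samples) for _ in range(n_draws)]
    total, aleatoric, epistemic = entropy_based_uncertainty_decomposition(y_prob_samples)
    # The two components sum to the total uncertainty by construction.
    assert np.allclose(total, aleatoric + epistemic)
    print("mean total/aleatoric/epistemic:", total.mean(), aleatoric.mean(), epistemic.mean())
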
def multiclass_brier_score(y_true, y_prob):
    """Brier score for multi-class.

    Args:
        y_true: array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like of shape (n_samples, n_classes).
            Probability scores from the base model.

    Returns:
        float: Brier score.
    """
    assert len(y_prob.shape) > 1, "y_prob should be array-like of shape (n_samples, n_classes)"
    y_target = np.zeros_like(y_prob)
    # One-hot encode the labels row by row; the original `y_target[:, y_true] = 1.0`
    # set the listed columns for every row, which is incorrect.
    y_target[np.arange(len(y_true)), y_true] = 1.0
    return np.mean(np.sum((y_target - y_prob) ** 2, axis=1))
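

# Illustrative sketch (not part of the original module): a perfectly confident,
# perfectly correct classifier has Brier score 0. Values are made up.
def _demo_brier_score():
    y_true = np.array([0, 1, 2, 1])
    y_prob = np.eye(3)[y_true]          # one-hot probabilities, always correct
    assert multiclass_brier_score(y_true, y_prob) == 0.0
    uniform = np.full((4, 3), 1.0 / 3)  # maximally uncertain predictions
    print("uniform-prediction Brier score:", multiclass_brier_score(y_true, uniform))
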
def area_under_risk_rejection_rate_curve(y_true, y_prob, y_pred=None, selection_scores=None, risk_func=accuracy_score,
                                         attributes=None, num_bins=10, subgroup_ids=None,
                                         return_counts=False):
    """ Computes the risk vs. rejection rate curve and the area under this curve. Similar to risk-coverage
    curves [3]_, except that rejection rate is used in place of coverage.

    References:
        .. [3] Franc, Vojtech, and Daniel Prusa. "On discriminative learning of prediction uncertainty."
            In International Conference on Machine Learning, pp. 1963-1971. 2019.

    Args:
        y_true: array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like of shape (n_samples, n_classes).
            Probability scores from the base model.
        y_pred: array-like of shape (n_samples,)
            predicted labels.
        selection_scores: scores corresponding to certainty in the predicted labels.
        risk_func: risk function under consideration.
        attributes: (optional) if the risk function is a fairness metric, also pass the protected attribute name.
        num_bins: number of bins.
        subgroup_ids: (optional) selectively compute risk on a subgroup of the samples specified by subgroup_ids.
        return_counts: set to True to also return the per-bin counts below.

    Returns:
        float or tuple:
            - aurrrc (float): area under the risk vs. rejection rate curve.
            - rejection_rates (list): rejection rate for each bin (returned only if return_counts is True).
            - selection_thresholds (list): selection threshold for each bin (returned only if return_counts is True).
            - risks (list): risk in each bin (returned only if return_counts is True).
    """
    if selection_scores is None:
        assert len(y_prob.shape) > 1, "y_prob should be array-like of shape (n_samples, n_classes)"
        selection_scores = y_prob[np.arange(y_prob.shape[0]), np.argmax(y_prob, axis=1)]

    if y_pred is None:
        assert len(y_prob.shape) > 1, "y_prob should be array-like of shape (n_samples, n_classes)"
        y_pred = np.argmax(y_prob, axis=1)

    order = np.argsort(selection_scores)[::-1]
    rejection_rates = []
    selection_thresholds = []
    risks = []
    for bin_id in range(num_bins):
        samples_in_bin = len(y_true) // num_bins
        selection_threshold = selection_scores[order[samples_in_bin * (bin_id + 1) - 1]]
        selection_thresholds.append(selection_threshold)
        ids = selection_scores >= selection_threshold
        if sum(ids) > 0:
            if attributes is None:
                if isinstance(y_true, pd.Series):
                    y_true_numpy = y_true.values
                else:
                    y_true_numpy = y_true
                if subgroup_ids is None:
                    risk_value = 1.0 - risk_func(y_true_numpy[ids], y_pred[ids])
                else:
                    if sum(subgroup_ids & ids) > 0:
                        risk_value = 1.0 - risk_func(y_true_numpy[subgroup_ids & ids], y_pred[subgroup_ids & ids])
                    else:
                        risk_value = 0.0
            else:
                risk_value = risk_func(y_true.iloc[ids], y_pred[ids], prot_attr=attributes)
        else:
            risk_value = 0.0
        risks.append(risk_value)
        rejection_rates.append(1.0 - 1.0 * sum(ids) / len(y_true))
    aurrrc = np.nanmean(risks)

    if not return_counts:
        return aurrrc
    else:
        return aurrrc, rejection_rates, selection_thresholds, risks
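

# Illustrative sketch (not part of the original module): AURRRC on synthetic
# predictions where higher confidence correlates with correctness, so the risk
# should shrink as more low-confidence samples are rejected. The data-generating
# recipe below is a made-up assumption for demonstration only.
def _demo_aurrrc(seed=0):
    rng = np.random.default_rng(seed)
    y_true = rng.integers(0, 2, size=200)
    # Noisy probabilities that mostly agree with the labels.
    y_prob = np.where(y_true[:, None] == np.arange(2), 0.75, 0.25)
    y_prob = np.clip(y_prob + rng.normal(0, 0.2, size=y_prob.shape), 0.01, 0.99)
    y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)
    aurrrc, rejection_rates, _, risks = area_under_risk_rejection_rate_curve(
        y_true, y_prob, return_counts=True)
    print("AURRRC:", aurrrc)
    # risks[-1] keeps all samples (lowest rejection); risks[0] keeps only the
    # most confident bin (highest rejection).
    print("risk at lowest/highest rejection:", risks[-1], risks[0])
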
def expected_calibration_error(y_true, y_prob, y_pred=None, num_bins=10, return_counts=False):
    """ Computes the reliability curve and the expected calibration error [1]_ .

    References:
        .. [1] Chuan Guo, Geoff Pleiss, Yu Sun, Kilian Q. Weinberger; Proceedings of the 34th International
            Conference on Machine Learning, PMLR 70:1321-1330, 2017.

    Args:
        y_true: array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like of shape (n_samples, n_classes).
            Probability scores from the base model.
        y_pred: array-like of shape (n_samples,)
            predicted labels.
        num_bins: number of bins.
        return_counts: set to True to also return the per-bin counts below.

    Returns:
        float or tuple:
            - ece (float): expected calibration error.
            - confidences_in_bins: average confidence in each bin (returned only if return_counts is True).
            - accuracies_in_bins: accuracy in each bin (returned only if return_counts is True).
            - frac_samples_in_bins: fraction of samples in each bin (returned only if return_counts is True).
            - bin_centers: center of each confidence bin (returned only if return_counts is True).
    """
    assert len(y_prob.shape) > 1, "y_prob should be array-like of shape (n_samples, n_classes)"
    num_samples, num_classes = y_prob.shape
    top_scores = np.max(y_prob, axis=1)

    if y_pred is None:
        y_pred = np.argmax(y_prob, axis=1)

    if num_classes == 2:
        # For binary problems the top score is always at least 0.5.
        bins_edges = np.histogram_bin_edges([], bins=num_bins, range=(0.5, 1.0))
    else:
        bins_edges = np.histogram_bin_edges([], bins=num_bins, range=(0.0, 1.0))

    non_boundary_bin_edges = bins_edges[1:-1]
    bin_centers = (bins_edges[1:] + bins_edges[:-1]) / 2

    sample_bin_ids = np.digitize(top_scores, non_boundary_bin_edges)

    num_samples_in_bins = np.zeros(num_bins)
    accuracies_in_bins = np.zeros(num_bins)
    confidences_in_bins = np.zeros(num_bins)

    for bin_id in range(num_bins):  # renamed from `bin`, which shadows the builtin
        in_bin = sample_bin_ids == bin_id
        num_samples_in_bins[bin_id] = np.sum(in_bin)
        if num_samples_in_bins[bin_id] > 0:
            accuracies_in_bins[bin_id] = np.sum(y_true[in_bin] == y_pred[in_bin]) / num_samples_in_bins[bin_id]
            confidences_in_bins[bin_id] = np.sum(top_scores[in_bin]) / num_samples_in_bins[bin_id]

    ece = np.sum(
        num_samples_in_bins * np.abs(accuracies_in_bins - confidences_in_bins) / num_samples
    )
    frac_samples_in_bins = num_samples_in_bins / num_samples

    if not return_counts:
        return ece
    else:
        return ece, confidences_in_bins, accuracies_in_bins, frac_samples_in_bins, bin_centers
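

# Illustrative sketch (not part of the original module): a classifier whose
# confidence matches its accuracy should have an ECE near zero. The synthetic
# setup below is a made-up assumption for demonstration only.
def _demo_ece(seed=0, n=5000):
    rng = np.random.default_rng(seed)
    confidence = rng.uniform(0.5, 1.0, size=n)
    y_pred = np.zeros(n, dtype=int)
    # Labels agree with the prediction with probability equal to the confidence,
    # i.e. the model is perfectly calibrated up to sampling noise.
    y_true = np.where(rng.uniform(size=n) < confidence, y_pred, 1 - y_pred)
    y_prob = np.column_stack([confidence, 1.0 - confidence])
    print("ECE (should be small):", expected_calibration_error(y_true, y_prob, y_pred))
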
def compute_classification_metrics(y_true, y_prob, option='all'):
    """
    Computes the metrics specified by the option, which can be a string or a list of strings. The default option
    `all` computes the [aurrrc, ece, auroc, nll, brier, accuracy] metrics.

    Args:
        y_true: array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like of shape (n_samples, n_classes).
            Probability scores from the base model.
        option: string or list of strings containing the names of the metrics to be computed.

    Returns:
        dict: a dictionary containing the computed metrics.
    """
    results = {}
    if isinstance(option, list):
        # The original only handled the non-list case, leaving option_list
        # undefined when a list was passed.
        option_list = option
    elif option == "all":
        option_list = ["aurrrc", "ece", "auroc", "nll", "brier", "accuracy"]
    else:
        option_list = [option]

    if "aurrrc" in option_list:
        results["aurrrc"] = area_under_risk_rejection_rate_curve(y_true=y_true, y_prob=y_prob)
    if "ece" in option_list:
        results["ece"] = expected_calibration_error(y_true=y_true, y_prob=y_prob)
    if "auroc" in option_list:
        # roc_auc_score returns a single float (the original tuple unpacking would
        # fail); binary problems expect the positive-class scores, multi-class
        # problems need a multi_class strategy.
        if y_prob.shape[1] == 2:
            results["auroc"] = roc_auc_score(y_true=y_true, y_score=y_prob[:, 1])
        else:
            results["auroc"] = roc_auc_score(y_true=y_true, y_score=y_prob, multi_class="ovr")
    if "nll" in option_list:
        # log_loss expects probability scores, not hard argmax labels.
        results["nll"] = log_loss(y_true=y_true, y_pred=y_prob)
    if "brier" in option_list:
        results["brier"] = multiclass_brier_score(y_true=y_true, y_prob=y_prob)
    if "accuracy" in option_list:
        results["accuracy"] = accuracy_score(y_true=y_true, y_pred=np.argmax(y_prob, axis=1))
    return results
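

# Illustrative sketch (not part of the original module): compute all metrics on
# a small synthetic binary problem. The logistic data generator is a made-up
# assumption for demonstration only.
def _demo_compute_metrics(seed=0, n=500):
    rng = np.random.default_rng(seed)
    y_true = rng.integers(0, 2, size=n)
    logits = rng.normal(size=n) + 2.0 * (2 * y_true - 1)  # informative scores
    p1 = 1.0 / (1.0 + np.exp(-logits))
    y_prob = np.column_stack([1.0 - p1, p1])
    for name, value in compute_classification_metrics(y_true, y_prob).items():
        print(f"{name}: {value:.4f}")
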
def plot_reliability_diagram(y_true, y_prob, y_pred, plot_label=[""], num_bins=10):
    """
    Plots the reliability diagram showing the calibration error for different confidence scores. Multiple curves
    can be plotted by passing data as lists.

    Args:
        y_true: array-like or a list of array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like or a list of array-like of shape (n_samples, n_classes).
            Probability scores from the base model.
        y_pred: array-like or a list of array-like of shape (n_samples,)
            predicted labels.
        plot_label: (optional) list of names identifying each curve.
        num_bins: number of bins.

    Returns:
        tuple:
            - ece_list: list containing the expected calibration error for each curve.
            - accuracies_in_bins_list: list containing binned average accuracies for each curve.
            - frac_samples_in_bins_list: list containing binned sample frequencies for each curve.
            - confidences_in_bins_list: list containing binned average confidences for each curve.
    """
    import matplotlib.pyplot as plt

    if not isinstance(y_true, list):
        y_true, y_prob, y_pred = [y_true], [y_prob], [y_pred]

    if len(plot_label) != len(y_true):
        raise ValueError('y_true and plot_label should be of the same length.')

    ece_list = []
    accuracies_in_bins_list = []
    frac_samples_in_bins_list = []
    confidences_in_bins_list = []

    for idx in range(len(plot_label)):
        ece, confidences_in_bins, accuracies_in_bins, frac_samples_in_bins, bins = expected_calibration_error(
            y_true[idx],
            y_prob[idx],
            y_pred[idx],
            num_bins=num_bins,
            return_counts=True)
        ece_list.append(ece)
        accuracies_in_bins_list.append(accuracies_in_bins)
        frac_samples_in_bins_list.append(frac_samples_in_bins)
        confidences_in_bins_list.append(confidences_in_bins)

    # Note: `bins` from the last call is reused for all curves; the bin centers
    # are identical as long as all inputs share the same number of classes.
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    for idx in range(len(plot_label)):
        plt.plot(bins, frac_samples_in_bins_list[idx], 'o-', label=plot_label[idx])
    plt.title("Confidence Histogram")
    plt.xlabel("Confidence")
    plt.ylabel("Fraction of Samples")
    plt.grid()
    plt.ylim([0.0, 1.0])
    plt.legend()

    plt.subplot(1, 2, 2)
    for idx in range(len(plot_label)):
        plt.plot(bins, accuracies_in_bins_list[idx], 'o-',
                 label="{} ECE = {:.2f}".format(plot_label[idx], ece_list[idx]))
    plt.plot(np.linspace(0, 1, 50), np.linspace(0, 1, 50), 'b.', label="Perfect Calibration")
    plt.title("Reliability Plot")
    plt.xlabel("Confidence")
    plt.ylabel("Accuracy")
    plt.grid()
    plt.legend()

    plt.show()
    return ece_list, accuracies_in_bins_list, frac_samples_in_bins_list, confidences_in_bins_list
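

# Illustrative sketch (not part of the original module): draw a reliability
# diagram for an overconfident synthetic classifier (reported confidence ~0.95,
# actual accuracy ~0.7). Purely demonstrative; opens a matplotlib window.
def _demo_reliability_diagram(seed=0, n=2000):
    rng = np.random.default_rng(seed)
    y_pred = rng.integers(0, 2, size=n)
    confidence = rng.uniform(0.9, 1.0, size=n)
    correct = rng.uniform(size=n) < 0.7
    y_true = np.where(correct, y_pred, 1 - y_pred)
    y_prob = np.column_stack([np.where(y_pred == 0, confidence, 1 - confidence),
                              np.where(y_pred == 1, confidence, 1 - confidence)])
    plot_reliability_diagram(y_true, y_prob, y_pred, plot_label=["overconfident model"])
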
def plot_risk_vs_rejection_rate(y_true, y_prob, y_pred, selection_scores=None, plot_label=[""], risk_func=None,
                                attributes=None, num_bins=10, subgroup_ids=None):
    """
    Plots the risk vs. rejection rate curve showing the risk for different rejection rates. Multiple curves
    can be plotted by passing data as lists.

    Args:
        y_true: array-like or a list of array-like of shape (n_samples,)
            ground truth labels.
        y_prob: array-like or a list of array-like of shape (n_samples, n_classes).
            Probability scores from the base model.
        y_pred: array-like or a list of array-like of shape (n_samples,)
            predicted labels.
        selection_scores: ndarray or a list of ndarray containing scores corresponding to certainty in the
            predicted labels.
        risk_func: risk function under consideration.
        attributes: (optional) if the risk function is a fairness metric, also pass the protected attribute name.
        num_bins: number of bins.
        subgroup_ids: (optional) ndarray or a list of ndarray containing subgroup_ids to selectively compute risk
            on a subgroup of the samples specified by subgroup_ids.

    Returns:
        tuple:
            - aurrrc_list: list containing the area under the risk vs. rejection rate curves.
            - rejection_rate_list: list containing the binned rejection rates.
            - selection_thresholds_list: list containing the binned selection thresholds.
            - risk_list: list containing the binned risks.
    """
    import matplotlib.pyplot as plt

    if not isinstance(y_true, list):
        y_true, y_prob, y_pred = [y_true], [y_prob], [y_pred]
        selection_scores, subgroup_ids = [selection_scores], [subgroup_ids]
    else:
        # Allow lists of data with a single (or no) selection score / subgroup spec.
        if not isinstance(selection_scores, list):
            selection_scores = [selection_scores] * len(y_true)
        if not isinstance(subgroup_ids, list):
            subgroup_ids = [subgroup_ids] * len(y_true)

    if len(plot_label) != len(y_true):
        raise ValueError('y_true and plot_label should be of the same length.')

    aurrrc_list = []
    rejection_rate_list = []
    risk_list = []
    selection_thresholds_list = []

    for idx in range(len(plot_label)):
        aurrrc, rejection_rates, selection_thresholds, risks = area_under_risk_rejection_rate_curve(
            y_true[idx],
            y_prob[idx],
            y_pred[idx],
            selection_scores=selection_scores[idx],
            # risk_func=None means the default accuracy-based risk; passing None
            # through unchanged would crash inside the curve computation.
            risk_func=accuracy_score if risk_func is None else risk_func,
            attributes=attributes,
            num_bins=num_bins,
            subgroup_ids=subgroup_ids[idx],
            return_counts=True
        )
        aurrrc_list.append(aurrrc)
        rejection_rate_list.append(rejection_rates)
        risk_list.append(risks)
        selection_thresholds_list.append(selection_thresholds)

    if risk_func is None:
        ylabel = "Prediction Error Rate"
    elif 'accuracy' in risk_func.__name__:
        ylabel = "1.0 - " + risk_func.__name__
    else:
        ylabel = risk_func.__name__

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    for idx in range(len(plot_label)):
        plt.plot(rejection_rate_list[idx], risk_list[idx],
                 label="{} AURRRC={:.5f}".format(plot_label[idx], aurrrc_list[idx]))
    plt.legend(loc="best")
    plt.xlabel("Rejection Rate")
    plt.ylabel(ylabel)
    plt.title("Risk vs Rejection Rate Plot")
    plt.grid()

    plt.subplot(1, 2, 2)
    for idx in range(len(plot_label)):
        plt.plot(selection_thresholds_list[idx], risk_list[idx], label="{}".format(plot_label[idx]))
    plt.legend(loc="best")
    plt.xlabel("Selection Threshold")
    plt.ylabel(ylabel)
    plt.title("Risk vs Selection Threshold Plot")
    plt.grid()

    plt.show()
    return aurrrc_list, rejection_rate_list, selection_thresholds_list, risk_list
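

# Illustrative sketch (not part of the original module): risk vs. rejection
# rate for a synthetic classifier whose confidence tracks correctness, so risk
# should fall as the rejection rate grows. Purely demonstrative; opens a
# matplotlib window.
def _demo_risk_vs_rejection(seed=0, n=1000):
    rng = np.random.default_rng(seed)
    y_true = rng.integers(0, 2, size=n)
    logits = rng.normal(size=n) + 1.5 * (2 * y_true - 1)
    p1 = 1.0 / (1.0 + np.exp(-logits))
    y_prob = np.column_stack([1.0 - p1, p1])
    y_pred = np.argmax(y_prob, axis=1)
    plot_risk_vs_rejection_rate(y_true, y_prob, y_pred, plot_label=["synthetic model"])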