metrics.py
from fuzzywuzzy import fuzz


def exact_match_score(predictions, ground_truths):
    """
    Compute the average exact match score between the predicted codes and the
    ground truth codes. Each pair is compared as whitespace-split token lists,
    so formatting-only differences do not break a match. The result is a float
    between 0 and 1, where 1 means every prediction exactly matches its ground
    truth and 0 means none do.

    Args:
        predictions: list, predicted codes
        ground_truths: list, ground truth codes
    Returns:
        Float, the average exact match score between the predicted codes and
        the ground truth codes.
    """
    if len(predictions) != len(ground_truths):
        raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")

    exact_match = 0
    for pred, gt in zip(predictions, ground_truths):
        # Compare token lists so that whitespace-only differences are ignored.
        if pred.split() == gt.split():
            exact_match += 1

    return round(exact_match / len(predictions), 5)
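
# Illustrative check (hypothetical inputs): whitespace-only differences do not
# break an exact match, since both sides are compared as token lists.
#   >>> exact_match_score(["return x + 1", "return x+1"],
#   ...                   ["return  x + 1", "return x + 1"])
#   0.5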


def edit_similarity_score(predictions, ground_truths):
    """
    Compute the average edit similarity score between the predicted codes and
    the ground truth codes, using fuzz.ratio, which is based on edit distance.
    Note that fuzz.ratio returns a value between 0 and 100, so the averaged
    score is also on a 0-100 scale: 100 means every prediction is identical
    to its ground truth, and 0 means none bear any similarity.

    Args:
        predictions: list, predicted codes
        ground_truths: list, ground truth codes
    Returns:
        Float, the average edit similarity score (0-100) between the predicted
        codes and the ground truth codes.
    """
    if len(predictions) != len(ground_truths):
        raise ValueError("The length of the predicted codes and the ground truth codes should be equal.")

    edit_sim = 0.0
    for pred, gt in zip(predictions, ground_truths):
        # fuzz.ratio returns an integer similarity in [0, 100].
        edit_sim += fuzz.ratio(pred, gt)

    return round(edit_sim / len(predictions), 5)
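
# Illustrative check (hypothetical inputs): identical strings score 100 under
# fuzz.ratio, so a perfect prediction set averages to 100.0, not 1.0.
#   >>> edit_similarity_score(["a = 1"], ["a = 1"])
#   100.0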


def accuracy_at_k(prediction_list, golden_index_list, k):
    """
    Compute the accuracy at k. The result is a float between 0 and 1, where 1
    means the correct code is always retrieved within the top k positions and
    0 means it never is.

    Args:
        prediction_list: list, a list of lists, where each inner list contains
            the indices of the retrieved codes, ranked by relevance.
        golden_index_list: list, a list of integers, where each integer is the
            index of the correct code.
        k: int, the cutoff rank; a retrieval counts as a hit only if the
            correct code appears among the first k indices.
    Returns:
        Float, the accuracy at k.
    """
    if len(golden_index_list) == 0:
        raise ValueError("The list of golden indices should not be empty.")

    assert len(golden_index_list) == len(prediction_list), \
        "The length of the golden indices list should be equal to the length of the prediction list, however, " \
        f"the length of the golden indices list is {len(golden_index_list)} and the length of the prediction list is {len(prediction_list)}."

    acc = 0
    for golden_index, index_list in zip(golden_index_list, prediction_list):
        if len(index_list) < k:
            raise ValueError("The number of retrieved codes should be at least k.")
        # Count a hit if the correct code appears within the top k positions.
        if golden_index in index_list[:k]:
            acc += 1

    return round(acc / len(golden_index_list), 5)
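
# Illustrative check (hypothetical inputs): the golden index must appear within
# the first k retrieved indices to count as a hit.
#   >>> accuracy_at_k([[3, 0, 2], [1, 2, 0]], [0, 0], k=2)
#   0.5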