In [1]:
# Only consider the first 500 (FP: 1454, Precision: 0.0352)
# DATE_TIME already excluded
# Reason that DATE_TIME is not necessary:
# 1. DATE_TIME cannot associate to people names (don't know who said the date_time)
# 2. DATE_TIME can be relevant to math problem descriptions if finally applied to PLUS tutoring team.
# 3. ...?
# Check Recall
import ast
from typing import List, Tuple

# Define the type alias for PII entity
pii_entity = Tuple[int, str, str, Tuple[int, int]]

def load_entities(file_path: str) -> List[pii_entity]:
    entities = []
    with open(file_path, 'r') as file:
        for line in file:
            entity = ast.literal_eval(line.strip())
            entities.append(entity)
    return entities

def normalize_text(text: str) -> str:
    return text.strip().lower()

def calculate_metrics(true_entities: List[pii_entity], detected_entities: List[pii_entity]):
    # Exclude the entity type from the comparison
    true_set = set((i, start, end) for i, _, _, (start, end) in true_entities)
    detected_set = set((i, start, end) for i, _, _, (start, end) in detected_entities)

    tp = true_set & detected_set
    fp = detected_set - true_set
    fn = true_set - detected_set

    tp_count = len(tp)
    fp_count = len(fp)
    fn_count = len(fn)

    precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0
    recall = tp_count / (tp_count + fn_count) if (tp_count + fn_count) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Debugging output
    print(f"True Entities (Sample): {list(true_set)[:5]}")
    print(f"Detected Entities (Sample): {list(detected_set)[:5]}")
    print(f"True Positives (Sample): {sorted(list(tp))}")
    print(f"False Positives (Sample): {list(fp)[:5]}")
    print(f"False Negatives (Sample): {sorted(list(fn))}")

    return tp_count, fp_count, fn_count, precision, recall, f1_score

# Load true entities
true_entities = load_entities('pii_entities.txt')

# Load detected entities
detected_entities = load_entities('pii_entities_detected.txt')

# Calculate metrics
tp_count, fp_count, fn_count, precision, recall, f1_score = calculate_metrics(true_entities, detected_entities)

print(f"True Positives (TP): {tp_count}")
print(f"False Positives (FP): {fp_count}")
print(f"False Negatives (FN): {fn_count}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")


True Entities (Sample): [(11564, 6636, 6645), (18129, 1391, 1403), (4272, 4, 16), (21095, 32, 46), (18718, 0, 10)]
Detected Entities (Sample): [(22384, 339, 343), (8785, 2306, 2317), (13573, 668, 674), (615, 22, 28), (917, 3447, 3457)]
True Positives (Sample): [(4, 36, 48), (7, 2281, 2295), (7, 3648, 3662), (8, 83, 94), (10, 0, 13), (10, 2386, 2399), (11, 46, 56), (16, 22, 37), (19, 262, 272), (19, 420, 430), (20, 32, 44), (22, 934, 940), (56, 53, 64), (80, 2304, 2318), (80, 2322, 2336), (86, 37, 49), (93, 0, 17), (104, 41, 52), (133, 1496, 1502), (133, 1625, 1631), (133, 1712, 1718), (133, 1788, 1794), (136, 121, 123), (166, 0, 12), (218, 0, 14), (269, 3805, 3819), (274, 0, 14), (302, 3, 15), (302, 122, 134), (308, 0, 9), (308, 3031, 3040), (308, 6154, 6163), (308, 7658, 7667), (311, 4800, 4814), (317, 3036, 3084), (328, 39, 53), (330, 88, 103), (333, 87, 98), (334, 3026, 3037), (344, 42, 54), (355, 70, 85), (356, 3477, 3489), (368, 3278, 3293), (375, 28, 40), (379, 93, 109), (379, 15

In [None]:
# Calculate Recall, Precision, F1 score for different categories.

In [2]:
# TODO: Deal with overlapping entities.
def check_overlapping_entities(entities: List[pii_entity]):
    # Sort entities by index and start position
    entities_sorted = sorted(entities, key=lambda x: (x[0], x[3][0]))
    overlaps = []
    
    for i in range(len(entities_sorted) - 1):
        current_entity = entities_sorted[i]
        next_entity = entities_sorted[i + 1]
        
        # Check if the current entity overlaps with the next one
        if current_entity[0] == next_entity[0] and current_entity[3][1] > next_entity[3][0]:
            overlaps.append((current_entity, next_entity))
    
    # Print the overlapping entities
    if overlaps:
        print(f"Found {len(overlaps)} overlapping entities:")
        for overlap in overlaps:
            print(f"Overlap between: {overlap[0]} and {overlap[1]}")
    else:
        print("No overlapping entities found.")

check_overlapping_entities(detected_entities)

Found 424 overlapping entities:
Overlap between: (5, 'https://www.greatplacetowork.com/resources/blog/why-is-diversity-inclusion-in-the-workplace-important', 'PERSON', (4150, 4251)) and (5, 'https://www.greatplacetowork.com/resources/blog/why-is-diversity-inclusion-in-the-workplace-important', 'URL', (4150, 4251))
Overlap between: (92, '3.INSIGHT', 'PERSON', (2388, 2397)) and (92, '3.IN', 'URL', (2388, 2392))
Overlap between: (123, 'https://cyberleninka.ru/article/n/stremlenie-k-spravedlivomu-sotrudnichestvu-kak- motiv-ekonomicheskogo-povedeniya', 'PERSON', (7742, 7856)) and (123, 'https://cyberleninka.ru/article/n/stremlenie-k-spravedlivomu-sotrudnichestvu-kak-', 'URL', (7742, 7823))
Overlap between: (209, 'map.it', 'LOCATION', (862, 868)) and (209, 'map.it', 'URL', (862, 868))
Overlap between: (379, 'djones@gmail.com', 'EMAIL_ADDRESS', (152, 168)) and (379, 'gmail.com', 'URL', (159, 168))
Overlap between: (407, 'Makeovermonday.co.uk', 'PERSON', (1497, 1517)) and (407, 'Makeovermonday