In [17]:
import ast
from collections import Counter

# Ground-truth entities, one set per category that is evaluated later on.
name_set_true = set()
url_set_true = set()
email_set_true = set()

# Ground-truth label -> set collecting the entities that carry that label.
# Labels without an entry here are still counted, just not collected.
true_sets_by_category = {
    'NAME_STUDENT': name_set_true,
    'URL_PERSONAL': url_set_true,
    'EMAIL': email_set_true,
}

# processing true categories
true_categories = Counter()
with open('pii_entities.txt', 'r') as file:
    for line in file:
        line = line.strip()

        # A blank line is not a tuple literal: ast.literal_eval('') raises
        # SyntaxError, so skip it instead of aborting the whole load.
        if not line:
            continue

        # convert the string to a tuple
        tup = ast.literal_eval(line)

        # increment this category's count in dictionary
        category = tup[2]
        true_categories[category] += 1

        # Set key: the tuple without its category label (index 2).
        new_tup = (tup[0], tup[1], tup[3])
        target_set = true_sets_by_category.get(category)
        if target_set is not None:
            target_set.add(new_tup)

# print categories
for key, count in true_categories.items():
    print(f"{key}: {count}")


NAME_STUDENT: 4394
URL_PERSONAL: 352
EMAIL: 111
ID_NUM: 218
USERNAME: 19
PHONE_NUM: 14
STREET_ADDRESS: 9


In [18]:
# Detected entities, one set per category that is evaluated later on.
name_set_detected = set()
url_set_detected = set()
email_set_detected = set()

# Detector label -> set collecting the entities that carry that label.
# Labels without an entry here are still counted, just not collected.
detected_sets_by_category = {
    'PERSON': name_set_detected,
    'URL': url_set_detected,
    'EMAIL_ADDRESS': email_set_detected,
}

# processing detected categories
detected_categories = Counter()
with open('pii_entities_detected.txt', 'r') as file:
    for line in file:
        line = line.strip()

        # A blank line is not a tuple literal: ast.literal_eval('') raises
        # SyntaxError, so skip it instead of aborting the whole load.
        if not line:
            continue

        # convert the string to a tuple
        tup = ast.literal_eval(line)

        # increment this category's count in dictionary
        category = tup[2]
        detected_categories[category] += 1

        # Set key: the tuple without its category label (index 2).
        new_tup = (tup[0], tup[1], tup[3])
        target_set = detected_sets_by_category.get(category)
        if target_set is not None:
            target_set.add(new_tup)

# print categories
for key, count in detected_categories.items():
    print(f"{key}: {count}")

PERSON: 310
URL: 69
LOCATION: 232
EMAIL_ADDRESS: 1


In [None]:
# true categories: ID_NUM, USERNAME, PHONE_NUM, STREET_ADDRESS have no mapping
# detected categories: LOCATION has no mapping

# only calculating metrics for NAME_STUDENT/PERSON, URL_PERSONAL/URL, EMAIL/EMAIL_ADDRESS mappings


In [24]:
def calculate_metrics(synthetic_set, analyzed_set):
    """Count true positives, false positives and false negatives.

    Args:
        synthetic_set: Collection of ground-truth entities.
        analyzed_set: Collection of entities reported by the detector.

    Both arguments must support iteration and ``in`` membership tests.

    Returns:
        A ``(tp, fp, fn)`` tuple of ints:
        tp -- detected entities that are also in the ground truth,
        fp -- detected entities that are not in the ground truth,
        fn -- ground-truth entities that were not detected.
    """
    # One membership flag per detected entity: True means it is real PII.
    confirmed = [entity in synthetic_set for entity in analyzed_set]
    tp = sum(confirmed)
    fp = len(confirmed) - tp

    # Ground-truth entities the detector never reported.
    fn = sum(1 for entity in synthetic_set if entity not in analyzed_set)

    return tp, fp, fn

def compute_precision_recall_f1(tp, fp, fn):
    """Derive precision, recall and F1 from TP/FP/FN counts.

    Any score whose denominator is not positive is reported as 0 rather
    than raising ZeroDivisionError.

    Returns:
        A ``(precision, recall, f1)`` tuple.
    """
    # Precision: share of detections that were correct.
    detected_total = tp + fp
    if detected_total > 0:
        precision = tp / detected_total
    else:
        precision = 0

    # Recall: share of ground-truth entities that were found.
    actual_total = tp + fn
    if actual_total > 0:
        recall = tp / actual_total
    else:
        recall = 0

    # F1: harmonic mean of the two scores.
    score_sum = precision + recall
    if score_sum > 0:
        f1 = 2 * (precision * recall) / score_sum
    else:
        f1 = 0

    return precision, recall, f1

In [25]:
# For each category: count TP / FP / FN first, then turn the counts into
# precision, recall and F1. (True negatives are not computed.)

# Names
tp_n, fp_n, fn_n = calculate_metrics(name_set_true, name_set_detected)
precision_n, recall_n, f1_n = compute_precision_recall_f1(tp_n, fp_n, fn_n)

# URLs
tp_u, fp_u, fn_u = calculate_metrics(url_set_true, url_set_detected)
precision_u, recall_u, f1_u = compute_precision_recall_f1(tp_u, fp_u, fn_u)

# Emails
tp_e, fp_e, fn_e = calculate_metrics(email_set_true, email_set_detected)
precision_e, recall_e, f1_e = compute_precision_recall_f1(tp_e, fp_e, fn_e)

# Report the scores, one line per category
print(f"Name -- Precision: {precision_n}, Recall: {recall_n}, F1 Score: {f1_n}")
print(f"URL -- Precision: {precision_u}, Recall: {recall_u}, F1 Score: {f1_u}")
print(f"Email -- Precision: {precision_e}, Recall: {recall_e}, F1 Score: {f1_e}")

Name -- Precision: 0.15806451612903225, Recall: 0.011151570323167955, F1 Score: 0.020833333333333332
URL -- Precision: 0.043478260869565216, Recall: 0.008522727272727272, F1 Score: 0.014251781472684084
Email -- Precision: 1.0, Recall: 0.009009009009009009, F1 Score: 0.01785714285714286
