In [1]:
import ast
from collections import Counter

# Ground-truth entities, one set per category that is compared later.
# Every stored entry is (tup[0], tup[1], tup[3]) of a parsed input line; later
# cells unpack these as (doc index, entity text, (start, end)).
name_set_true = set()
url_set_true = set()
email_set_true = set()
phoneNum_set_true = set()
location_set_true = set()

# Ground-truth category label -> set that collects its entities.
# Labels missing from this table are still counted but not stored.
true_sets_by_category = {
    'NAME_STUDENT': name_set_true,
    'URL_PERSONAL': url_set_true,
    'EMAIL': email_set_true,
    'PHONE_NUM': phoneNum_set_true,
    'STREET_ADDRESS': location_set_true,
}

# processing true categories
true_categories = Counter()
with open('pii_entities.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # ast.literal_eval('') raises SyntaxError, so a stray blank line
            # (e.g. at the end of the file) must be skipped, not parsed.
            continue

        # convert the string to a tuple
        tup = ast.literal_eval(line)

        # increment this category's count in dictionary
        category = tup[2]
        true_categories[category] += 1

        # drop the category field (index 2) from the stored entry
        new_tup = (tup[0], tup[1], tup[3])
        target_set = true_sets_by_category.get(category)
        # "is not None" on purpose: an empty set is falsy but still a valid target
        if target_set is not None:
            target_set.add(new_tup)

# print categories
print("All true entites summary:")
for key, count in true_categories.items():
    print(f"{key}: {count}")

All true entites summary:
NAME_STUDENT: 4394
URL_PERSONAL: 352
EMAIL: 111
ID_NUM: 218
USERNAME: 19
PHONE_NUM: 14
STREET_ADDRESS: 9


In [2]:
# Detected entities, one set per detected category that is compared later.
# Every stored entry is (tup[0], tup[1], tup[3]) of a parsed input line; later
# cells unpack these as (doc index, entity text, (start, end)).
name_set_detected = set()
url_set_detected = set()
email_set_detected = set()
phoneNum_set_detected = set()
location_set_detected = set()

# Detected category label -> set that collects its entities.
# Labels missing from this table are still counted but not stored.
detected_sets_by_category = {
    'PERSON': name_set_detected,
    'URL': url_set_detected,
    'EMAIL_ADDRESS': email_set_detected,
    'PHONE_NUMBER': phoneNum_set_detected,
    'LOCATION': location_set_detected,
}

# processing detected categories
detected_categories = Counter()
with open('pii_entities_detected.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # ast.literal_eval('') raises SyntaxError, so a stray blank line
            # (e.g. at the end of the file) must be skipped, not parsed.
            continue

        # convert the string to a tuple
        tup = ast.literal_eval(line)

        # increment this category's count in dictionary
        category = tup[2]
        detected_categories[category] += 1

        # drop the category field (index 2) from the stored entry
        new_tup = (tup[0], tup[1], tup[3])
        target_set = detected_sets_by_category.get(category)
        # "is not None" on purpose: an empty set is falsy but still a valid target
        if target_set is not None:
            target_set.add(new_tup)

print("All detected entites summary:")
# print categories
for key, count in detected_categories.items():
    print(f"{key}: {count}")

All detected entites summary:
PERSON: 18623
URL: 4254
LOCATION: 14227
EMAIL_ADDRESS: 132
PHONE_NUMBER: 85


In [None]:
# True categories ID_NUM and USERNAME have no counterpart among the detected categories.
# It is unclear whether STREET_ADDRESS (true) corresponds to LOCATION (detected).

# Metrics are computed only for these true/detected pairs: NAME_STUDENT/PERSON, URL_PERSONAL/URL, EMAIL/EMAIL_ADDRESS, PHONE_NUM/PHONE_NUMBER.
# STREET_ADDRESS/LOCATION is also tested below as a tentative pairing.

In [3]:
def calculate_metrics(synthetic_set, analyzed_set):
    """Count TP / FP / FN by exact membership of whole entries.

    An entry of ``analyzed_set`` is a true positive when the very same entry
    is present in ``synthetic_set`` and a false positive otherwise. An entry
    of ``synthetic_set`` that is absent from ``analyzed_set`` is a false
    negative.

    Args:
        synthetic_set: Collection of ground-truth entries.
        analyzed_set: Collection of detected entries.

    Returns:
        Tuple ``(tp, fp, fn)`` of integer counts.
    """
    # One pass over the detections: True marks a hit, False a false alarm.
    hit_flags = [candidate in synthetic_set for candidate in analyzed_set]
    tp = sum(hit_flags)
    fp = len(hit_flags) - tp

    # Ground-truth entries that were never detected verbatim.
    fn = sum(truth not in analyzed_set for truth in synthetic_set)

    return tp, fp, fn

def calculate_metrics_set_method(synthetic_set, analyzed_set):
    """Count TP / FP / FN on (doc index, start, end) keys using set algebra.

    Each entry must unpack as ``(i, <ignored>, (start, end))``. The middle
    element is discarded before comparing, so two entries with the same
    document index and span count as a match even if that element differs.

    Args:
        synthetic_set: Iterable of ground-truth entries.
        analyzed_set: Iterable of detected entries.

    Returns:
        Tuple ``(tp_count, fp_count, fn_count)`` of integer counts.
    """
    def span_keys(entries):
        # Reduce every entry to its (doc index, start, end) key.
        return {(doc, begin, finish) for doc, _, (begin, finish) in entries}

    gold_keys = span_keys(synthetic_set)
    found_keys = span_keys(analyzed_set)

    return (
        len(gold_keys & found_keys),   # in both: true positives
        len(found_keys - gold_keys),   # detected only: false positives
        len(gold_keys - found_keys),   # ground truth only: false negatives
    )

def compute_precision_recall_f1(tp, fp, fn):
    """Derive precision, recall and F1 from TP / FP / FN counts.

    Any ratio whose denominator is not positive is reported as ``0`` instead
    of dividing by zero.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        Tuple ``(precision, recall, f1)``.
    """
    detected_total = tp + fp
    if detected_total > 0:
        precision = tp / detected_total
    else:
        precision = 0

    actual_total = tp + fn
    if actual_total > 0:
        recall = tp / actual_total
    else:
        recall = 0

    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

In [21]:
# Per-category TP / FP / FN counts followed by precision, recall and F1.
# Counts come from calculate_metrics_set_method, which compares only the
# (doc index, start, end) key of each entity. calculate_metrics takes the same
# arguments and compares whole entries instead, should a stricter count be wanted.

# NAME_STUDENT (true) vs PERSON (detected)
tp_n, fp_n, fn_n = calculate_metrics_set_method(name_set_true, name_set_detected)
precision_n, recall_n, f1_n = compute_precision_recall_f1(tp_n, fp_n, fn_n)

# URL_PERSONAL (true) vs URL (detected)
tp_u, fp_u, fn_u = calculate_metrics_set_method(url_set_true, url_set_detected)
precision_u, recall_u, f1_u = compute_precision_recall_f1(tp_u, fp_u, fn_u)

# EMAIL (true) vs EMAIL_ADDRESS (detected)
tp_e, fp_e, fn_e = calculate_metrics_set_method(email_set_true, email_set_detected)
precision_e, recall_e, f1_e = compute_precision_recall_f1(tp_e, fp_e, fn_e)

# PHONE_NUM (true) vs PHONE_NUMBER (detected)
tp_p, fp_p, fn_p = calculate_metrics_set_method(phoneNum_set_true, phoneNum_set_detected)
precision_p, recall_p, f1_p = compute_precision_recall_f1(tp_p, fp_p, fn_p)

# STREET_ADDRESS (true) vs LOCATION (detected)
tp_l, fp_l, fn_l = calculate_metrics_set_method(location_set_true, location_set_detected)
precision_l, recall_l, f1_l = compute_precision_recall_f1(tp_l, fp_l, fn_l)

# Report, one row per ground-truth category
print("Summary of Precision, Recall, and F1 Score")
print(f"NAME_STUDENT    -- Precision: {precision_n:.4f}, Recall: {recall_n:.4f}, F1 Score: {f1_n:.4f}")
print(f"URL_PERSONAL    -- Precision: {precision_u:.4f}, Recall: {recall_u:.4f}, F1 Score: {f1_u:.4f}")
print(f"EMAIL           -- Precision: {precision_e:.4f}, Recall: {recall_e:.4f}, F1 Score: {f1_e:.4f}")
print(f"PHONE_NUM       -- Precision: {precision_p:.4f}, Recall: {recall_p:.4f}, F1 Score: {f1_p:.4f}")
print(f"STREET_ADDRESS  -- Precision: {precision_l:.4f}, Recall: {recall_l:.4f}, F1 Score: {f1_l:.4f}")

Summary of Precision, Recall, and F1 Score
NAME_STUDENT    -- Precision: 0.1623, Recall: 0.6878, F1 Score: 0.2626
URL_PERSONAL    -- Precision: 0.0691, Recall: 0.8352, F1 Score: 0.1277
EMAIL           -- Precision: 0.8333, Recall: 0.9910, F1 Score: 0.9053
PHONE_NUM       -- Precision: 0.1647, Recall: 1.0000, F1 Score: 0.2828
STREET_ADDRESS  -- Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000


In [24]:
def print_likely_successful_entities(true_set, detected_set, category="NAME"):
    """Print true entities missed by exact match, paired with near-miss detections.

    A true entity is considered missed when its exact
    ``(idx, text, (start, end))`` tuple is absent from ``detected_set``.
    Each missed entity is paired with the first detection (in
    ``detected_set`` iteration order) that has the same document index and
    the same start offset or the same end offset. Paired entities are printed
    as ``true ---- detected``; missed entities without such a detection are
    printed alone. Lines are ordered by document index.

    Args:
        true_set: Collection of ``(idx, text, (start, end))`` ground-truth
            entities.
        detected_set: Collection of ``(idx, text, (start, end))`` detected
            entities.
        category: Label inserted into the summary line. Defaults to
            ``"NAME"``, which reproduces the header printed when no label is
            passed.

    Returns:
        None. The report is written to standard output.
    """
    # Group detections by document so each missed entity is compared only
    # against detections from its own document, not against every detection.
    detected_by_doc = {}
    for entity_detected in detected_set:
        detected_by_doc.setdefault(entity_detected[0], []).append(entity_detected)

    rows = []  # (document index, printable line)
    count = 0  # missed entities that have a boundary-sharing detection

    for entity_true in true_set:
        if entity_true in detected_set:
            continue  # exact match: not part of this report

        idx, _, (start, end) = entity_true
        partner = None
        for entity_detected in detected_by_doc.get(idx, ()):
            _, _, (start_d, end_d) = entity_detected
            if start == start_d or end == end_d:
                partner = entity_detected
                break  # the first boundary-sharing detection is enough

        if partner is None:
            rows.append((idx, str(entity_true)))
        else:
            count += 1
            rows.append((idx, f"{entity_true} ---- {partner}"))

    # Sort on the stored document index rather than re-parsing it out of the
    # formatted text; the sort is stable, so ties keep their insertion order.
    rows.sort(key=lambda row: row[0])

    print(f"Out of {len(rows)} entities for {category} category, {count} entities might have likely been successfully detected by Presidio.")
    for _, line in rows:
        print(line)


In [25]:
# Near-miss report for names: NAME_STUDENT (true) vs PERSON (detected).
print_likely_successful_entities(name_set_true, name_set_detected)

Out of 1372 entities for NAME category, 939 entities might have likely been successfully detected by Presidio.
(7, 'Nathalie Sylla', (52, 66)) ---- (7, 'Nathalie Sylla\n\n', (52, 68))
(80, 'Karol Ferreira', (2, 16)) ---- (80, 'Karol Ferreira\n\nREFLECTION - VISUALIZATION\n\nChallenge\n\nWorking', (2, 64))
(88, 'Rakesh Singh', (61, 73)) ---- (88, 'Rakesh Singh\n\nChallenge', (61, 84))
(99, 'Francesco Boscolo', (38, 55)) ---- (99, 'Francesco Boscolo\n\nChallenge', (38, 66))
(109, 'Michael', (3336, 3343)) ---- (109, 'Michael  -', (3336, 3346))
(109, 'Michael', (1300, 1307)) ---- (109, 'Michael  -', (1300, 1310))
(109, 'Michael', (71, 78)) ---- (109, 'Michael  -', (71, 81))
(112, 'Francisco Ferreira', (30, 48))
(113, 'Rita', (0, 4))
(123, 'Stefano Lovato', (156, 170)) ---- (123, 'Stefano Lovato\n\nMDI-191', (156, 179))
(161, 'Juan Farid', (3758, 3768)) ---- (161, 'Juan Farid  ', (3758, 3770))
(202, 'Danny Long', (0, 10)) ---- (202, 'Danny Long Reflection', (0, 21))
(204, 'Deiby', (29, 34))

In [29]:
# Near-miss report for URLs: URL_PERSONAL (true) vs URL (detected).
# NOTE: as called here, the summary line is still labelled "NAME category".
print_likely_successful_entities(url_set_true, url_set_detected)

Out of 58 entities for NAME category, 55 entities might have likely been successfully detected by Presidio.
(1309, 'https://www.hall.biz/wp-contenthome.html', (2581, 2621)) ---- (1309, 'https://www.hall.bi', (2581, 2600))
(3202, 'https://www.youtube.com/channel/UC1ElAcppeuhfet nYZqnhEXw', (200, 257)) ---- (3202, 'https://www.youtube.com/channel/UC1ElAcppeuhfet', (200, 247))
(3202, 'tps://www.facebook.com/bclark', (169, 198)) ---- (3202, 'www.facebook.com/bclark', (175, 198))
(3515, 'https://www.youtube.com/watch?v=mYxoZaftuNN', (2166, 2209)) ---- (3515, 'https://www.youtube.com/watch?v=mYxoZaftuNN.', (2166, 2210))
(3592, 'https://www.peterson.net/tag/app/listmain.php', (222, 267)) ---- (3592, 'https://www.peterson.net/tag/app/listmain.php.', (222, 268))
(5358, 'https://schaefer.biz/posts/search/appsearch.php', (1543, 1590)) ---- (5358, 'https://schaefer.bi', (1543, 1562))
(5861, 'https://www.stevens.biz/wp-contentindex.jsp', (3227, 3270)) ---- (5861, 'https://www.stevens.bi', (3227, 32

In [30]:
# Near-miss report for e-mail addresses: EMAIL (true) vs EMAIL_ADDRESS (detected).
# NOTE: as called here, the summary line is still labelled "NAME category".
print_likely_successful_entities(email_set_true, email_set_detected)

Out of 1 entities for NAME category, 1 entities might have likely been successfully detected by Presidio.
(11699, 'srpe…r@....kelsey21@gmail.com', (6157, 6186)) ---- (11699, 'kelsey21@gmail.com', (6168, 6186))


In [31]:
# Near-miss report for phone numbers: PHONE_NUM (true) vs PHONE_NUMBER (detected).
# NOTE: as called here, the summary line is still labelled "NAME category".
print_likely_successful_entities(phoneNum_set_true, phoneNum_set_detected)

Out of 0 entities for NAME category, 0 entities might have likely been successfully detected by Presidio.


In [32]:
# Near-miss report for addresses: STREET_ADDRESS (true) vs LOCATION (detected).
# NOTE: as called here, the summary line is still labelled "NAME category".
print_likely_successful_entities(location_set_true, location_set_detected)

Out of 9 entities for NAME category, 4 entities might have likely been successfully detected by Presidio.
(9854, '591 Smith Centers Apt. 656\nJoshuamouth, RI 95963', (16, 64))
(10447, '6828 Harris Squares Suite 829\nChristinatown, SD 16180', (1958, 2011)) ---- (10447, 'SD 16180', (2003, 2011))
(10447, '6828 Harris Squares Suite 829\nChristinatown, SD 16180', (14, 67)) ---- (10447, 'SD 16180', (59, 67))
(10447, '6828 Harris Squares Suite 829\nChristinatown, SD 16180', (5883, 5936)) ---- (10447, 'SD 16180', (5928, 5936))
(11442, '743 Erika Bypass Apt. 419\nAndreahaven, IL 54207', (714, 761))
(13733, '61955 Kelly Ford\nAndretown, MH 48154', (108, 144)) ---- (13733, 'MH 48154', (136, 144))
(13733, '0475 Mclean Flats\nSouth Sara, NY 27576', (1013, 1051))
(14025, '5468 Keith Islands Suite 664\nLake John, VI 73879', (4703, 4751))
(18745, '2828 Kristy Pass Suite 749\nThompsonbury, VI 21802', (306, 355))


Deadline: August 26, 2024 (aiming for a presentation to Ken)
1. Post processing - AL
2. Other models - YW, JS
3. Random sampling (2000 cases) or stratified sampling for each category - JS
4. find mappings for ID_NUM, STREET_ADDRESS, USERNAME - JS

EDM paper: sequence labeling — we prompt GPT to produce BIO tags; performance might not be good.
Why we use GPT model? Other models?
Let's do mini first.