In [1]:
import pandas as pd
import ast

def read_file(filepath: str):
    """Load a JSON file into a pandas DataFrame.

    Args:
        filepath: Location of the JSON file. It is parsed with
            ``orient="records"``.

    Returns:
        The DataFrame produced by ``pd.read_json``.
    """
    frame = pd.read_json(filepath, orient="records")
    return frame

# Load the JSON records into `data`; a later cell looks rows up by position
# (`data.iloc[file_idx]`) and reads their `full_text` field.
data = read_file("data/obfuscated_data_06.json")

In [11]:
# Label each GPT-4o-mini-reviewed detection against the ground-truth PII entities.
# Every input line is a Python literal:
#   output/pii_detected_gpt4omini_w_result.txt -> 5 fields, unpacked below as
#       (file_idx, entity_text, entity_type, positions, gpt4omini_label)
#   data/pii_entities.txt -> fields 0, 1 and 3 are used as
#       (file_idx, entity_text, positions)

# Load entities from 'output/pii_detected_gpt4omini_w_result.txt'.
# Blank lines are skipped: ast.literal_eval('') would raise a SyntaxError.
with open('output/pii_detected_gpt4omini_w_result.txt', 'r') as file:
    entities_with_results = [
        ast.literal_eval(line.strip()) for line in file if line.strip()
    ]

# Load true entities from 'data/pii_entities.txt', keeping only fields 0, 1 and 3.
# Field 2 is left out of the key, so matching below does not depend on it.
true_entities = []
with open('data/pii_entities.txt', 'r') as file:
    for line in file:
        if not line.strip():
            continue
        entity = ast.literal_eval(line.strip())
        true_entities.append((entity[0], entity[1], entity[3]))

# Convert true entities to a set for fast lookup
true_entities_set = set(true_entities)

# Build one record per detected entity. The list is called `rows` (not `data`) so the
# `data` DataFrame loaded by read_file(...) -- which add_sentence_column reads through
# `data.iloc[...]` -- is not overwritten when this cell runs.
rows = [
    (
        file_idx,
        entity_text,
        entity_type,
        positions,
        # 'T' only when file index, text and positions all match a true entity exactly
        'T' if (file_idx, entity_text, positions) in true_entities_set else 'F',
        gpt4omini_label,
    )
    for file_idx, entity_text, entity_type, positions, gpt4omini_label in entities_with_results
]

# Create the DataFrame
df = pd.DataFrame(rows, columns=['file_idx', 'entity_text', 'type', 'positions', 'true_label', 'gpt4omini_label'])

# Save DataFrame to a CSV file
df.to_csv('output/pii_detected_df.csv', index=False)

# Display the DataFrame
df

Unnamed: 0,file_idx,entity_text,type,positions,true_label,gpt4omini_label
0,0,Angela Meyer,PERSON,"(1039, 1051)",F,T
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)",F,F
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F,F
3,7,Nathalie Sylla,PERSON,"(52, 66)",T,T
4,7,Buzan T.,PERSON,"(263, 271)",F,T
...,...,...,...,...,...,...
80,4913,Sana Ahmed,PERSON,"(194, 204)",T,T
81,4913,Asmaa Ahmed,PERSON,"(237, 248)",T,T
82,4913,Father,PERSON,"(2407, 2413)",F,F
83,4913,09-22-2014,PHONE_NUMBER,"(164, 174)",F,F


In [14]:
# Count how often each (true_label, gpt4omini_label) combination occurs
(
    df
    .groupby(['true_label', 'gpt4omini_label'])
    .size()
    .reset_index(name='frequency')
)

Unnamed: 0,true_label,gpt4omini_label,frequency
0,F,F,29
1,F,T,17
2,T,F,4
3,T,T,35


In [18]:
# Rows whose true_label is 'F' but whose gpt4omini_label differs from it
df[df['true_label'].eq('F') & df['true_label'].ne(df['gpt4omini_label'])]

Unnamed: 0,file_idx,entity_text,type,positions,true_label,gpt4omini_label
0,0,Angela Meyer,PERSON,"(1039, 1051)",F,T
4,7,Buzan T.,PERSON,"(263, 271)",F,T
5,7,Buzan B.,PERSON,"(276, 284)",F,T
16,20,George,PERSON,"(1606, 1612)",F,T
17,20,Geoff,PERSON,"(1617, 1622)",F,T
22,576,Abhishek Kumar Brahma,PERSON,"(170, 191)",F,T
23,576,abhishekkumarbrahma@gmail.com,EMAIL_ADDRESS,"(200, 229)",F,T
27,609,SSRK Kasyap,PERSON,"(319, 330)",F,T
34,609,557349702179,PHONE_NUMBER,"(361, 373)",F,T
45,1871,Carlos Angelo D. Aguilar,PERSON,"(0, 24)",F,T


In [17]:
# Rows whose true_label is 'T' but whose gpt4omini_label differs from it
df[df['true_label'].eq('T') & df['true_label'].ne(df['gpt4omini_label'])]

Unnamed: 0,file_idx,entity_text,type,positions,true_label,gpt4omini_label
12,16,Gilberto Gamboa,PERSON,"(22, 37)",T,F
26,609,Tino Swetha,PERSON,"(290, 301)",T,F
31,609,Alex Bravo,PERSON,"(438, 448)",T,F
78,4777,(223)392-2765,PHONE_NUMBER,"(2237, 2250)",T,F


In [15]:
# NOTE(review): ratio of two hard-coded values; where 0.66 and 0.83 come from is not
# shown in this notebook -- TODO confirm and replace with quantities computed from data.
0.66/0.83

0.7951807228915664

In [24]:
import pandas as pd
import ast

# Label every detection in the trf-filtered output against the ground-truth PII entities.
# Every input line is a Python literal:
#   output/pii_detected_trf_filtered.txt -> 4 fields, unpacked below as
#       (file_idx, entity_text, entity_type, positions)
#   data/pii_entities.txt -> fields 0, 1 and 3 are used as
#       (file_idx, entity_text, positions)

# Load entities from 'output/pii_detected_trf_filtered.txt'.
# Blank lines are skipped: ast.literal_eval('') would raise a SyntaxError.
with open('output/pii_detected_trf_filtered.txt', 'r') as file:
    detected_entities = [
        ast.literal_eval(line.strip()) for line in file if line.strip()
    ]

# Load true entities from 'data/pii_entities.txt', keeping only fields 0, 1 and 3.
# Field 2 is left out of the key, so matching below does not depend on it.
true_entities = []
with open('data/pii_entities.txt', 'r') as file:
    for line in file:
        if not line.strip():
            continue
        entity = ast.literal_eval(line.strip())
        true_entities.append((entity[0], entity[1], entity[3]))

# Convert true entities to a set for fast lookup
true_entities_set = set(true_entities)

# Build one record per detected entity. The list is called `rows` (not `data`) so the
# `data` DataFrame loaded by read_file(...) -- which add_sentence_column reads through
# `data.iloc[...]` -- is not overwritten when this cell runs.
rows = [
    (
        file_idx,
        entity_text,
        entity_type,
        positions,
        # 'T' only when file index, text and positions all match a true entity exactly
        'T' if (file_idx, entity_text, positions) in true_entities_set else 'F',
    )
    for file_idx, entity_text, entity_type, positions in detected_entities
]

# Create the DataFrame
df = pd.DataFrame(rows, columns=['file_idx', 'entity_text', 'type', 'positions', 'true_label'])

# Save DataFrame to a CSV file
df.to_csv('output/pii_detected_trf_filtered.csv', index=False)

# Display the DataFrame
df

Unnamed: 0,file_idx,entity_text,type,positions,true_label
0,0,Angela Meyer,PERSON,"(1039, 1051)",F
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)",F
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F
3,7,Nathalie Sylla,PERSON,"(52, 66)",T
4,7,Buzan T.,PERSON,"(263, 271)",F
...,...,...,...,...,...
16668,22670,Carmen Garcia,PERSON,"(781, 794)",T
16669,22676,https://www.coursera.org/lecture/uva-darden-de...,URL,"(1353, 1437)",F
16670,22676,Buddha,PERSON,"(1222, 1228)",F
16671,22678,JOURNEY MAP,PERSON,"(10, 21)",F


In [30]:
# Function to extract the sentence that the PII appears in
def extract_sentence(text, s, e):
    """Return the sentence of ``text`` that surrounds the span ``[s, e)``.

    The span is widened to the left until the preceding character is one of
    ``.``, ``!`` or ``?`` (or the start of the text is reached), and to the
    right until one of those characters (or the end of the text) is reached.
    A closing terminator, if found, is included. The result is stripped of
    leading and trailing whitespace.

    Args:
        text: The full document text.
        s: Start offset of the span.
        e: End offset (exclusive) of the span; scanning to the right starts here,
            so terminators inside the span itself are ignored.

    Returns:
        The stripped sentence containing the span.
    """
    terminators = '.!?'

    # Walk left from the span; stop just after the nearest terminator.
    left = s
    for idx in range(s - 1, -1, -1):
        if text[idx] in terminators:
            break
        left = idx

    # Walk right from the span end; stop on the nearest terminator.
    right = e
    for idx in range(e, len(text)):
        if text[idx] in terminators:
            break
        right = idx + 1

    # The +1 keeps the closing terminator; slicing past the end is harmless.
    return text[left:right + 1].strip()

# Function to add the sentence column to the DataFrame
def add_sentence_column(df):
    """Add a 'sentence' column holding the sentence around each entity.

    For every row, the document text is read from the notebook-level ``data``
    frame at positional index ``row['file_idx']`` (its ``full_text`` field) and
    handed to ``extract_sentence`` together with the first two values of
    ``row['positions']``.

    The column is assigned on the frame that was passed in, and that same
    frame is returned.

    NOTE(review): ``data.iloc`` is a positional lookup, so this presumes
    ``file_idx`` equals the row position in ``data`` -- confirm.
    """
    def sentence_for(row):
        document = data.iloc[row['file_idx']].full_text
        span = row['positions']
        return extract_sentence(document, span[0], span[1])

    df['sentence'] = df.apply(sentence_for, axis=1)
    return df

# Add the sentence column to the DataFrame
# (add_sentence_column assigns the column on `df` itself and returns the same frame)
df = add_sentence_column(df)

# Save DataFrame to the same CSV file
# (same path as the earlier export of this frame, so that file is replaced by this version)
df.to_csv('output/pii_detected_trf_filtered.csv', index=False)
# Show the frame as the cell output
df

Unnamed: 0,file_idx,entity_text,type,positions,true_label,sentence
0,0,Angela Meyer,PERSON,"(1039, 1051)",F,Not only imagine things but visualization is ...
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)",F,1 https://www.santander.com/content/dam/santa...
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F,2 https://www.greatplacetowork.com/resources/b...
3,7,Nathalie Sylla,PERSON,"(52, 66)",T,Design Thinking for innovation reflexion-Avril...
4,7,Buzan T.,PERSON,"(263, 271)",F,According to the definition of Buzan T. and Bu...
...,...,...,...,...,...,...
16668,22670,Carmen Garcia,PERSON,"(781, 794)",T,can help in developing\n\nInterviewer Name :C...
16669,22676,https://www.coursera.org/lecture/uva-darden-de...,URL,"(1353, 1437)",F,(https://www.coursera.org/lecture/uva-darden-d...
16670,22676,Buddha,PERSON,"(1222, 1228)",F,Even Buddha is seen sharing stories about his ...
16671,22678,JOURNEY MAP,PERSON,"(10, 21)",F,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...


In [28]:
# Sanity check: True only when every row's 'entity_text' occurs inside its 'sentence'
(
    df
    .apply(lambda row: row['entity_text'] in row['sentence'], axis=1)
    .all()
)

True

In [5]:
import pandas as pd
import ast

# Load entities from 'data/pii_entities.txt' (one Python literal per line).
# Blank lines are skipped: ast.literal_eval('') would raise a SyntaxError.
with open('data/pii_entities.txt', 'r') as file:
    true_entities = [
        ast.literal_eval(line.strip()) for line in file if line.strip()
    ]

# Convert entities into a DataFrame (each entity must supply exactly these four fields)
df_true_entities = pd.DataFrame(true_entities, columns=['file_idx', 'entity_text', 'type', 'positions'])

# Save the DataFrame to a CSV file, then display it
df_true_entities.to_csv('data/pii_true_entities.csv', index=False)
df_true_entities

Unnamed: 0,file_idx,entity_text,type,positions
0,4,Henry Acosta,NAME_STUDENT,"(36, 48)"
1,7,Nathalie Sylla,NAME_STUDENT,"(52, 66)"
2,7,Nathalie Sylla,NAME_STUDENT,"(2281, 2295)"
3,7,Nathalie Sylla,NAME_STUDENT,"(3648, 3662)"
4,8,Vanesa Chan,NAME_STUDENT,"(83, 94)"
...,...,...,...,...
4866,22592,Diego Castro,NAME_STUDENT,"(4463, 4475)"
4867,22635,Anthony Abubakar,NAME_STUDENT,"(101, 117)"
4868,22660,Amina Koko,NAME_STUDENT,"(32, 42)"
4869,22660,Amina Koko,NAME_STUDENT,"(3697, 3707)"
