In [69]:
import pandas as pd

def get_undetected_entities(ground_truth_df, detected_data, azure_mapping, presidio_mapping):
    """
    Find PII entities in the ground truth that are not detected by any of the models.
    
    Parameters:
    - ground_truth_df (pd.DataFrame): The dataframe with true PII entities.
    - detected_data (dict of pd.DataFrame): Dictionary with filenames as keys and detected dataframes as values.
    - azure_mapping (dict): Mapping for Azure-detected entity types to ground truth types.
    - presidio_mapping (dict): Mapping for Presidio-detected entity types to ground truth types.
    
    Returns:
    - pd.DataFrame: Dataframe of true PII entities not detected by any model.
    """
    # Apply type mappings to detected dataframes based on the source
    for filename, detected_df in detected_data.items():
        if 'azure' in filename.lower():  # Apply Azure mapping
            detected_df['type'] = detected_df['type'].map(azure_mapping).fillna(detected_df['type'])
        elif 'lg' in filename.lower() or 'trf' in filename.lower():  # Apply Presidio mapping
            detected_df['type'] = detected_df['type'].map(presidio_mapping).fillna(detected_df['type'])
        # Other models need no mapping adjustments

    # Combine all detected entities across models
    all_detected = pd.concat(detected_data.values())
    all_detected = all_detected.drop_duplicates(subset=['file_idx', 'entity_text', 'type', 'positions'])
    all_detected = all_detected.drop(columns=['positions'])
    ground_truth_df = ground_truth_df.drop(columns=['positions'])

    # Perform anti-join to find entities in ground truth not detected by any model
    undetected_entities = ground_truth_df.merge(
        all_detected, 
        # on=['file_idx', 'entity_text', 'type', 'positions'], 
        on=['file_idx', 'entity_text', 'type'], 
        how='left', 
        indicator=True
    )
    undetected_entities = undetected_entities[undetected_entities['_merge'] == 'left_only'].drop(columns=['_merge'])

    return undetected_entities

# Load the dataframes and store them in a dictionary
ground_truth = pd.read_csv('data/test_set_2.csv')

# Dictionary of detected dataframes with filenames as keys
detected_data = {
    'pii_pre_lg_detected_2.csv': pd.read_csv('output/pii_pre_lg_detected_2.csv'),
    'pii_detected_trf_2.csv': pd.read_csv('output/pii_detected_trf_2.csv'),
    'pii_azure_detected.csv': pd.read_csv('output/pii_azure_detected.csv'),
    'pii_pt_detected_2.csv': pd.read_csv('output/pii_pt_detected_2.csv'),
    'pii_ft_detected_2.csv': pd.read_csv('output/pii_ft_detected_2.csv'),
    'pii_ft_detected_ncot.csv': pd.read_csv('output/pii_ft_detected_ncot.csv'),
    'pii_ft_detected_cot1.csv': pd.read_csv('output/pii_ft_detected_cot1.csv'),
    'pii_ft_detected_cot2.csv': pd.read_csv('output/pii_ft_detected_cot2.csv')
}

# Define mappings
azure_mapping = {
    "Person": "NAME_STUDENT",
    "Email": "EMAIL",
    "URL": "URL_PERSONAL",
    "PhoneNumber": "PHONE_NUM"
}
presidio_mapping = {
    "PERSON": "NAME_STUDENT",
    "EMAIL_ADDRESS": "EMAIL",
    "URL": "URL_PERSONAL",
    "PHONE_NUMBER": "PHONE_NUM"
}

# Get undetected entities
undetected_entities = get_undetected_entities(
    ground_truth,
    detected_data,
    azure_mapping,
    presidio_mapping
)

# Save or inspect the results
# undetected_entities.to_csv('output/undetected_entities.csv', index=False)
print(undetected_entities.shape)
undetected_entities

(18, 3)


Unnamed: 0,file_idx,entity_text,type
479,3701,Newton,NAME_STUDENT
480,3701,Newton,NAME_STUDENT
674,4553,Jessie Belal https://www.kramer.info/wp-conten...,NAME_STUDENT
711,4820,Moin Ch,NAME_STUDENT
1492,6849,Madison Tate 034626995785,NAME_STUDENT
2456,10269,Sergio Echavarria,NAME_STUDENT
2610,10950,Matt Riley 201375864478,NAME_STUDENT
4201,12640,https://youtu.be/uDRN9IA2T5,URL_PERSONAL
4543,13735,No Bernardo,NAME_STUDENT
4544,13735,No Bernardo,NAME_STUDENT


In [34]:
df_ft = pd.read_csv('output/pii_ft_detected_2.csv')

In [None]:
df_ft

In [9]:
def read_json(path = 'data/obfuscated_data_06.json'):
    df = pd.read_json(path, orient="records",encoding='utf-8')
    return df

df = read_json()

In [88]:
print(df.iloc[10950].tokens[215:219])
print(df.iloc[10950].labels[215:219])
print(df.iloc[10950].full_text[1074:1084])

['Matt', 'Riley', '201375864478', 'Cs-03']
['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'O']
Matt Riley


In [137]:
print(df.iloc[15904].tokens[1576:])
print(df.iloc[15904].labels[1576:])
print(df.iloc[15904].full_text[8432:8531])
print(df.iloc[15904].full_text[8432:8447])
print(df.iloc[15904].full_text[8448:8474])
print(df.iloc[15904].full_text[8475:8496])
print(df.iloc[15904].full_text[8497:8531])

['Viviane', 'Peeters', 'benjamintaylor@hotmail.com', '001', '-', '459', '-', '970', '-', '5605x7709', 'https://www.linkedin.com/in/fstone', '\n\n']
['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-EMAIL', 'B-PHONE_NUM', 'I-PHONE_NUM', 'I-PHONE_NUM', 'I-PHONE_NUM', 'I-PHONE_NUM', 'I-PHONE_NUM', 'I-PHONE_NUM', 'B-URL_PERSONAL', 'O']
Viviane Peeters benjamintaylor@hotmail.com 001-459-970-5605x7709 https://www.linkedin.com/in/fstone
Viviane Peeters
benjamintaylor@hotmail.com
001-459-970-5605x7709
https://www.linkedin.com/in/fstone


In [41]:
# df.iloc[19254].labels
print(df.iloc[6849].tokens[14:18])
print(df.iloc[6849].labels[14:18])

['Madison', 'Tate', '034626995785', '\n\n']
['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'O']


In [42]:
df_truth = pd.read_csv('data/test_set_2.csv')

In [None]:
df_truth

In [138]:
print(df.iloc[4553].tokens[21:25])
print(df.iloc[4553].labels[21:25])
print(df.iloc[4553].full_text[119:131])
print(df.iloc[4553].full_text[132:188])

['Jessie', 'Belal', 'https://www.kramer.info/wp-content/category/bloghome.php', '\n\n']
['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'O']
Jessie Belal
https://www.kramer.info/wp-content/category/bloghome.php


In [63]:
print(df.iloc[12267].tokens[648:650])
print(df.iloc[12267].labels[648:650])

['https://www.johnson.biz/wp-content/appcategory.php', '>']
['B-URL_PERSONAL', 'O']


In [64]:
df_ft[df_ft['file_idx'] == 12267]

Unnamed: 0,file_idx,entity_text,type,positions
2569,12267,Fazal Magdy,NAME_STUDENT,"(72, 83)"
2570,12267,https://www.johnson.biz/wp-content/appcategory...,URL_PERSONAL,"(3127, 3177)"
2571,12267,https://www.johnson.biz/wp-content/appcategory...,URL_PERSONAL,"(4706, 4756)"


In [70]:
df_ft[df_ft['file_idx'] == 16435]

Unnamed: 0,file_idx,entity_text,type,positions
3564,16435,Maniam Mani,NAME_STUDENT,"(2254, 2265)"
3565,16435,https://smith.org/main/list/tagsprivacy.htm,URL_PERSONAL,"(3585, 3628)"
3566,16435,Maniam Mani,NAME_STUDENT,"(4651, 4662)"
3567,16435,Maniam Mani,NAME_STUDENT,"(5000, 5011)"


In [71]:
df_truth[df_truth['file_idx'] == 16435]

Unnamed: 0,file_idx,entity_text,type,positions
2186,16435,Maniam Mani,NAME_STUDENT,"(2254, 2265)"
2187,16435,Maniam Mani,NAME_STUDENT,"(4651, 4662)"
2188,16435,Maniam Mani,NAME_STUDENT,"(5000, 5011)"
2189,16435,madehttps://smith.org/main/list/tagsprivacy.htm,URL_PERSONAL,"(3581, 3628)"


In [72]:
df.iloc[16435].tokens

['Reflection',
 ':',
 'Story',
 'Telling',
 '\n\n',
 'A',
 'Tool',
 'to',
 'Drive',
 'Social',
 'Change',
 '\n\n',
 'Challenge',
 ':',
 '\n\n',
 'I',
 'work',
 'with',
 'companies',
 'in',
 'Saudi',
 'Arabia',
 'which',
 'have',
 'an',
 'interest',
 'in',
 'driving',
 'social',
 'change',
 'through',
 ' ',
 'their',
 'social',
 'responsibility',
 'departments',
 '.',
 'Such',
 'sector',
 'in',
 'Saudi',
 'is',
 'considered',
 'new',
 'and',
 'is',
 'slowly',
 ' ',
 'complementing',
 'the',
 'work',
 'of',
 'marketing',
 'departments',
 '.',
 'The',
 'main',
 'challenge',
 'was',
 'complex',
 'as',
 'many',
 ' ',
 'of',
 'our',
 'clients',
 'were',
 'either',
 'misunderstanding',
 'social',
 'responsibility',
 'for',
 'charity',
 'or',
 'ﬁguring',
 'their',
 'way',
 ' ',
 'through',
 'it',
 'as',
 'it',
 'is',
 'a',
 'new',
 'ﬁeld',
 ',',
 'trying',
 'to',
 'convince',
 'their',
 'managers',
 'to',
 'spend',
 'more',
 'budget',
 'on',
 'it',
 '.',
 '\n\n',
 'For',
 'the',
 'uneducated',

In [139]:
import pandas as pd

def get_undetected_entities(ground_truth_df, detected_data, azure_mapping, presidio_mapping):
    """
    Find PII entities in the ground truth that are not detected by any of the models.
    
    Parameters:
    - ground_truth_df (pd.DataFrame): The dataframe with true PII entities.
    - detected_data (dict of pd.DataFrame): Dictionary with filenames as keys and detected dataframes as values.
    - azure_mapping (dict): Mapping for Azure-detected entity types to ground truth types.
    - presidio_mapping (dict): Mapping for Presidio-detected entity types to ground truth types.
    
    Returns:
    - pd.DataFrame: Dataframe of true PII entities not detected by any model.
    """
    # Apply type mappings to detected dataframes based on the source
    for filename, detected_df in detected_data.items():
        if 'azure' in filename.lower():  # Apply Azure mapping
            detected_df['type'] = detected_df['type'].map(azure_mapping).fillna(detected_df['type'])
        elif 'lg' in filename.lower() or 'trf' in filename.lower():  # Apply Presidio mapping
            detected_df['type'] = detected_df['type'].map(presidio_mapping).fillna(detected_df['type'])
        # Other models need no mapping adjustments

    # Combine all detected entities across models
    all_detected = pd.concat(detected_data.values())
    all_detected = all_detected.drop_duplicates(subset=['file_idx', 'entity_text', 'type', 'positions'])

    # Perform anti-join to find entities in ground truth not detected by any model
    undetected_entities = ground_truth_df.merge(
        all_detected, 
        on=['file_idx', 'entity_text', 'type', 'positions'], 
        how='left', 
        indicator=True
    )
    undetected_entities = undetected_entities[undetected_entities['_merge'] == 'left_only'].drop(columns=['_merge'])

    return undetected_entities

# Load the dataframes and store them in a dictionary
ground_truth = pd.read_csv('data/test_set_2.csv')

# Dictionary of detected dataframes with filenames as keys
detected_data = {
    # 'pii_pre_lg_detected_2.csv': pd.read_csv('output/pii_pre_lg_detected_2.csv'),
    # 'pii_detected_trf_2.csv': pd.read_csv('output/pii_detected_trf_2.csv'),
    # 'pii_azure_detected.csv': pd.read_csv('output/pii_azure_detected.csv'),
    # 'pii_pt_detected_2.csv': pd.read_csv('output/pii_pt_detected_2.csv'),
    'pii_ft_detected_2.csv': pd.read_csv('output/pii_ft_detected_2.csv'),
    # 'pii_ft_detected_ncot.csv': pd.read_csv('output/pii_ft_detected_ncot.csv'),
    # 'pii_ft_detected_cot1.csv': pd.read_csv('output/pii_ft_detected_cot1.csv'),
    # 'pii_ft_detected_cot2.csv': pd.read_csv('output/pii_ft_detected_cot2.csv')
}

# Define mappings
azure_mapping = {
    "Person": "NAME_STUDENT",
    "Email": "EMAIL",
    "URL": "URL_PERSONAL",
    "PhoneNumber": "PHONE_NUM"
}
presidio_mapping = {
    "PERSON": "NAME_STUDENT",
    "EMAIL_ADDRESS": "EMAIL",
    "URL": "URL_PERSONAL",
    "PHONE_NUMBER": "PHONE_NUM"
}

# Get undetected entities
undetected_entities = get_undetected_entities(
    ground_truth,
    detected_data,
    azure_mapping,
    presidio_mapping
)

# Save or inspect the results
# undetected_entities.to_csv('output/undetected_entities.csv', index=False)
print(undetected_entities.shape)
undetected_entities

(162, 4)


Unnamed: 0,file_idx,entity_text,type,positions
38,472,https://youtu.be/rFD2lJuvace,URL_PERSONAL,"(4886, 4914)"
40,589,https://www.wagner.net/categoriesmain.jsp,URL_PERSONAL,"(2863, 2904)"
62,761,https://www.youtube.com/watch?v=Kx0SXy87bVZ,URL_PERSONAL,"(1186, 1229)"
118,1798,http://www.burns-lopez.com/categories/appabout...,URL_PERSONAL,"(1611, 1661)"
177,3241,Rodriguez,NAME_STUDENT,"(3156, 3165)"
...,...,...,...,...
2768,20507,Anand Patel,NAME_STUDENT,"(5694, 5705)"
2812,21179,Ana Medina,NAME_STUDENT,"(5330, 5340)"
2833,21321,Omar Iqbal,NAME_STUDENT,"(1656, 1666)"
2849,21844,Mg Maurel,NAME_STUDENT,"(0, 9)"


In [143]:
pre_lg = pd.read_csv('output/pii_pre_lg_detected_2.csv')
pre_trf = pd.read_csv('output/pii_detected_trf_2.csv')
azure = pd.read_csv('output/pii_azure_detected.csv')
pt = pd.read_csv('output/pii_pt_detected_2.csv')
ft = pd.read_csv('output/pii_ft_detected_2.csv')
ncot = pd.read_csv('output/pii_ft_detected_ncot.csv')
cot1 = pd.read_csv('output/pii_ft_detected_cot1.csv')
cot2 = pd.read_csv('output/pii_ft_detected_cot2.csv')
dataset_list = [pre_lg, pre_trf, azure, pt, ft, ncot, cot1, cot2]

In [174]:
def check_entity(entity, dataset_list):
    res_lst = []
    for i, dataset in enumerate(dataset_list):
        if entity in dataset['entity_text'].values:
            res_lst.append(i)
    return res_lst

i = check_entity('Soka', dataset_list)
print(i)

[0, 1, 2]


In [176]:
i = check_entity('Jordi', dataset_list)
print(i)
# pre_trf, azure, ft

[0, 1, 2]
