In [9]:
import pandas as pd
import random

df_true = pd.read_csv('data/pii_true_entities.csv')

# Set the seed for reproducibility
random.seed(46)

# Step 1: Define the total range of indices
total_indices = list(range(22688))

# Step 2: Randomly sample 25% of the elements for training indices
train_indices = random.sample(total_indices, int(len(total_indices) * 0.25))

# Step 3: Determine the remaining elements for testing indices
test_indices = list(set(total_indices) - set(train_indices))

# Optionally, sort the indices to maintain order
train_indices = sorted(train_indices)
test_indices = sorted(test_indices)

# Display the results
print(f"Total number of train indices: {len(train_indices)}")
print(train_indices)
print(f"Total number of test indices: {len(test_indices)}")
print(test_indices)


def count_entities(df, file_indices):
    """
    Counts the number of true entities for each category and overall entities in the given file indices.
    
    Args:
        df (pd.DataFrame): The DataFrame containing the entity information.
        file_indices (list): A list of file indices to consider for counting.
    
    Returns:
        dict: A dictionary containing the counts for each entity type and the total count.
    """
    # Filter the DataFrame for the given file indices
    df_filtered = df[df['file_idx'].isin(file_indices)]
    
    # Group by type and count the number of occurrences
    entity_counts = df_filtered['type'].value_counts().to_dict()

    # Calculate the total number of entities
    total_entities = sum(entity_counts.values())
    
    # Add the total count to the dictionary
    entity_counts['TOTAL'] = total_entities
    
    return entity_counts

# Calculate the number of true entities for each category and overall for train files
train_entity_counts = count_entities(df_true, train_indices)

# Calculate the number of true entities for each category and overall for test files
test_entity_counts = count_entities(df_true, test_indices)

# Display the results
print("Entity counts in train files:", train_entity_counts)
print("Entity counts in test files:", test_entity_counts)

# Save df_true_train and df_true_test to CSV
df_true_train = df_true[df_true['file_idx'].isin(train_indices)]
df_true_test = df_true[df_true['file_idx'].isin(test_indices)]

df_true_train.to_csv('data/train_set.csv', index=False)
df_true_test.to_csv('data/test_set.csv', index=False)

print("Train and test DataFrames have been saved to 'train_set.csv' and 'test_set.csv'.")


Total number of train indices: 5672
[0, 20, 33, 36, 38, 46, 50, 51, 54, 55, 59, 60, 72, 77, 81, 86, 92, 98, 100, 101, 102, 104, 109, 116, 117, 118, 121, 127, 133, 134, 135, 137, 139, 141, 146, 151, 153, 155, 161, 162, 169, 170, 178, 181, 182, 184, 205, 206, 229, 230, 235, 236, 240, 244, 248, 249, 252, 259, 265, 267, 272, 284, 286, 296, 300, 301, 308, 309, 312, 316, 319, 323, 324, 338, 347, 349, 351, 353, 354, 355, 358, 359, 366, 372, 373, 374, 376, 379, 380, 384, 401, 413, 420, 424, 430, 436, 439, 446, 452, 457, 459, 460, 466, 467, 471, 473, 476, 480, 482, 494, 499, 502, 507, 514, 521, 524, 526, 528, 529, 530, 532, 533, 537, 540, 543, 550, 559, 561, 563, 567, 570, 571, 584, 593, 598, 601, 602, 603, 608, 609, 615, 619, 626, 627, 636, 640, 645, 649, 651, 656, 658, 665, 668, 674, 679, 683, 684, 685, 686, 690, 693, 697, 701, 702, 704, 705, 707, 709, 722, 727, 732, 746, 758, 759, 762, 765, 766, 767, 771, 784, 791, 794, 795, 799, 802, 809, 812, 818, 821, 830, 835, 836, 843, 846, 855, 856, 85

In [944]:
# Assuming df_true_train is already defined
# For example, load your data with:
# df_true_train = pd.read_csv('path_to_train_data.csv')

# Initialize an empty list to store the sampled file indices
sampled_file_indices = []

# Set a seed for reproducibility
random.seed(46)

# Group by 'type' and sample 1/4 of the file indices for each category
for entity_type, group in df_true_train.groupby('type'):
    # Get unique file indices for the current type
    unique_file_indices = group['file_idx'].unique().tolist()
    
    # Calculate the number of samples as 1/4 of the unique indices
    num_samples = max(1, int(len(unique_file_indices) / 4))
    
    # Randomly sample 1/4 of the file indices
    sampled_indices = random.sample(unique_file_indices, num_samples)
    
    # Extend the sampled file indices to the main list
    sampled_file_indices.extend(sampled_indices)

sampled_file_indices = sorted(sampled_file_indices)
# Display the sampled file indices
print(f"Sampled file indices (1/4 for each category): {sampled_file_indices}")
print(f"Total number of sampled file indices: {len(sampled_file_indices)}")

print(count_entities(df_true, sampled_file_indices))

Sampled file indices (1/4 for each category): [86, 109, 609, 651, 1210, 1239, 1437, 1478, 1805, 1812, 2707, 2897, 2897, 3285, 3538, 3540, 3709, 3709, 3709, 3883, 3917, 3935, 4026, 4212, 4316, 4469, 4518, 4600, 4774, 4777, 4968, 5138, 5155, 5203, 5236, 5258, 5320, 5342, 5354, 5358, 5641, 5742, 5811, 6061, 6109, 6133, 6161, 6209, 6257, 6285, 6458, 6465, 6888, 6921, 6954, 7098, 7193, 7197, 7202, 7327, 7581, 7624, 7730, 7779, 7866, 8088, 8233, 8329, 8372, 8496, 8583, 8587, 8731, 8735, 9146, 9150, 9371, 9535, 9540, 9628, 9653, 9727, 9800, 9832, 10098, 10165, 10213, 10281, 10380, 10544, 10724, 10739, 10825, 11077, 11095, 11116, 11196, 11208, 11261, 11343, 11622, 11672, 11674, 11706, 11827, 11879, 11893, 12061, 12161, 12179, 12199, 12296, 12316, 12321, 12444, 12462, 12470, 12488, 12532, 12594, 12658, 12788, 12828, 12843, 12895, 12898, 12999, 13158, 13332, 13467, 13569, 13713, 14101, 14128, 14170, 14282, 14396, 14719, 14757, 14885, 14903, 14982, 15046, 15050, 15091, 15390, 15471, 15667, 15745,

In [954]:
import pandas as pd

# Load the detected entities from Presidio
df_detected = pd.read_csv('output/pii_detected_trf_filtered.csv')

# Filter the DataFrame to keep only entities where file_idx is in the test files
df_detected_test = df_detected[df_detected['file_idx'].isin(test_indices)]

# Save the filtered DataFrame to a new CSV file
df_detected_test.to_csv('output/pii_detected_trf_test.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_trf_test.csv'.")
df_detected_test


Filtered detected entities have been saved to 'output/pii_detected_trf_test.csv'.


Unnamed: 0,file_idx,entity_text,type,positions,true_label,sentence,context
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)",F,1 https://www.santander.com/content/dam/santa...,I introduce different consulting groups that ...
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F,2 https://www.greatplacetowork.com/resources/b...,"Thanks to this course, I learned new tools for..."
3,7,Nathalie Sylla,PERSON,"(52, 66)",T,Design Thinking for innovation reflexion-Avril...,Design Thinking for innovation reflexion-Avril...
4,7,Buzan T.,PERSON,"(263, 271)",F,According to the definition of Buzan T. and Bu...,What exactly is a mind map? According to the d...
5,7,Buzan B.,PERSON,"(276, 284)",F,"and Buzan B. (1999, Dessine-moi l'intelligence.",According to the definition of Buzan T. and Bu...
...,...,...,...,...,...,...,...
16666,22661,Jake Knapp,PERSON,"(501, 511)",F,The first tool I used was the tool of Visualiz...,). The first tool I used was the tool of Visua...
16667,22664,Andre Martin,PERSON,"(1539, 1551)",F,I have particularly drawn from the approach of...,And provide them an opportunity to think thro...
16668,22670,Carmen Garcia,PERSON,"(781, 794)",T,can help in developing\n\nInterviewer Name :C...,Where are\n\nfuture? can help in developing\n...
16671,22678,JOURNEY MAP,PERSON,"(10, 21)",F,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...


In [955]:
# Filter the DataFrame to keep only entities where file_idx is in the train files
df_detected_train = df_detected[df_detected['file_idx'].isin(train_indices)]

# Save the filtered DataFrame to a new CSV file
df_detected_train.to_csv('output/pii_detected_trf_train.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_trf_train.csv'.")
df_detected_train


Filtered detected entities have been saved to 'output/pii_detected_trf_train.csv'.


Unnamed: 0,file_idx,entity_text,type,positions,true_label,sentence,context
0,0,Angela Meyer,PERSON,"(1039, 1051)",F,Not only imagine things but visualization is ...,For this to happen we need to have good visua...
15,20,Sindy Samaca,PERSON,"(32, 44)",T,Design Thinking for Innovation\n\nSindy Samaca...,Design Thinking for Innovation\n\nSindy Samaca...
16,20,George,PERSON,"(1606, 1612)",F,This tool is useful for me to have a clearer v...,Selection\n\nI decided to choose the visualiza...
17,20,Geoff,PERSON,"(1617, 1622)",F,This tool is useful for me to have a clearer v...,Selection\n\nI decided to choose the visualiza...
22,33,Chepe Scott,PERSON,"(2385, 2396)",F,"When I watch the video, at the end of week 2 i...",We a story involving a family building their n...
...,...,...,...,...,...,...,...
16629,22601,Storytelling\n\nChallenge,PERSON,"(0, 23)",F,Storytelling\n\nChallenge\n\nMy project is bas...,Storytelling\n\nChallenge\n\nMy project is bas...
16646,22627,Theodore Levitt,PERSON,"(4157, 4172)",F,The reason for mapping customers’ experiences ...,Customer mapping accompanied with visualizatio...
16647,22628,https://www.nngroup.com/articles/ideation-in-p...,URL,"(1137, 1191)",F,I chose the methodology based on this post mad...,"Application We got together one day, with s..."
16669,22676,https://www.coursera.org/lecture/uva-darden-de...,URL,"(1353, 1437)",F,(https://www.coursera.org/lecture/uva-darden-d...,Following is the link of the video from which ...


In [929]:
# Write the sorted indices list to the output file
with open('data/train_indices.txt', 'w') as f:
    f.write(str(train_indices))

with open('data/test_indices.txt', 'w') as f:
    f.write(str(test_indices))

print(f"Sorted training and testing file indices list saved to respective locations.")

Sorted training and testing file indices list saved to respective locations.


In [None]:
# All true entites summary:
# NAME_STUDENT: 4394	——	1099, 3295
# URL_PERSONAL: 352		——	88, 264
# EMAIL: 111			——	28, 83
# PHONE_NUM: 14			——	4, 10

In [957]:
import ast
# Load entities from 'output/pii_detected_lg.txt'
entity_lst = []
with open('output/pii_detected_lg.txt', 'r') as file:
    for line in file:
        entity = ast.literal_eval(line.strip())
        entity_lst.append(entity)

# Prepare data for the DataFrame
# data = []
# for entity in entities_with_results:
#     file_idx, entity_text, entity_type, positions, gpt4omini_label = entity
#     true_label = 'T' if (file_idx, entity_text, positions) in true_entities_set else 'F'
#     data.append((file_idx, entity_text, entity_type, positions, true_label, gpt4omini_label))

# Create the DataFrame
df_lg = pd.DataFrame(entity_lst, columns=['file_idx', 'entity_text', 'type', 'positions'])

# Save DataFrame to a CSV file
df_lg.to_csv('output/pii_detected_lg.csv', index=False)

# Display the DataFrame
# print(df)
df_lg

Unnamed: 0,file_idx,entity_text,type,positions
0,0,Angela Meyer,PERSON,"(1039, 1051)"
1,2,VISUALIZATION,PERSON,"(0, 13)"
2,4,Henry Acosta,PERSON,"(36, 48)"
3,5,https://www.greatplacetowork.com/resources/blo...,PERSON,"(4150, 4251)"
4,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)"
...,...,...,...,...
23089,22682,635.Th,URL,"(202, 208)"
23090,22682,challenge.Am,URL,"(335, 347)"
23091,22682,ideas.An,URL,"(419, 427)"
23092,22682,selected.Th,URL,"(574, 585)"


In [958]:
import pandas as pd

# Load the detected entities from Presidio
# df_detected = pd.read_csv('output/pii_detected_trf_filtered.csv')

# Filter the DataFrame to keep only entities where file_idx is in the test files
df_lg_test = df_lg[df_lg['file_idx'].isin(test_indices)]

# Save the filtered DataFrame to a new CSV file
df_lg_test.to_csv('output/pii_detected_lg_test.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_trf_test.csv'.")
df_lg_test


Filtered detected entities have been saved to 'output/pii_detected_trf_test.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
1,2,VISUALIZATION,PERSON,"(0, 13)"
2,4,Henry Acosta,PERSON,"(36, 48)"
3,5,https://www.greatplacetowork.com/resources/blo...,PERSON,"(4150, 4251)"
4,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)"
5,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)"
...,...,...,...,...
23089,22682,635.Th,URL,"(202, 208)"
23090,22682,challenge.Am,URL,"(335, 347)"
23091,22682,ideas.An,URL,"(419, 427)"
23092,22682,selected.Th,URL,"(574, 585)"


In [35]:
# Further split the test set to 10% train and 90% test
import pandas as pd
import random

df_true = pd.read_csv('data/pii_true_entities.csv')

# Set the seed for reproducibility 5
random.seed(5)

# Step 1: Define the total range of indices
# total_indices = list(range(22688))

# Step 2: Randomly sample 20% of the elements for training indices
train_indices_2 = random.sample(test_indices, int(len(test_indices) * 0.2))

# Step 3: Determine the remaining elements for testing indices
test_indices_2 = list(set(test_indices) - set(train_indices_2))

# Optionally, sort the indices to maintain order
train_indices_2 = sorted(train_indices_2)
test_indices_2 = sorted(test_indices_2)

# Display the results
print(f"Total number of train indices 2: {len(train_indices_2)}")
print(train_indices_2)
print(f"Total number of test indices 2: {len(test_indices_2)}")
print(test_indices_2)


def count_entities(df, file_indices):
    """
    Counts the number of true entities for each category and overall entities in the given file indices.
    
    Args:
        df (pd.DataFrame): The DataFrame containing the entity information.
        file_indices (list): A list of file indices to consider for counting.
    
    Returns:
        dict: A dictionary containing the counts for each entity type and the total count.
    """
    # Filter the DataFrame for the given file indices
    df_filtered = df[df['file_idx'].isin(file_indices)]
    
    # Group by type and count the number of occurrences
    entity_counts = df_filtered['type'].value_counts().to_dict()

    # Calculate the total number of entities
    total_entities = sum(entity_counts.values())
    
    # Add the total count to the dictionary
    entity_counts['TOTAL'] = total_entities
    
    return entity_counts

# Calculate the number of true entities for each category and overall for train files
train_entity_counts_2 = count_entities(df_true, train_indices_2)

# Calculate the number of true entities for each category and overall for test files
test_entity_counts_2 = count_entities(df_true, test_indices_2)

# Display the results
print("Entity counts in train files 2:", train_entity_counts_2)
print("Entity counts in test files 2:", test_entity_counts_2)

# Save df_true_train and df_true_test to CSV
df_true_train_2 = df_true[df_true['file_idx'].isin(train_indices_2)]
df_true_test_2 = df_true[df_true['file_idx'].isin(test_indices_2)]

df_true_train_2.to_csv('data/train_set_2.csv', index=False)
df_true_test_2.to_csv('data/test_set_2.csv', index=False)

print("Train and test DataFrames have been saved to 'train_set_2.csv' and 'test_set_2.csv'.")


Total number of train indices 2: 3403
[1, 3, 4, 8, 15, 23, 29, 34, 39, 40, 44, 48, 49, 61, 63, 64, 70, 74, 76, 82, 87, 93, 95, 108, 110, 115, 123, 129, 136, 140, 144, 152, 165, 174, 215, 219, 222, 231, 234, 246, 253, 256, 263, 269, 277, 281, 283, 288, 292, 294, 306, 310, 333, 335, 340, 368, 371, 375, 381, 387, 389, 395, 396, 402, 407, 415, 418, 423, 426, 428, 440, 450, 462, 469, 470, 474, 484, 487, 488, 490, 503, 504, 512, 516, 518, 522, 531, 535, 555, 558, 569, 581, 583, 594, 600, 610, 611, 613, 614, 618, 624, 632, 638, 644, 648, 669, 671, 688, 691, 699, 706, 712, 718, 723, 724, 728, 741, 745, 750, 753, 768, 770, 785, 793, 796, 810, 815, 817, 822, 834, 839, 854, 861, 862, 863, 876, 887, 888, 889, 898, 909, 913, 917, 918, 927, 933, 935, 939, 943, 947, 950, 962, 964, 977, 1005, 1016, 1020, 1021, 1026, 1040, 1045, 1052, 1064, 1065, 1068, 1071, 1075, 1097, 1104, 1115, 1117, 1119, 1122, 1127, 1129, 1132, 1141, 1143, 1145, 1155, 1163, 1167, 1174, 1176, 1179, 1180, 1187, 1199, 1202, 1207, 12

In [36]:
# Write the sorted indices list to the output file
with open('data/train_indices_2.txt', 'w') as f:
    f.write(str(train_indices_2))

with open('data/test_indices_2.txt', 'w') as f:
    f.write(str(test_indices_2))

print(f"Sorted training and testing file indices list saved to respective locations.")

Sorted training and testing file indices list saved to respective locations.


In [39]:
# Load the detected entities
df_detected_ft = pd.read_csv('output/pii_ft_detected.csv')

# Filter the DataFrame to keep only entities where file_idx is in the test files
df_detected_ft_test = df_detected_ft[df_detected_ft['file_idx'].isin(test_indices_2)]

# Save the filtered DataFrame to a new CSV file
df_detected_ft_test.to_csv('output/pii_detected_ft_test.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_ft_test.csv'.")
df_detected_ft_test


Filtered detected entities have been saved to 'output/pii_detected_ft_test.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
1,7,Nathalie Sylla,NAME_STUDENT,"(52, 66)"
2,7,Nathalie Sylla,NAME_STUDENT,"(2281, 2295)"
3,7,Nathalie Sylla,NAME_STUDENT,"(3648, 3662)"
5,10,Diego Estrada,NAME_STUDENT,"(0, 13)"
6,10,Diego Estrada,NAME_STUDENT,"(2386, 2399)"
...,...,...,...,...
5787,22592,Diego Castro,NAME_STUDENT,"(4463, 4475)"
5788,22593,Bob,NAME_STUDENT,"(1872, 1875)"
5789,22593,Mark,NAME_STUDENT,"(1985, 1989)"
5791,22660,Amina Koko,NAME_STUDENT,"(32, 42)"


In [40]:
# Filter the DataFrame to keep only entities where file_idx is in the train files
df_detected_ft_train = df_detected_ft[df_detected_ft['file_idx'].isin(train_indices_2)]

# Save the filtered DataFrame to a new CSV file
df_detected_ft_train.to_csv('output/pii_detected_ft_train.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_ft_train.csv'.")
df_detected_ft_train


Filtered detected entities have been saved to 'output/pii_detected_ft_train.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
0,4,Henry Acosta,NAME_STUDENT,"(36, 48)"
4,8,Vanesa Chan,NAME_STUDENT,"(83, 94)"
13,70,Jennifer,NAME_STUDENT,"(4230, 4238)"
27,93,Silvia Villalobos,NAME_STUDENT,"(0, 17)"
31,123,Stefano Lovato,NAME_STUDENT,"(156, 170)"
...,...,...,...,...
5763,22366,Simi,NAME_STUDENT,"(446, 450)"
5778,22481,Chanllegue,NAME_STUDENT,"(32, 42)"
5790,22635,Anthony Abubakar,NAME_STUDENT,"(101, 117)"
5793,22670,Carmen Garcia,NAME_STUDENT,"(781, 794)"


In [48]:
# Filter Presidio-detected entities (lg) on 60% Test Dataset
df_pre_lg_detected = pd.read_csv('output/pii_detected_lg_test.csv')
df_pre_lg_detected_2 = df_pre_lg_detected[df_pre_lg_detected['file_idx'].isin(test_indices_2)]
df_pre_lg_detected_2.to_csv('output/pii_pre_lg_detected_2.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_pre_lg_detected_2.csv'.")
df_pre_lg_detected_2

Filtered detected entities have been saved to 'output/pii_pre_lg_detected_2.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
0,2,VISUALIZATION,PERSON,"(0, 13)"
2,5,https://www.greatplacetowork.com/resources/blo...,PERSON,"(4150, 4251)"
3,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)"
4,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)"
5,7,Nathalie Sylla\n\n,PERSON,"(52, 68)"
...,...,...,...,...
16996,22682,635.Th,URL,"(202, 208)"
16997,22682,challenge.Am,URL,"(335, 347)"
16998,22682,ideas.An,URL,"(419, 427)"
16999,22682,selected.Th,URL,"(574, 585)"


In [47]:
# Filter Presidio-detected entities (trf) on 60% Test Dataset
df_pre_trf_detected = pd.read_csv('output/pii_detected_trf_test.csv')
df_pre_trf_detected_2 = df_pre_trf_detected[df_pre_trf_detected['file_idx'].isin(test_indices_2)]
df_pre_trf_detected_2.to_csv('output/pii_pre_trf_detected_2.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_pre_trf_detected_2.csv'.")
df_pre_trf_detected_2

Filtered detected entities have been saved to 'output/pii_pre_trf_detected_2.csv'.


Unnamed: 0,file_idx,entity_text,type,positions,true_label,sentence,context
0,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)",F,1 https://www.santander.com/content/dam/santa...,I introduce different consulting groups that ...
1,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F,2 https://www.greatplacetowork.com/resources/b...,"Thanks to this course, I learned new tools for..."
2,7,Nathalie Sylla,PERSON,"(52, 66)",T,Design Thinking for innovation reflexion-Avril...,Design Thinking for innovation reflexion-Avril...
3,7,Buzan T.,PERSON,"(263, 271)",F,According to the definition of Buzan T. and Bu...,What exactly is a mind map? According to the d...
4,7,Buzan B.,PERSON,"(276, 284)",F,"and Buzan B. (1999, Dessine-moi l'intelligence.",According to the definition of Buzan T. and Bu...
...,...,...,...,...,...,...,...
12190,22660,Amina Koko,PERSON,"(2442, 2452)",F,As Amina Koko explain in the last video of the...,I chose this method to get a big range of poss...
12191,22660,Amina Koko,PERSON,"(3697, 3707)",T,Design Thinking MOOC 2017 06 23 Amina Koko App...,They were the best persons to be able to spea...
12192,22661,Jake Knapp,PERSON,"(501, 511)",F,The first tool I used was the tool of Visualiz...,). The first tool I used was the tool of Visua...
12193,22664,Andre Martin,PERSON,"(1539, 1551)",F,I have particularly drawn from the approach of...,And provide them an opportunity to think thro...


In [44]:
# Filter Fine-tune only detected entities on 60% Test Dataset
pii_ft_detected = pd.read_csv('output/pii_ft_detected.csv')
pii_ft_detected_2 = pii_ft_detected[pii_ft_detected['file_idx'].isin(test_indices_2)]
pii_ft_detected_2.to_csv('output/pii_ft_detected_2.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_ft_detected_2.csv'.")
pii_ft_detected_2

Filtered detected entities have been saved to 'output/pii_ft_detected_2.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
1,7,Nathalie Sylla,NAME_STUDENT,"(52, 66)"
2,7,Nathalie Sylla,NAME_STUDENT,"(2281, 2295)"
3,7,Nathalie Sylla,NAME_STUDENT,"(3648, 3662)"
5,10,Diego Estrada,NAME_STUDENT,"(0, 13)"
6,10,Diego Estrada,NAME_STUDENT,"(2386, 2399)"
...,...,...,...,...
5787,22592,Diego Castro,NAME_STUDENT,"(4463, 4475)"
5788,22593,Bob,NAME_STUDENT,"(1872, 1875)"
5789,22593,Mark,NAME_STUDENT,"(1985, 1989)"
5791,22660,Amina Koko,NAME_STUDENT,"(32, 42)"


In [45]:
# Filter Prompting only detected entities on 60% Test Dataset
pii_pt_detected = pd.read_csv('output/pii_pt_detected.csv')
pii_pt_detected_2 = pii_pt_detected[pii_pt_detected['file_idx'].isin(test_indices_2)]
pii_pt_detected_2.to_csv('output/pii_pt_detected_2.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_pt_detected_2.csv'.")
pii_pt_detected_2

Filtered detected entities have been saved to 'output/pii_pt_detected_2.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
1,7,Nathalie Sylla,NAME_STUDENT,"(52, 66)"
2,7,Nathalie Sylla,NAME_STUDENT,"(2281, 2295)"
3,7,Nathalie Sylla,NAME_STUDENT,"(3648, 3662)"
5,10,Diego Estrada,NAME_STUDENT,"(0, 13)"
6,10,Diego Estrada,NAME_STUDENT,"(2386, 2399)"
...,...,...,...,...
4336,22593,Bob,NAME_STUDENT,"(1872, 1875)"
4337,22593,Mark,NAME_STUDENT,"(1985, 1989)"
4339,22660,Amina Koko,NAME_STUDENT,"(32, 42)"
4340,22660,Amina Koko,NAME_STUDENT,"(2443, 2453)"


In [50]:
import ast
entities = []
with open('output/pii_detected_trf.txt', 'r') as file:
    for line in file:
        entity = ast.literal_eval(line.strip())
        entities.append(entity)

# Create the DataFrame
df = pd.DataFrame(entities, columns=['file_idx', 'entity_text', 'type', 'positions'])

# Save DataFrame to a CSV file
df.to_csv('output/pii_detected_trf.csv', index=False)

In [51]:
# Filter Presidio trf with URL detected entities on 60% Test Dataset
pii_trf = pd.read_csv('output/pii_detected_trf.csv')
pii_trf_2 = pii_trf[pii_trf['file_idx'].isin(test_indices_2)]
pii_trf_2.to_csv('output/pii_detected_trf_2.csv', index=False)

print("Filtered detected entities have been saved to 'output/pii_detected_trf_2.csv'.")
pii_trf_2

Filtered detected entities have been saved to 'output/pii_detected_trf_2.csv'.


Unnamed: 0,file_idx,entity_text,type,positions
1,7,Nathalie Sylla,NAME_STUDENT,"(52, 66)"
2,7,Nathalie Sylla,NAME_STUDENT,"(2281, 2295)"
3,7,Nathalie Sylla,NAME_STUDENT,"(3648, 3662)"
5,10,Diego Estrada,NAME_STUDENT,"(0, 13)"
6,10,Diego Estrada,NAME_STUDENT,"(2386, 2399)"
...,...,...,...,...
4336,22593,Bob,NAME_STUDENT,"(1872, 1875)"
4337,22593,Mark,NAME_STUDENT,"(1985, 1989)"
4339,22660,Amina Koko,NAME_STUDENT,"(32, 42)"
4340,22660,Amina Koko,NAME_STUDENT,"(2443, 2453)"


In [53]:
pii_trf_2_url = pii_trf_2[pii_trf_2['type'] == 'URL']
pii_trf_2_url

Unnamed: 0,file_idx,entity_text,type,positions
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)"
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)"
10,9,tool.by,URL,"(750, 757)"
22,27,whole.It,URL,"(536, 544)"
23,28,records..et,URL,"(133, 144)"
...,...,...,...,...
19795,22682,635.Th,URL,"(202, 208)"
19796,22682,challenge.Am,URL,"(335, 347)"
19797,22682,ideas.An,URL,"(419, 427)"
19798,22682,selected.Th,URL,"(574, 585)"


In [59]:
http_rows = pii_trf_2_url[pii_trf_2_url['entity_text'].str.contains('http', na=False)]
http_rows

Unnamed: 0,file_idx,entity_text,type,positions
1,5,https://www.santander.com/content/dam/santande...,URL,"(3035, 3158)"
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)"
127,154,https://www.coursera.org/learn/uva-darden-desi...,URL,"(4504, 4587)"
152,194,https://en.wikipedia.org/wiki/Storytelling,URL,"(5708, 5750)"
157,200,http://www.fao.org/kenya/fao-in-kenya/kenya-at...,URL,"(2189, 2248)"
...,...,...,...,...
19632,22425,http://designresearchtechniques.com/casestudie...,URL,"(2539, 2600)"
19637,22440,https://trello.com/,URL,"(2013, 2032)"
19638,22440,https://www.draw.io,URL,"(2424, 2443)"
19678,22492,https://www.youtube.com/watch?v=DCUcrfBKyeL,URL,"(746, 789)"
