In [1]:
import pandas as pd
import json

# Load the DataFrame from CSV
df = pd.read_csv('output/pii_detected_trf_filtered.csv')

# Define the category dictionary
category_dict = {
    'PERSON': "student's name",
    'EMAIL_ADDRESS': "personal email address",
    'URL': "personal URL",
    'PHONE_NUMBER': "personal phone number"
}

# Prepare the JSONL data for fine-tuning
jsonl_data = []

for index, row in df.iterrows():
    entity_text = row['entity_text']
    entity_type = row['type']
    sentence = row['sentence']
    true_label = row['true_label']
    
    # Generate the structured message
    message = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Determine if '{entity_text}' is a {category_dict[entity_type]} in this context: '{sentence}'? Output only 'T' for True or 'F' for False without additional output."},
            {"role": "assistant", "content": true_label}
        ]
    }
    
    # Append to the JSONL data list
    jsonl_data.append(message)

# Write to a JSONL file
output_file = 'output/pii_detected_all.jsonl'
with open(output_file, 'w') as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + '\n')


In [None]:
# !pip3 install scikit-learn

In [24]:
from sklearn.model_selection import train_test_split

# Combine 'true_label' and 'type' columns to create a stratification key
df['stratify_key'] = df['true_label'] + '_' + df['type']

# Perform stratified sampling based on the combined key
train_df, test_df = train_test_split(
    df,
    test_size=0.75,  # 75% for testing, 25% for training
    stratify=df['stratify_key'],  # Stratify by the combined key
    random_state=42  # For reproducibility
)

# Store the indices of the selected rows for training data
train_indices = train_df.index.tolist()
test_indices = test_df.index.tolist()

# Count the number of 'T's and 'F's in the 'true_label' column for the selected rows
true_label_counts = train_df['true_label'].value_counts()
type_counts = train_df['type'].value_counts()

# Generate a summary table for the number of 'T's and 'F's for each 'type'
summary_table = train_df.groupby(['type', 'true_label']).size().reset_index(name='count')

# Output the results
print(f"Training indices: {train_indices}")
print(f"Testing indices: {test_indices}")
print(f"Number of 'T's in training data: {true_label_counts.get('T', 0)}")
print(f"Number of 'F's in training data: {true_label_counts.get('F', 0)}")
print(f"Type counts in training data:\n{type_counts}")
print("\nSummary of 'T' and 'F' counts for each type:")
print(summary_table)

# Optionally, remove the 'stratify_key' column after sampling
df.drop(columns=['stratify_key'], inplace=True)


Training indices: [15043, 16626, 7139, 1231, 14954, 6957, 2055, 12409, 14963, 2205, 6588, 15273, 6693, 12493, 5619, 5367, 11448, 2341, 2809, 10533, 6501, 6741, 15764, 15115, 8075, 7685, 12220, 15789, 8515, 6087, 1633, 3162, 14028, 13885, 947, 16094, 12888, 1641, 12374, 14053, 13448, 8577, 8684, 9492, 13604, 7600, 13647, 971, 10689, 10724, 10535, 12417, 2325, 16079, 3994, 5315, 9927, 3947, 9656, 14855, 9945, 3661, 4181, 4385, 1827, 14786, 15366, 4044, 13926, 11777, 7464, 8668, 2730, 15118, 14127, 7185, 5572, 11291, 13028, 13472, 250, 375, 10433, 2839, 5877, 5421, 1809, 1122, 11487, 12536, 15517, 3144, 10832, 1733, 11380, 1799, 13203, 2179, 5311, 8693, 12852, 7230, 10257, 11949, 14510, 8603, 10590, 3100, 2246, 13491, 12513, 6196, 8914, 1207, 2163, 16420, 1946, 7828, 7204, 8197, 11621, 14776, 9704, 11510, 928, 7558, 1408, 305, 5849, 453, 2224, 12446, 8285, 12494, 13208, 13488, 14041, 8511, 3030, 16320, 10338, 8059, 4384, 6927, 8590, 6546, 4986, 2460, 11927, 5028, 12085, 14607, 10219, 3155

In [29]:
# Sort the indices
sorted_train_indices = sorted(train_indices)
sorted_test_indices = sorted(test_indices)

# Print the sorted indices
print(sorted_train_indices)
print(sorted_test_indices)

# Write the sorted indices list to the output file
with open('data/train_indices.txt', 'w') as f:
    f.write(str(sorted_train_indices))

with open('data/test_indices.txt', 'w') as f:
    f.write(str(sorted_test_indices))

print(f"Sorted training indices list has been saved to respective locations.")


[0, 1, 6, 8, 9, 13, 20, 22, 26, 31, 35, 44, 46, 48, 49, 50, 52, 53, 56, 58, 65, 66, 68, 70, 74, 80, 87, 89, 93, 94, 105, 107, 111, 114, 117, 118, 124, 127, 131, 132, 135, 136, 142, 145, 147, 151, 153, 154, 167, 170, 177, 188, 191, 194, 199, 203, 211, 217, 222, 229, 242, 245, 247, 248, 250, 254, 257, 258, 262, 265, 270, 273, 286, 296, 297, 301, 303, 304, 305, 310, 317, 318, 320, 331, 351, 353, 356, 357, 360, 361, 371, 372, 374, 375, 381, 383, 385, 395, 399, 402, 403, 409, 417, 419, 421, 422, 423, 424, 426, 430, 434, 435, 440, 444, 446, 447, 450, 451, 453, 456, 465, 468, 470, 471, 483, 486, 493, 494, 497, 505, 511, 514, 517, 519, 526, 536, 537, 545, 546, 547, 548, 549, 553, 554, 555, 556, 561, 566, 568, 569, 570, 574, 577, 583, 584, 591, 594, 599, 601, 605, 609, 610, 611, 612, 613, 616, 618, 620, 625, 629, 631, 632, 635, 639, 640, 641, 644, 653, 656, 657, 667, 670, 678, 692, 694, 695, 697, 704, 707, 714, 717, 720, 728, 729, 731, 733, 735, 742, 745, 746, 751, 757, 763, 764, 766, 768, 771,

In [14]:
# Define the category dictionary
category_dict = {
    'PERSON': "student's name",
    'EMAIL_ADDRESS': "personal email address",
    'URL': "personal URL",
    'PHONE_NUMBER': "personal phone number"
}

# Filter the DataFrame to include only the selected training indices
train_df = df.loc[sorted(train_indices)]

# Prepare the JSONL data for fine-tuning
jsonl_data = []

for index, row in train_df.iterrows():
    entity_text = row['entity_text']
    entity_type = row['type']
    sentence = row['sentence']
    true_label = row['true_label']
    
    # Generate the structured message
    message = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Determine if '{entity_text}' is a {category_dict[entity_type]} in this context: '{sentence}'? Output only 'T' for True or 'F' for False without additional output."},
            {"role": "assistant", "content": true_label}
        ]
    }
    
    # Append to the JSONL data list
    jsonl_data.append(message)

# Write to a JSONL file
output_file = 'output/pii_detected_train.jsonl'
with open(output_file, 'w') as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + '\n')

print(f"JSONL file has been written to {output_file}")


JSONL file has been written to output/pii_detected_train.jsonl


In [32]:
# Filter the DataFrame using the test indices
test_df = df.loc[sorted_test_indices]

# Save the filtered DataFrame to a CSV file
output_file='output/pii_detected_test.csv'
test_df.to_csv(output_file, index=False)

print(f"Test data saved to {output_file}")
test_df

Test data saved to output/pii_detected_test.csv


Unnamed: 0,file_idx,entity_text,type,positions,true_label,sentence
2,5,https://www.greatplacetowork.com/resources/blo...,URL,"(4150, 4251)",F,2 https://www.greatplacetowork.com/resources/b...
3,7,Nathalie Sylla,PERSON,"(52, 66)",T,Design Thinking for innovation reflexion-Avril...
4,7,Buzan T.,PERSON,"(263, 271)",F,According to the definition of Buzan T. and Bu...
5,7,Buzan B.,PERSON,"(276, 284)",F,"and Buzan B. (1999, Dessine-moi l'intelligence."
7,7,Nathalie Sylla,PERSON,"(3648, 3662)",T,Design Thinking for innovation reflexion-Avril...
...,...,...,...,...,...,...
16666,22661,Jake Knapp,PERSON,"(501, 511)",F,The first tool I used was the tool of Visualiz...
16667,22664,Andre Martin,PERSON,"(1539, 1551)",F,I have particularly drawn from the approach of...
16670,22676,Buddha,PERSON,"(1222, 1228)",F,Even Buddha is seen sharing stories about his ...
16671,22678,JOURNEY MAP,PERSON,"(10, 21)",F,EXAMPLE – JOURNEY MAP\n\nTHE CHALLENGE My w...
