In [1]:
import pandas as pd
import json
from openai import OpenAI
from typing import List, Tuple
import tiktoken  # Tokenizer library

# Read the OpenAI credential from a local secrets file so that it never
# appears in the notebook itself; the file must define 'OPENAI_API_KEY'.
with open('input/secret.json', mode='r', encoding='UTF-8') as file:
    secret_json = json.loads(file.read())

# Shared API client used by the cells below.
api_key = secret_json['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)

In [7]:
# Build the chat-format JSONL file used to fine-tune the PII verification model.
# Each CSV row (one detected entity plus its ground-truth label) becomes one
# training example: system prompt, user question, expected assistant answer.
df_train = pd.read_csv('output/pii_detected_trf_train.csv')

# Human-readable description of each entity type, interpolated into the prompt.
category_dict = {
    'PERSON': "student's name",
    'EMAIL_ADDRESS': "personal email address",
    'URL': "personal URL",
    'PHONE_NUMBER': "personal phone number"
}

# Fail fast before writing anything: an unknown entity type would otherwise
# abort mid-loop with a bare KeyError, and a missing label would be serialized
# as the non-standard JSON token NaN, producing an invalid training file.
unknown_types = set(df_train['type'].unique()) - set(category_dict)
if unknown_types:
    raise ValueError(
        f"Unsupported entity types in training data: {sorted(map(str, unknown_types))}"
    )
if df_train['true_label'].isna().any():
    raise ValueError("Training data contains rows with a missing 'true_label'.")

# Prepare the JSONL data for fine-tuning (one dict per training example).
# NOTE: the prompt text must stay identical to the one used at inference time.
jsonl_data = []

for entity_text, entity_type, context, true_label in zip(
    df_train['entity_text'],
    df_train['type'],
    df_train['context'],
    df_train['true_label'],
):
    jsonl_data.append({
        "messages": [
            {"role": "system", "content": "You are an expert in verifying Personally Identifiable Information."},
            {"role": "user", "content": f"Determine if '{entity_text}' is a {category_dict[entity_type]} in this context: '{context}'? Output only 'T' for True or 'F' for False without additional output. Be very careful of outputting F due to PII leakage."},
            {"role": "assistant", "content": true_label}
        ]
    })

# Write one JSON object per line; the encoding is explicit so the result does
# not depend on the platform default.
output_file = 'output/pii_detected_train.jsonl'
with open(output_file, 'w', encoding='UTF-8') as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry) + '\n')


In [42]:
# Run the fine-tuned verifier over every detected entity in the test set and
# keep only the entities it confirms as PII ('T').

# SECURITY: an API key used to be hardcoded in this cell. Treat that key as
# compromised (revoke/rotate it). The credential is now read from the local
# secrets file, the same way the first cell of this notebook does.
with open('input/secret.json', 'r', encoding='UTF-8') as file:
    client = OpenAI(api_key=json.load(file)['OPENAI_API_KEY'])

# Load your CSV file
df_test = pd.read_csv('output/pii_detected_trf_test.csv')
# df_test_sample = df_test[df_test['file_idx'].isin([5,7,8,11,16,19,23,31,39,42])]

# Entities confirmed as PII by the model.
results = []
# Columns of the output file; pinned so the CSV keeps its header even when no
# entity is confirmed (an empty, header-less CSV cannot be read back).
result_columns = ['file_idx', 'entity_text', 'type', 'positions']
# Index labels of rows that produced no usable verdict (API error or a reply
# other than 'T'/'F'). They are excluded from the output, as before, but are
# recorded so that they can be inspected or re-run instead of silently lost.
unresolved_rows = []

# Human-readable description of each entity type, interpolated into the prompt.
category_dict = {
    'PERSON': "student's name",
    'EMAIL_ADDRESS': "personal email address",
    'URL': "personal URL",
    'PHONE_NUMBER': "personal phone number"
}

total_len = len(df_test)

for index, row in df_test.iterrows():
    print(f"Processing row {index}/{total_len}:")
    entity_text = row['entity_text']
    entity_type = row['type']
    context = row['context']
    true_label = row['true_label']

    try:
        # NOTE: the prompt must stay identical to the one used for fine-tuning.
        response = client.chat.completions.create(
            model="ft:gpt-4o-mini-2024-07-18:personal::A86tyITx",
            messages=[
                {"role": "system", "content": "You are an expert in verifying Personally Identifiable Information."},
                {"role": "user", "content": f"Determine if '{entity_text}' is a {category_dict[entity_type]} in this context: '{context}'? Output only 'T' for True or 'F' for False without additional output. Be very careful of outputting F due to PII leakage."}
            ],
            temperature = 0
        )
        # The message content may be None; treat that as an unexpected reply
        # rather than failing on .strip().
        gpt_output = (response.choices[0].message.content or '').strip().upper()

        print(f"Text: {entity_text}, Type: {entity_type}, True: {true_label}, Pred: {gpt_output}")

        # Keep the entity only when the model confirms it as PII.
        if gpt_output == 'T':
            results.append({
                'file_idx': row['file_idx'],
                'entity_text': entity_text,
                'type': entity_type,
                'positions': row['positions']
            })
        elif gpt_output != 'F':
            print(f"Unexpected response for row {index}: {gpt_output}")
            unresolved_rows.append(index)

    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"Error processing row {index}: {e}")
        unresolved_rows.append(index)

# Convert results to DataFrame
filtered_df = pd.DataFrame(results, columns=result_columns)

# Save the results to a new CSV file
filtered_df.to_csv('output/pii_pre+ft_detected.csv', index=False)

if unresolved_rows:
    print(f"Warning: {len(unresolved_rows)} of {total_len} rows got no valid T/F verdict "
          "and were excluded from the output; see 'unresolved_rows'.")

print("Processing complete. Results saved to 'output/pii_pre+ft_detected.csv'.")


Processing row 0/12197:
Text: https://www.santander.com/content/dam/santander-com/es/documentos/informe-anual-de-sostenibilidad/2019/ias-2019-informe-de-, Type: URL, True: F, Pred: F
Processing row 1/12197:
Text: https://www.greatplacetowork.com/resources/blog/why-is-diversity-inclusion-in-the-workplace-important, Type: URL, True: F, Pred: F
Processing row 2/12197:
Text: Nathalie Sylla, Type: PERSON, True: T, Pred: T
Processing row 3/12197:
Text: Buzan T., Type: PERSON, True: F, Pred: F
Processing row 4/12197:
Text: Buzan B., Type: PERSON, True: F, Pred: F
Processing row 5/12197:
Text: Nathalie Sylla, Type: PERSON, True: T, Pred: T
Processing row 6/12197:
Text: Nathalie Sylla, Type: PERSON, True: T, Pred: T
Processing row 7/12197:
Text: Vanesa Chan, Type: PERSON, True: T, Pred: T
Processing row 8/12197:
Text: Aristotle, Type: PERSON, True: F, Pred: F
Processing row 9/12197:
Text: James Cook, Type: PERSON, True: T, Pred: T
Processing row 10/12197:
Text: Challenge, Type: PERSON, True: F,

In [2]:
# Reload both sides of the comparison from disk so that this cell does not
# depend on variables left over from the inference cell.
df_test = pd.read_csv('output/pii_detected_trf_test.csv')
filtered_df = pd.read_csv('output/pii_pre+ft_detected.csv')

# Ground truth: entities labelled as real PII in the test set.
true_labels = df_test[df_test['true_label'] == 'T']

# Left-join the ground truth against the entities kept by the model. Rows
# flagged 'left_only' exist in the ground truth but have no match among the
# kept entities, i.e. they are false negatives (PII that would leak).
false_negatives = (
    true_labels
    .merge(
        filtered_df,
        how='left',
        on=['file_idx', 'entity_text', 'positions'],
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Display the results
false_negatives


Unnamed: 0,file_idx,entity_text,type_x,positions,true_label,sentence,context,type_y,_merge
8,22,Gianni,PERSON,"(934, 940)",T,Application\n\nAs I describe the challenge “Mo...,3. Application\n\nAs I describe the challenge ...,,left_only
38,472,https://youtu.be/rFD2lJuvace,URL,"(4886, 4914)",T,Check out here – https://youtu.be/rFD2lJuvace ...,"Also, use social media to present your stories...",,left_only
63,761,https://www.youtube.com/watch?v=Kx0SXy87bVZ,URL,"(1186, 1229)",T,The visualization process we used can be seen ...,"The second day, after 6 teams of 3-7 people ea...",,left_only
69,1105,Fatima,PERSON,"(810, 816)",T,Fatima took upon herself to be the home tutor...,Schools in India forces students to spend an i...,,left_only
73,1105,Fatima,PERSON,"(2815, 2821)",T,"Approach With the above mind mapping tool, ...",She felt her role of training drivers helped ...,,left_only
...,...,...,...,...,...,...,...,...,...
2965,20869,https://www.youtube.com/watch?v=Cl-pzEZu1xK,URL,"(2117, 2160)",T,I\n\nrecommend the Youtube Video: The magi...,His way of using storytelling and how t...,,left_only
2967,20903,Bang,PERSON,"(3110, 3114)",T,A new person named Bang was introduced at tha...,"So, he made this decision and started planning...",,left_only
2994,21228,001-204-337-0177x663,PHONE_NUMBER,"(20, 40)",T,Kimberly Alina\n\nTEL 001-204-337-0177x663\n\n...,Kimberly Alina\n\nTEL 001-204-337-0177x663\n\n...,,left_only
3012,21512,Grazia Doni,PERSON,"(79, 90)",T,STORYTELLING - CHALLENGE\n\nREDUCING THE RATE ...,STORYTELLING - CHALLENGE\n\nREDUCING THE RATE ...,,left_only
