In [4]:
# Main file that need help with.
# Input -> Presidio -> Detected Entities -> GPT-4o -> Metric calculations
# Only do WITHIN context filteration (need to extract the sentence where the detected entity belongs to and feed to GPT).
# No need to do out-of-context filteration
import ast
import json
import pandas as pd
from openai import OpenAI
# Load the API key from input/secret.json
# secret_json = read_json_file('input/secret.json')
with open('input/secret.json', 'r', encoding='UTF-8') as file:
    secret_json = json.load(file)
api_key = secret_json['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)

def read_file(filepath: str):
    return pd.read_json(filepath, orient="records")

df = read_file("data/obfuscated_data_06.json")

In [19]:
# Be aware of the generated cost.

def check_pii_entities_gpt4o(detected_entities):
    # Separate entities by category
    person_entities = [entity for entity in detected_entities if entity[2] == 'PERSON']
    email_entities = [entity for entity in detected_entities if entity[2] == 'EMAIL_ADDRESS']
    url_entities = [entity for entity in detected_entities if entity[2] == 'URL']
    phone_entities = [entity for entity in detected_entities if entity[2] == 'PHONE_NUMBER']

    # Function to extract the sentence that the PII appears in
    def extract_sentence(text, s, e):
        # Find the start of the sentence
        sentence_start = s
        while sentence_start > 0 and text[sentence_start - 1] not in '.!?':
            sentence_start -= 1
        
        # Find the end of the sentence
        sentence_end = e
        while sentence_end < len(text) and text[sentence_end] not in '.!?':
            sentence_end += 1
        
        # Extract and return the sentence
        return text[sentence_start:sentence_end + 1].strip()


    # Function to call GPT-4o and check for PII
    def check_entities_with_gpt(entities, category, prompt_template):
        # Prepare input text for GPT by batching entities
        prompt = prompt_template + "\n\n"
        for entity in entities:
            idx, entity_text, _, positions = entity
            (s,e) = positions
            sentence = extract_sentence(df.iloc[idx].full_text, s, e)
            
            prompt += f"({entity_text}, {sentence})\n"

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )

        gpt_results = response.choices[0].message.content.strip().split('\n')

        print(category)
        print(gpt_results)
        print("\n")

        # Process the response and keep entities marked as True (PII detected)
        filtered_entities = [
            entity for entity, result in zip(entities, gpt_results) if result.strip().upper() == 'T'
        ]
        
        return filtered_entities

    # Define prompt templates for each category
    person_prompt = "For each of the following pairs, please decide if the first value is a student's name in the context of the second value (return just a list of 'T's for Trues and 'F's for Falses):"
    email_prompt = "For each of the following pairs, please decide if the first value is a personal email address in the context of the second value (return just a list of 'T's for Trues and 'F's for Falses):"
    url_prompt = "For each of the following pairs, please decide if the first value is a personal URL in the context of the second value (return just a list of 'T's for Trues and 'F's for Falses):"
    phone_prompt = "For each of the following pairs, please decide if the first value is a personal phone number in the context of the second value (return just a list of 'T's for Trues and 'F's for Falses):"

    # Process each category with GPT-4o
    valid_person_entities = check_entities_with_gpt(person_entities, 'PERSON', person_prompt)
    valid_email_entities = check_entities_with_gpt(email_entities, 'EMAIL_ADDRESS', email_prompt)
    valid_url_entities = check_entities_with_gpt(url_entities, 'URL', url_prompt)
    valid_phone_entities = check_entities_with_gpt(phone_entities, 'PHONE_NUMBER', phone_prompt)

    print(valid_url_entities)
    print("\n")
    
    # Combine all valid entities into a single list
    valid_entities = valid_person_entities + valid_email_entities + valid_url_entities + valid_phone_entities
    
    # TODO: Might need to sort based on file_idx (first item in tuple) before return.
    return valid_entities


detected_entities = []
# with open('output/pii_detected_trf_filtered.txt', 'r') as file:
#     for line in file:
#         entity = ast.literal_eval(line.strip())
#         detected_entities.append(entity)

with open('output/pii_detected_trf_filtered.txt', 'r') as file:
        for line in file:
            entity = ast.literal_eval(line.strip())
            idx, _, _, _ = entity
            if idx < 8: # Start from a few examples to try
                detected_entities.append(entity)

# Call the function to filter valid entities
filtered_entities = check_pii_entities_gpt4o(detected_entities)
print(f"Filtered entities: {filtered_entities}")

# Save the filtered out entities in 'output/pii_detected_gpt.txt' or any suitable file name.
with open('output/pii_detected_gpt.txt', 'w') as f:
    for entity in filtered_entities:
        f.write(f"{entity}\n")


PERSON
['T', 'T', 'F', 'F', 'T', 'T']


EMAIL_ADDRESS
['Of course, please provide the pairs for me to evaluate.']


URL
['T', 'T']


PHONE_NUMBER
['Sure, please provide the pairs you would like me to evaluate.']


[(5, 'https://www.santander.com/content/dam/santander-com/es/documentos/informe-anual-de-sostenibilidad/2019/ias-2019-informe-de-', 'URL', (3035, 3158)), (5, 'https://www.greatplacetowork.com/resources/blog/why-is-diversity-inclusion-in-the-workplace-important', 'URL', (4150, 4251))]


Filtered entities: [(0, 'Angela Meyer', 'PERSON', (1039, 1051)), (7, 'Nathalie Sylla', 'PERSON', (52, 66)), (7, 'Nathalie Sylla', 'PERSON', (2281, 2295)), (7, 'Nathalie Sylla', 'PERSON', (3648, 3662)), (5, 'https://www.santander.com/content/dam/santander-com/es/documentos/informe-anual-de-sostenibilidad/2019/ias-2019-informe-de-', 'URL', (3035, 3158)), (5, 'https://www.greatplacetowork.com/resources/blog/why-is-diversity-inclusion-in-the-workplace-important', 'URL', (4150, 4251))]


In [11]:
# Example of loading a text file
import pandas as pd

def read_file(filepath: str):
    return pd.read_json(filepath, orient="records")

df = read_file("data/obfuscated_data_06.json")
print(df.shape)
print(list(df.columns))

(22688, 5)
['full_text', 'document', 'tokens', 'trailing_whitespace', 'labels']


In [12]:
# Example of loading a text file (Continued)
file_idx = 22599
input_text = df.iloc[file_idx].full_text
print(input_text)

Tool: Visualization (Visual Thinking) – Module 1

Challenge & Selection:

The tool that I have used and selected for challenge is Visualization that I feel plays the major role in  the entire process of design thinking tools. I work in a creative domain where the bulk of my work  depends on the visual representation of any product or approach or business deal. We work on logos,  presentation, graphic designs, illustration etc., To outline the challenge, we’ve started getting more of  negative surveys ratings on the output which we have shared in the recent days. Then I called for a  stakeholder meeting and wanted to use the visual thinking approach to discuss regarding the  challenge and to come up with different solutions. To get those solutions, I use this approach to  make everyone to contribute and think outside box and come up with a new suggestions based on  the data which is available and apart from the data what are other human centric reasons for the dip  in the survey. To ela

In [None]:
# If you can record the cost of provide a reasonable estimate of the cost generated from GPT, that'll be great.

In [None]:
# When you finish, please go to evaluator.ipynb and run  calculate_metrics('output/pii_detected_gpt.txt')
# and go to evaluator_categories.ipynb and run  evaluate_detected_entities('output/pii_detected_gpt.txt')
# The two lines of code are marked with TODO.