In [1]:
import json
import os
import re

import openai
import pandas as pd

In [None]:
# %pip install --upgrade openai

In [2]:
# Set your OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into (and committed with) the notebook. The literal placeholder is
# kept only as a fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-openai-key")

# Function to generate prompts dynamically
def get_prompt(category):
    """Build the LLM prompt used to generate one synthetic value for a category.

    The category is lower-cased before being placed in the prompt. If the
    lower-cased category contains any of the name-like keywords (substring
    match), the prompt asks for "one name"; otherwise it asks for "one result".

    Args:
        category: Placeholder label such as "TEACHER" or "EMAIL ADDRESS".

    Returns:
        The prompt string.
    """
    lowered = category.lower()
    name_like_keywords = (
        "name", "teacher", "student", "friend", "husband", "child", "son",
        "daughter", "cousin", "sister", "brother", "instructor", "colleague",
        "partner", "employee", "researcher", "school", "account", "resort",
    )

    # Substring match, so e.g. "student name" and "teacher" both count as name-like.
    is_name_like = any(word in lowered for word in name_like_keywords)
    if not is_name_like:
        # Default prompt for non-name categories
        return f"Generate a {lowered}. Just give me one result with no additional output. Do not output anything like a chatbot, I only need the result without any other unnecessary outputs. Do NOT end your output with a punctuation mark."

    return f"Generate a {lowered}. Just give me one name with no additional output. Do not output anything like a chatbot, I only need the result without any other unnecessary outputs. Do NOT end your output with a punctuation mark."

# Function to generate synthetic data
def generate_synthetic_data(category):
    """Ask the chat model for one synthetic value for the given category.

    Works with both generations of the `openai` package: the v1+ module-level
    client (`openai.chat.completions.create`) is used when it exists, and the
    legacy `openai.ChatCompletion.create` call is used otherwise. In v1+ the
    legacy attribute still exists but raises when called, so `chat` is
    checked first.

    Args:
        category: Placeholder label (e.g. "TEACHER") passed to get_prompt.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt = get_prompt(category)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    if hasattr(openai, "chat"):
        # openai >= 1.0: module-level client, still configured via openai.api_key
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
    else:
        # openai < 1.0
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages
        )
    return response.choices[0].message.content.strip()

# Function to replace placeholders and record locations
def replace_placeholders(text, generator=None):
    """Replace every ``<PLACEHOLDER>`` in ``text`` with a synthetic value.

    Args:
        text: Input string containing placeholders written as ``<CATEGORY>``.
        generator: Optional callable ``category -> str`` that produces the
            replacement. Defaults to ``generate_synthetic_data`` (one LLM call
            per placeholder); pass a stub to run offline or deterministically.

    Returns:
        A tuple ``(replaced_text, placeholder_locations, synthetic_data)``:
          * ``replaced_text``: text with all placeholders substituted.
          * ``placeholder_locations``: ``(start, end, category)`` offsets of each
            placeholder in the ORIGINAL ``text``.
          * ``synthetic_data``: ``(start, end, category, value)`` offsets of each
            inserted value in ``replaced_text``.
    """
    if generator is None:
        # Looked up at call time so the default stays the notebook's LLM-backed helper.
        generator = generate_synthetic_data

    pattern = re.compile(r'<([^>]+)>')
    placeholder_locations = []
    synthetic_data = []

    # Build the output from pieces and join once, instead of re-slicing the
    # whole string for every placeholder.
    pieces = []
    source_cursor = 0   # how much of `text` has been copied so far
    output_length = 0   # length of the replaced text assembled so far

    for match in pattern.finditer(text):
        placeholder = match.group(1)
        placeholder_locations.append((match.start(), match.end(), placeholder))
        synthetic_value = generator(placeholder)

        literal = text[source_cursor:match.start()]
        pieces.append(literal)
        output_length += len(literal)

        # Record the synthetic PII positions (offsets into the replaced text)
        start = output_length
        end = start + len(synthetic_value)
        synthetic_data.append((start, end, placeholder, synthetic_value))

        pieces.append(synthetic_value)
        output_length = end
        source_cursor = match.end()

    pieces.append(text[source_cursor:])
    return "".join(pieces), placeholder_locations, synthetic_data

# Function to highlight PII instances in green
def highlight_text(text, locations, is_synthetic=False):
    """Wrap each located span of ``text`` in an ANSI green-background escape.

    Args:
        text: The string to highlight.
        locations: Spans in left-to-right order. Each item is
            ``(start, end, category)`` or, when ``is_synthetic`` is true,
            ``(start, end, category, value)``.
        is_synthetic: Selects which of the two tuple shapes is expected.

    Returns:
        ``text`` with every span surrounded by the highlight escape codes.
    """
    pieces = []
    cursor = 0

    for entry in locations:
        # Only the offsets are used; the remaining fields are unpacked to
        # enforce the expected tuple shape.
        if is_synthetic:
            begin, stop, _category, _value = entry
        else:
            begin, stop, _category = entry
        pieces.append(text[cursor:begin])
        pieces.append(f'\x1b[6;30;42m{text[begin:stop]}\x1b[0m')
        cursor = stop

    # Tail after the last highlighted span
    pieces.append(text[cursor:])
    return "".join(pieces)

# Function to highlight original placeholders
def highlight_placeholders(text, placeholder_locations):
    """Highlight the original ``<PLACEHOLDER>`` spans in ``text``.

    ``placeholder_locations`` holds ``(start, end, category)`` tuples and is
    forwarded unchanged to ``highlight_text``.
    """
    highlighted = highlight_text(text, placeholder_locations)
    return highlighted

# Function to highlight synthetic PII
def highlight_synthetic_pii(text, synthetic_data):
    """Highlight the inserted synthetic PII spans in ``text``.

    ``synthetic_data`` holds ``(start, end, category, value)`` tuples and is
    forwarded to ``highlight_text`` with ``is_synthetic=True``.
    """
    highlighted = highlight_text(text, synthetic_data, is_synthetic=True)
    return highlighted

# Example usage
# The toy text repeats <TEACHER> and <STUDENT>; replace_placeholders generates a
# value per occurrence, so repeated placeholders may receive different values.
text = """Hi <TEACHER>, please contact <STUDENT> at <EMAIL ADDRESS>. My date of birth is <DOB> and I live at <ADDRESS>. 
You can call me at <TELEPHONE>. <STUDENT> has an appointment with <TEACHER> tomorrow."""

# replaced_text / synthetic_info are the ground truth used by the later cells:
# synthetic_info offsets index into replaced_text, placeholder_locations into text.
replaced_text, placeholder_locations, synthetic_info = replace_placeholders(text)
highlighted_original = highlight_placeholders(text, placeholder_locations)
highlighted_text = highlight_synthetic_pii(replaced_text, synthetic_info)

In [3]:
# Print the recorded offsets, then the highlighted original text, the plain
# replaced text, and the highlighted replaced text.
print(f"placeholder_locations: {placeholder_locations}")
print(f"synthetic_info: {synthetic_info}\n")
print(highlighted_original)
print("-----------------------------------------------------------------------------------------")
print(replaced_text)
print("-----------------------------------------------------------------------------------------")
print(highlighted_text)

placeholder_locations: [(3, 12, 'TEACHER'), (29, 38, 'STUDENT'), (42, 57, 'EMAIL ADDRESS'), (79, 84, 'DOB'), (99, 108, 'ADDRESS'), (130, 141, 'TELEPHONE'), (143, 152, 'STUDENT'), (177, 186, 'TEACHER')]
synthetic_info: [(3, 15, 'TEACHER', 'Mr. Thompson'), (32, 38, 'STUDENT', 'Sophia'), (42, 65, 'EMAIL ADDRESS', 'springtime123@email.com'), (87, 103, 'DOB', 'January 15, 1992'), (118, 131, 'ADDRESS', '24 Oak Street'), (153, 167, 'TELEPHONE', '1-855-555-0199'), (169, 174, 'STUDENT', 'Emily'), (199, 211, 'TEACHER', 'Ms. Anderson')]

Hi [6;30;42m<TEACHER>[0m, please contact [6;30;42m<STUDENT>[0m at [6;30;42m<EMAIL ADDRESS>[0m. My date of birth is [6;30;42m<DOB>[0m and I live at [6;30;42m<ADDRESS>[0m. 
You can call me at [6;30;42m<TELEPHONE>[0m. [6;30;42m<STUDENT>[0m has an appointment with [6;30;42m<TEACHER>[0m tomorrow.
-----------------------------------------------------------------------------------------
Hi Mr. Thompson, please contact Sophia at springtime123@email.com. M

In [4]:
def check_PII_position(synthetic_info, text=None):
    """Print whether each recorded span actually contains its synthetic value.

    Args:
        synthetic_info: Iterable of ``(start, end, category, value)`` tuples.
        text: The string the offsets refer to. Defaults to the notebook-level
            ``replaced_text`` so existing one-argument calls keep working;
            pass it explicitly to check a different transcript without
            relying on global state.

    Prints one line per item: ``<match?>: <text[start:end]> ---- <value>``.
    """
    source = replaced_text if text is None else text
    for start, end, _category, value in synthetic_info:
        found = source[start:end]
        print(f"{found == value}: {found} ---- {value}")
        
# Sanity check: every span recorded in synthetic_info should print "True".
check_PII_position(synthetic_info)

True: Mr. Thompson ---- Mr. Thompson
True: Sophia ---- Sophia
True: springtime123@email.com ---- springtime123@email.com
True: January 15, 1992 ---- January 15, 1992
True: 24 Oak Street ---- 24 Oak Street
True: 1-855-555-0199 ---- 1-855-555-0199
True: Emily ---- Emily
True: Ms. Anderson ---- Ms. Anderson


### Another example: teacherstudentchat00006.tsv

In [None]:
# # Load the processed_transcripts here
# with open('original_transcripts.txt', 'r') as f:
#     original_transcripts = json.load(f)

# # Verify the content
# print(original_transcripts[4])

In [None]:
# text = original_transcripts[4]

# replaced_text, placeholder_locations, synthetic_info = replace_placeholders(text)
# highlighted_original = highlight_placeholders(text, placeholder_locations)
# highlighted_text = highlight_synthetic_pii(replaced_text, synthetic_info)

In [None]:
# # Print placeholder_locations, synthetic_info, highlighted original text, highlighted replaced text
# print(f"placeholder_locations: {placeholder_locations}")
# print(f"synthetic_info: {synthetic_info}\n")
# print(highlighted_original)
# print("-----------------------------------------------------------------------------------------")
# print(highlighted_text)

In [None]:
# def check_PII_position(synthetic_info):
#     for item in synthetic_info:
#         start, end, category, value = item
#         print(f"{replaced_text[start:end] == value}: {replaced_text[start:end]} ---- {value}")
        
# check_PII_position(synthetic_info)

### Presidio Implementation Here

In [7]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy
import json
import re
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Create configuration containing engine name and models
def get_configuration(spaCy_model: str):
    """Return the spaCy-only NLP engine configuration dictionary.

    The Spanish model is fixed to ``es_core_news_md``; the English model is
    whatever ``spaCy_model`` names.
    """
    spanish_entry = {"lang_code": "es", "model_name": "es_core_news_md"}
    english_entry = {"lang_code": "en", "model_name": spaCy_model}
    return {
        "nlp_engine_name": "spacy",
        "models": [spanish_entry, english_entry],
    }

def get_conf_file(spaCy_model: str, transformer_model: str = None):
    """Return the YAML config path for a spaCy + transformer model pair.

    The pair is validated BEFORE anything is downloaded, so an unsupported
    combination fails immediately instead of after fetching a model.

    Args:
        spaCy_model: spaCy model name, e.g. ``"en_core_web_lg"``.
        transformer_model: Hugging Face repo id of the transformer model.
            Required despite the ``None`` default in the signature.

    Returns:
        Relative path of the matching config file under ``Config/``.

    Raises:
        ValueError: if ``transformer_model`` is ``None``.
        KeyError: if the pair has no config file.
    """
    if transformer_model is None:
        raise ValueError("get_conf_file requires a transformer model name.")

    config_dict = {
        "en_core_web_lg + obi/deid_roberta_i2b2": "Config/lg+roberta.yaml",
        "en_core_web_lg + StanfordAIMI/stanford-deidentifier-base": "Config/lg+stanford.yaml",
        "en_core_web_trf + obi/deid_roberta_i2b2": "Config/trf+roberta.yaml",
        "en_core_web_trf + StanfordAIMI/stanford-deidentifier-base": "Config/trf+stanford.yaml",
    }

    # Look up first: raises KeyError for unsupported pairs before any download.
    conf_file = config_dict[spaCy_model + ' + ' + transformer_model]

    snapshot_download(repo_id=transformer_model)
    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformer_model)
    AutoModelForTokenClassification.from_pretrained(transformer_model)

    return conf_file

# Function to create NLP engine based on configuration
def create_nlp_engine(spaCy_model: str, transformer_model: str = None):
    """Create a Presidio NLP engine from a spaCy model, optionally with a transformer.

    Args:
        spaCy_model: ``"en_core_web_lg"`` or ``"en_core_web_trf"``.
        transformer_model: ``None`` for a spaCy-only engine, otherwise
            ``"obi/deid_roberta_i2b2"`` or
            ``"StanfordAIMI/stanford-deidentifier-base"``.

    Returns:
        The engine returned by ``NlpEngineProvider.create_engine()``.

    Raises:
        ValueError: if either model name is not in the supported lists. The
            rejected name is included in the message.
    """
    supported_spacy = ["en_core_web_lg", "en_core_web_trf"]
    supported_transformers = ["obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base"]

    if spaCy_model not in supported_spacy:
        raise ValueError(
            f"Input spaCy model is not supported: {spaCy_model!r}. "
            f"Supported: {supported_spacy}"
        )
    if transformer_model is not None and transformer_model not in supported_transformers:
        raise ValueError(
            f"Input transformer model is not supported: {transformer_model!r}. "
            f"Supported: {supported_transformers}"
        )

    # spaCy model only
    if transformer_model is None:
        configuration = get_configuration(spaCy_model)
        provider = NlpEngineProvider(nlp_configuration=configuration)

    # spaCy model with transformer
    else:
        conf_file = get_conf_file(spaCy_model, transformer_model)
        provider = NlpEngineProvider(conf_file=conf_file)

    nlp_engine = provider.create_engine()
    return nlp_engine

# Using only spaCy model
nlp_engine_spacy_only = create_nlp_engine(spaCy_model = "en_core_web_lg")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
# NOTE(review): de_identify_pii() below constructs its own AnalyzerEngine(), so this
# module-level `analyzer` is not the one used there — confirm which is intended.
analyzer = AnalyzerEngine(
    nlp_engine = nlp_engine_spacy_only, # nlp_engine_spacy_only or nlp_engine_with_transformer
    supported_languages=["en", "es"]
)

In [49]:
# Change tutor's and student's names to different fake names.
# !pip install faker
from faker import Faker

# Allow list: words passed to the analyzer so they are not reported as PII.
# Each relative-day word is listed capitalized first, then lower-case.
allow_list = [
    variant
    for word in ("today", "yesterday", "tomorrow")
    for variant in (word.capitalize(), word)
]
    
def de_identify_pii(text_transcript):
    """Detect PII in a transcript with Presidio and replace it with fake values.

    Steps, in order:
      1. Analyze ``text_transcript`` (language "en", with ``allow_list``).
      2. Extend each PERSON span backwards to include a preceding
         "Ms. ", "Mr. " or "Mrs. " (this mutates the analyzer results).
      3. Build per-call mappings original -> fake for names, e-mail addresses
         and phone numbers, so a repeated original maps to the same fake value.
      4. Anonymize with those mappings; every DATE_TIME gets the same random
         date and every LOCATION gets the same fake city.

    Args:
        text_transcript: The text to de-identify.

    Returns:
        ``(results_analyzed, results_anonymized)``: the (mutated) analyzer
        results and the value returned by ``AnonymizerEngine.anonymize``.

    NOTE(review): a fresh default ``AnalyzerEngine()`` is created on every call;
    the module-level ``analyzer`` configured earlier in the notebook is not used.
    """
    # Initialize the analyzer and anonymizer
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    # Define date range for generating random dates and generate a random date
    d1 = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
    d2 = datetime.strptime('1/1/2009 4:50 AM', '%m/%d/%Y %I:%M %p')
    # d1 plus a random whole number of days (at most the full days between d1 and d2), as MM/DD/YYYY
    random_date = (d1 + timedelta(days=random.randint(0, (d2 - d1).days))).strftime('%m/%d/%Y')

    fake = Faker()

    # Function to generate a unique fake name
    def generate_fake_name(existing_names, first_name):
        """Return a first name (if ``first_name``) or last name not in ``existing_names``."""
        if first_name:
            # Keep drawing until the name has not been used yet
            while True:
                fake_name = names.get_first_name()
                if fake_name not in existing_names:
                    return fake_name
        else:
            while True:
                fake_name = names.get_last_name()
                if fake_name not in existing_names:
                    return fake_name
    
    # Function to generate a unique fake email
    def generate_fake_email(fake_name):
        """Return ``<lower-cased fake_name>@<random domain>``."""
        domains = ["gmail.com", "sina.com", "outlook.com"]
        return f"{fake_name.lower()}@{random.choice(domains)}"
    
    # Function to generate a unique fake location
    def generate_fake_location():
        """Return a fake city name."""
        return fake.city()  # Generate a fake city name using Faker

    # Function to generate a unique fake phone number
    def generate_fake_phone_number():
        """Return a phone number of the form 555-NNN-NNNN."""
        return f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

    # Analyze the text to find PII
    results_analyzed = analyzer.analyze(text=text_transcript, language="en", return_decision_process=True, allow_list=allow_list)
    
    # Modify results_analyzed to include Ms., Mrs., and Mr.
    for result in results_analyzed:
        if result.entity_type == "PERSON":
            s = result.start
            
            # Checking if Ms., Mrs., or Mr. comes before the name;
            # if so, move the span start back so the title is part of the entity.
            if (s-4 >= 0) and (text_transcript[s-4:s] == "Ms. " or text_transcript[s-4:s] == "Mr. "):
                result.start = s-4
            elif (s-5 >= 0) and text_transcript[s-5:s] == "Mrs. ":
                result.start = s-5
                
    # Create a mapping of original names to unique fake names
    name_mapping = {}
    existing_names = set()
    for result in results_analyzed:
        if result.entity_type == "PERSON":
            original_name = text_transcript[result.start:result.end]
            first_name = True
            
            # A titled name is replaced by "<title> <last name>"; otherwise by a first name
            if original_name.startswith("Ms. ") or original_name.startswith("Mr. ") or original_name.startswith("Mrs. "):
                first_name = False
               
            if original_name not in name_mapping:
                fake_name = generate_fake_name(existing_names, first_name)
                
                if first_name:
                    name_mapping[original_name] = fake_name
                else:
                    # The replacement title is chosen at random, independent of the original title
                    titles = ["Ms.", "Mr.", "Mrs."]
                    name_mapping[original_name] = random.choice(titles) + " " + fake_name
                
                existing_names.add(fake_name)
    
    # Email mapping to ensure consistent fake emails
    email_mapping = {}
    for result in results_analyzed:
        if result.entity_type == "EMAIL_ADDRESS":
            original_email = text_transcript[result.start:result.end]
            if original_email not in email_mapping:
                # The name drawn here is not added to existing_names
                fake_name = generate_fake_name(existing_names, True)
                fake_email = generate_fake_email(fake_name)
                email_mapping[original_email] = fake_email
    
    # Phone number mapping to ensure consistent fake phone numbers
    phone_mapping = {}
    for result in results_analyzed:
        if result.entity_type == "PHONE_NUMBER":
            original_phone = text_transcript[result.start:result.end]
            if original_phone not in phone_mapping:
                fake_phone = generate_fake_phone_number()
                phone_mapping[original_phone] = fake_phone

    # Per-entity-type operators; the custom lambdas fall back to the original
    # text when it is missing from the mapping.
    operators = {
        "PERSON": OperatorConfig("custom", {"lambda": lambda text : name_mapping.get(text, text)}),
        "DATE_TIME": OperatorConfig("replace", {"new_value": random_date}),
        # Add more categories
        "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda text: email_mapping.get(text, text)}),
        # generate_fake_location() is evaluated once here, so all locations share one city
        "LOCATION": OperatorConfig("replace", {"new_value": generate_fake_location()}),
        "PHONE_NUMBER": OperatorConfig("custom", {"lambda": lambda text: phone_mapping.get(text, text)})
    }

    # Anonymize the text
    results_anonymized = anonymizer.anonymize(
        text=text_transcript,
        analyzer_results=results_analyzed,
        operators=operators
    )

    return results_analyzed, results_anonymized


In [15]:
def insert_space(s, i):
    """Return ``s`` with a single space inserted before index ``i``."""
    head, tail = s[:i], s[i:]
    return f"{head} {tail}"

def remove_possible_url(s):
    """Insert a space between an ellipsis and a letter that directly follows it.

    Text such as ``"wait...what"`` has no space after the dots; this turns it
    into ``"wait... what"``.

    The previous implementation compared a two-character slice
    (``s[i-2:i]``) with the three-character string ``'...'``, which can never
    be equal, so it returned its input unchanged.

    Note: every inserted space shifts the offsets of all later characters, so
    spans computed on the input no longer index the output when a space was
    inserted.

    Args:
        s: Input text.

    Returns:
        The text with a space added after each ``...`` that is immediately
        followed by an alphabetic character.
    """
    pieces = []
    for index, char in enumerate(s):
        # A letter directly preceded by three dots gets a space in front of it.
        if char.isalpha() and index >= 3 and s[index - 3:index] == '...':
            pieces.append(' ')
        pieces.append(char)
    return ''.join(pieces)

In [16]:
# Print text before and after remove_possible_url()
print(replaced_text)
print("-----------------------------------------------------------------------------------------")
# replaced_text2 is the text handed to de_identify_pii in the next cell.
replaced_text2 = remove_possible_url(replaced_text)
print(replaced_text2)

Hi Mr. Thompson, please contact Sophia at springtime123@email.com. My date of birth is January 15, 1992 and I live at 24 Oak Street. 
You can call me at 1-855-555-0199. Emily has an appointment with Ms. Anderson tomorrow.
-----------------------------------------------------------------------------------------
Hi Mr. Thompson, please contact Sophia at springtime123@email.com. My date of birth is January 15, 1992 and I live at 24 Oak Street. 
You can call me at 1-855-555-0199. Emily has an appointment with Ms. Anderson tomorrow.


In [50]:
# Run Presidio on replaced_text2; offsets in results_analyzed index into that string.
results_analyzed, results_anonymized = de_identify_pii(replaced_text2)

In [53]:
# Print results_analyzed and results_anonymized
# results_analyzed is a list: printed once as a whole, then one result per line
print(results_analyzed)
for res in results_analyzed:
    print(res)
print("-----------------------------------------------------------------------------------------")
print(results_anonymized)

[type: EMAIL_ADDRESS, start: 42, end: 65, score: 1.0, type: PERSON, start: 3, end: 15, score: 0.85, type: PERSON, start: 32, end: 38, score: 0.85, type: DATE_TIME, start: 87, end: 103, score: 0.85, type: PERSON, start: 169, end: 174, score: 0.85, type: PERSON, start: 199, end: 211, score: 0.85, type: URL, start: 56, end: 65, score: 0.5, type: PHONE_NUMBER, start: 153, end: 167, score: 0.4, type: IN_PAN, start: 153, end: 163, score: 0.05]
type: EMAIL_ADDRESS, start: 42, end: 65, score: 1.0
type: PERSON, start: 3, end: 15, score: 0.85
type: PERSON, start: 32, end: 38, score: 0.85
type: DATE_TIME, start: 87, end: 103, score: 0.85
type: PERSON, start: 169, end: 174, score: 0.85
type: PERSON, start: 199, end: 211, score: 0.85
type: URL, start: 56, end: 65, score: 0.5
type: PHONE_NUMBER, start: 153, end: 167, score: 0.4
type: IN_PAN, start: 153, end: 163, score: 0.05
-----------------------------------------------------------------------------------------
text: Hi Ms. Brannen, please contact

In [54]:
# Print Presidio-identified PII from results_analyzed.
# Notice email.com is separately detected.
# TODO: How to deal with overlapping identified PII? Should we just remove the smaller one from results_analyzed?

# Current solution: remove overlapping entities only from the extracted positions of results_analyzed but not from results_analyzed itself
# Relevant code (two code chunks below):
# positions_analyzed = extract_positions(results_analyzed)
# positions_analyzed = remove_overlapping_entities(positions_analyzed)

# The offsets in results_analyzed refer to replaced_text2 (the string that was
# passed to de_identify_pii), so slice that string rather than replaced_text.
for res in results_analyzed:
    print(f"PII: {replaced_text2[res.start:res.end]} ---- start: {res.start} ---- end: {res.end} ---- type: {res.entity_type}")

PII: springtime123@email.com ---- start: 42 ---- end: 65 ---- type: EMAIL_ADDRESS
PII: Mr. Thompson ---- start: 3 ---- end: 15 ---- type: PERSON
PII: Sophia ---- start: 32 ---- end: 38 ---- type: PERSON
PII: January 15, 1992 ---- start: 87 ---- end: 103 ---- type: DATE_TIME
PII: Emily ---- start: 169 ---- end: 174 ---- type: PERSON
PII: Ms. Anderson ---- start: 199 ---- end: 211 ---- type: PERSON
PII: email.com ---- start: 56 ---- end: 65 ---- type: URL
PII: 1-855-555-0199 ---- start: 153 ---- end: 167 ---- type: PHONE_NUMBER
PII: 1-855-555- ---- start: 153 ---- end: 163 ---- type: IN_PAN


In [55]:
# Function to remove overlapping entities
def remove_overlapping_entities(entities):
    """Drop overlapping entity spans, keeping the longer one of each overlap.

    Entities are processed by ascending start (longest first for equal
    starts). An entity that overlaps the most recently kept one replaces it
    when it is strictly longer, or when both have the same length and the new
    one is an ``EMAIL_ADDRESS`` while the kept one is a ``URL``; otherwise it
    is discarded.

    The overlap test always uses the end of the entity that is currently kept.
    The earlier version tracked that end in a separate variable that was not
    updated on replacement, so entities overlapping the replacement's tail
    were wrongly kept.

    Args:
        entities: Iterable of ``(start, end, entity_type)`` tuples.

    Returns:
        List of non-overlapping ``(start, end, entity_type)`` tuples sorted by
        start.
    """
    # Sort by start position and length (descending)
    ordered = sorted(entities, key=lambda x: (x[0], -(x[1] - x[0])))
    filtered_entities = []

    for start, end, entity_type in ordered:
        # No overlap with the last kept entity: accept as-is
        if not filtered_entities or start >= filtered_entities[-1][1]:
            filtered_entities.append((start, end, entity_type))
            continue

        kept_start, kept_end, kept_type = filtered_entities[-1]
        new_length = end - start
        kept_length = kept_end - kept_start

        # Check if the overlapping entity is of higher priority
        is_longer = new_length > kept_length
        email_beats_url = (
            new_length == kept_length
            and entity_type == "EMAIL_ADDRESS"
            and kept_type == "URL"
        )
        if is_longer or email_beats_url:
            filtered_entities[-1] = (start, end, entity_type)

    return filtered_entities

# Function to highlight text
def highlight_text(text, locations, is_synthetic=False):
    """Wrap each located span of ``text`` in an ANSI green-background escape.

    This definition replaces the earlier ``highlight_text`` in the notebook
    namespace, so it keeps that function's call signature: the wrappers
    ``highlight_placeholders`` / ``highlight_synthetic_pii`` still call it with
    ``is_synthetic=True`` and 4-tuples, which would otherwise raise after this
    cell has run.

    Args:
        text: The string to highlight.
        locations: Iterable of tuples whose first two items are ``start`` and
            ``end`` offsets into ``text``; extra items (entity type, value)
            are ignored. Spans are processed in ascending ``start`` order
            regardless of input order. Spans are expected not to overlap.
        is_synthetic: Accepted for backward compatibility; has no effect here
            because only the first two tuple items are read.

    Returns:
        ``text`` with every span surrounded by the highlight escape codes.
    """
    pieces = []
    last_end = 0

    for location in sorted(locations, key=lambda loc: loc[0]):
        start, end = location[0], location[1]
        pieces.append(text[last_end:start])
        pieces.append(f'\x1b[6;30;42m{text[start:end]}\x1b[0m')
        last_end = end

    pieces.append(text[last_end:])
    return "".join(pieces)

# Extract positions and types from Presidio results
def extract_positions(results):
    """Return ``(start, end, entity_type)`` for every result, in input order.

    Each item of ``results`` must expose ``start``, ``end`` and
    ``entity_type`` attributes.
    """
    return [(item.start, item.end, item.entity_type) for item in results]


In [56]:
# Extract positions for highlighting
positions_analyzed = extract_positions(results_analyzed)
positions_anonymized = extract_positions(results_anonymized.items)

# The analyzed offsets refer to replaced_text2 (the string passed to
# de_identify_pii), so that is the string sliced below.
print("positions_analyzed (before removing overlaps):")
for pos in positions_analyzed:
    print(f"PII: {replaced_text2[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")

# Remove overlapping entities from the positions
positions_analyzed = remove_overlapping_entities(positions_analyzed)
positions_anonymized = remove_overlapping_entities(positions_anonymized)

print("-----------------------------------------------------------------------------------------")
print("positions_analyzed (after removing overlaps) and sorted:")
for pos in positions_analyzed:
    print(f"PII: {replaced_text2[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")

positions_analyzed (before removing overlaps):
PII: springtime123@email.com ---- start: 42 ---- end: 65 ---- type: EMAIL_ADDRESS
PII: Mr. Thompson ---- start: 3 ---- end: 15 ---- type: PERSON
PII: Sophia ---- start: 32 ---- end: 38 ---- type: PERSON
PII: January 15, 1992 ---- start: 87 ---- end: 103 ---- type: DATE_TIME
PII: Emily ---- start: 169 ---- end: 174 ---- type: PERSON
PII: Ms. Anderson ---- start: 199 ---- end: 211 ---- type: PERSON
PII: email.com ---- start: 56 ---- end: 65 ---- type: URL
PII: 1-855-555-0199 ---- start: 153 ---- end: 167 ---- type: PHONE_NUMBER
PII: 1-855-555- ---- start: 153 ---- end: 163 ---- type: IN_PAN
-----------------------------------------------------------------------------------------
positions_analyzed (after removing overlaps) and sorted:
PII: Mr. Thompson ---- start: 3 ---- end: 15 ---- type: PERSON
PII: Sophia ---- start: 32 ---- end: 38 ---- type: PERSON
PII: springtime123@email.com ---- start: 42 ---- end: 65 ---- type: EMAIL_ADDRESS
PII: Ja

In [57]:
# Highlighting the text
# positions_analyzed holds offsets into replaced_text2 (the analyzed string);
# positions_anonymized holds offsets into results_anonymized.text.
highlighted_analyzed_text = highlight_text(replaced_text2, positions_analyzed)
highlighted_anonymized_text = highlight_text(results_anonymized.text, positions_anonymized)

print("Highlighted True Text:")
print(highlighted_text)
print("-----------------------------------------------------------------------------------------")
print("Highlighted Presidio Identified PIIs:")
print(highlighted_analyzed_text)
print("-----------------------------------------------------------------------------------------")
print("Highlighted Anonymized Text (Not a focus for RQ1):")
print(highlighted_anonymized_text)

Highlighted True Text:
Hi [6;30;42mMr. Thompson[0m, please contact [6;30;42mSophia[0m at [6;30;42mspringtime123@email.com[0m. My date of birth is [6;30;42mJanuary 15, 1992[0m and I live at [6;30;42m24 Oak Street[0m. 
You can call me at [6;30;42m1-855-555-0199[0m. [6;30;42mEmily[0m has an appointment with [6;30;42mMs. Anderson[0m tomorrow.
-----------------------------------------------------------------------------------------
Highlighted Presidio Identified PIIs:
Hi [6;30;42mMr. Thompson[0m, please contact [6;30;42mSophia[0m at [6;30;42mspringtime123@email.com[0m. My date of birth is [6;30;42mJanuary 15, 1992[0m and I live at 24 Oak Street. 
You can call me at [6;30;42m1-855-555-0199[0m. [6;30;42mEmily[0m has an appointment with [6;30;42mMs. Anderson[0m tomorrow.
-----------------------------------------------------------------------------------------
Highlighted Anonymized Text (Not a focus for RQ1):
Hi [6;30;42mMs. Brannen[0m, please contact [6;30;42

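From here, we can already see that Presidio does not detect the street address in this example (`29 Acacia Road` when this note was written; `24 Oak Street` in the output shown above, since the synthetic values change from run to run). This should be closely examined, because Presidio is supposed to be capable of identifying such addresses.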

We can also see that Presidio only detects 'Rodriguez' instead of 'Ms. Rodriguez'. Is "Ms." considered as PII? Does it leak the gender? Should we say that Presidio successfully detects this PII? Same thing for 'Jacobs' and 'Mr. Jacobs'. Notice this is the concern in Step 8.

### Start from here:
1. (Completed) Run Presidio (the default model) on toy example + one example transcript
2. (Completed) Get the output start and end indices of Presidio-identified PII (from results_analyzed)
3. (Completed) Highlight them as long as the original replaced_text (see above code chunks for highlighting and checking)
4. (Completed) Upload the updated code to GitHub main branch by **Monday night (EST)** and send a message in our group Slack channel.
5. (Almost Completed) Compare the two start and end indices from original result and presidio-identified result by printing them out and eyeballing them.
6. (Completed) By **Tuesday Night (EST)**: Calculate the TP, FP, FN, TN, and thus Recall (R), Precision (P), and F1 Score using **exact index matching** (both start and end indices have to be exactly the same in order to be considered as True Positive). - **Aim to see these results at our next internal group meeting on Wednesday**.
7. Run more TSCC transcripts and check whether GPT-3.5 generates some weird outputs for \<Categories\> by checking *value* of *synthetic_info* (the fourth item of *synthetic_info*: *synthetic_info[3]*).
8. Examine more closely the start and end indices from both results - Consider whether this scenario happens: True PII: "Sam Altman", Presidio-detected: "Sam" and "Altman" separately, or only "Sam", or only "Altman".
9. Think about how to deal with this situation if it happens and think about whether tokenization is necessary. (Can imply double check mechanism: Combine “\<NAME\> \<NAME\>”, notice there’s only one space in between)
10. Calculate the TP, FP, FN, TN, and thus Recall (R), Precision (P), and F1 Score again.
11. Connect Presidio with other Transformers and make sure code runs successfully.

### Task for 7.3 ~ 7.10
- Look at why address is not detected by Presidio (Step 5, eyeballing issues) - OL
- Mr. and Ms. issue; Explore on tokenization (inside or outside Presidio) (Step **8 & 9** & 10) - AL
- Try connecting Presidio with other Transformers (Step 11) - YW, JS
- PII insertion on TSCC dataset - Everyone if you have time, not top priority for this week (will be important for next week).

### Future Plans:
1. 6.28 ~ 7.5: Steps 1 to 6 + PII insertion
2. 7.6 ~ 7.12: Steps 7 ~ 9 + PII insertion + Different LLMs
3. 7.13 ~ 7.19: Run on all examples + Different LLMs
4. 7.19 ~ 7.26: Different LLMs and get the results from TSCC & MOOC (through API) datasets
5. **By 8.2: Finish RQ1**

In [58]:
# Step 6 Completed
from collections import namedtuple

# Lightweight, hashable record for a PII span: only the start and end offsets.
PIIInfo = namedtuple('PIIInfo', 'start end')

def calculate_metrics(synthetic_positions, analyzed_positions):
    """Count exact-match true positives, false positives and false negatives.

    Both inputs are converted to sets, so duplicate spans count once and a
    span is a match only if it is equal in both collections. One line is
    printed per span, labelled ``TP``, ``FP`` or ``FN``.

    Args:
        synthetic_positions: Ground-truth spans (hashable).
        analyzed_positions: Detected spans (hashable).

    Returns:
        ``(tp, fp, fn)`` as integers.
    """
    truth = set(synthetic_positions)
    detected = set(analyzed_positions)

    tally = {"TP": 0, "FP": 0}
    for span in detected:
        # Detected and in the ground truth -> TP; detected but not true -> FP
        label = "TP" if span in truth else "FP"
        tally[label] += 1
        print(f"{label}: {span}")

    # Ground-truth spans that were never detected
    missed = [span for span in truth if span not in detected]
    for span in missed:
        print(f"FN: {span}")

    # TN is not usually calculated in NER tasks, as it would require a clear definition of all non-PII text regions,
    # which can be complex. But if needed, it would be the length of the text minus all TP, FP, and FN regions.

    return tally["TP"], tally["FP"], len(missed)

def compute_precision_recall_f1(tp, fp, fn):
    """Return ``(precision, recall, f1)`` from TP/FP/FN counts.

    Any ratio whose denominator is not positive is reported as ``0``.
    """
    predicted_total = tp + fp
    actual_total = tp + fn

    precision = 0
    if predicted_total > 0:
        precision = tp / predicted_total

    recall = 0
    if actual_total > 0:
        recall = tp / actual_total

    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return precision, recall, f1

def extract_synthetic_positions(results):
    """Convert ``(start, end, category, value)`` tuples into ``PIIInfo(start, end)``."""
    positions = []
    for start, end, _category, _value in results:
        positions.append(PIIInfo(start, end))
    return positions

# Extract positions and types from Presidio results
def extract_only_positions(results):
    """Convert ``(start, end, entity_type)`` tuples into ``PIIInfo(start, end)``."""
    positions = []
    for start, end, _entity_type in results:
        positions.append(PIIInfo(start, end))
    return positions

#positions_analyzed = extract_positions(results_analyzed)
# NOTE(review): synthetic_info offsets index into replaced_text, while positions_analyzed
# offsets index into replaced_text2; exact index matching assumes the two strings are identical.
analyzed_positions = extract_only_positions(positions_analyzed)
synthetic_positions = extract_synthetic_positions(synthetic_info)
# Calculate TP, FP, FN (TN is not computed)
tp, fp, fn = calculate_metrics(synthetic_positions, analyzed_positions)

# Compute Precision, Recall, and F1 Score
precision, recall, f1 = compute_precision_recall_f1(tp, fp, fn)

print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Raw inputs of the comparison, for eyeballing against the TP/FP/FN lines above
print(f"synthetic_info: {synthetic_info}\n")
print(positions_analyzed)

TP: PIIInfo(start=87, end=103)
TP: PIIInfo(start=153, end=167)
TP: PIIInfo(start=169, end=174)
TP: PIIInfo(start=199, end=211)
TP: PIIInfo(start=32, end=38)
TP: PIIInfo(start=3, end=15)
TP: PIIInfo(start=42, end=65)
FN: PIIInfo(start=118, end=131)
True Positives: 7
False Positives: 0
False Negatives: 1
Precision: 1.0
Recall: 0.875
F1 Score: 0.9333333333333333
synthetic_info: [(3, 15, 'TEACHER', 'Mr. Thompson'), (32, 38, 'STUDENT', 'Sophia'), (42, 65, 'EMAIL ADDRESS', 'springtime123@email.com'), (87, 103, 'DOB', 'January 15, 1992'), (118, 131, 'ADDRESS', '24 Oak Street'), (153, 167, 'TELEPHONE', '1-855-555-0199'), (169, 174, 'STUDENT', 'Emily'), (199, 211, 'TEACHER', 'Ms. Anderson')]

[(3, 15, 'PERSON'), (32, 38, 'PERSON'), (42, 65, 'EMAIL_ADDRESS'), (87, 103, 'DATE_TIME'), (153, 167, 'PHONE_NUMBER'), (169, 174, 'PERSON'), (199, 211, 'PERSON')]
