In [7]:
import json
import os
import re

import openai
import pandas as pd

In [None]:
# %pip install --upgrade openai

In [56]:
# Read the OpenAI API key from the environment instead of embedding it in the
# notebook, so the secret is never saved or committed together with the file.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
openai.api_key = _api_key

# Function to generate prompts dynamically
def get_prompt(category):
    """Build the instruction sent to the model for one placeholder category.

    The category is lower-cased before use. If it contains any of the name-like
    keywords below, the prompt asks for "one name"; otherwise it asks for
    "one result". Both prompts tell the model to return only the bare value.
    """
    lowered = category.lower()
    name_markers = (
        "name", "teacher", "student", "friend", "husband", "child", "son",
        "daughter", "cousin", "sister", "brother", "instructor", "colleague",
        "partner", "employee", "researcher", "school", "account", "resort",
    )
    wants_name = any(marker in lowered for marker in name_markers)

    if not wants_name:
        # Default prompt for non-name categories
        return f"Generate a {lowered}. Just give me one result with no additional output. Do not output anything like a chatbot, I only need the result without any other unnecessary outputs. Do NOT end your output with a punctuation mark."

    return f"Generate a {lowered}. Just give me one name with no additional output. Do not output anything like a chatbot, I only need the result without any other unnecessary outputs. Do NOT end your output with a punctuation mark."

# Function to generate synthetic data
def generate_synthetic_data(category):
    """Ask the chat model for one synthetic value for ``category``.

    The prompt comes from ``get_prompt``; the request goes to ``gpt-3.5-turbo``
    with a fixed system message. Returns the content of the first choice with
    surrounding whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": get_prompt(category)},
    ]
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Function to replace placeholders and record locations
def replace_placeholders(text):
    """Replace every ``<CATEGORY>`` placeholder in ``text`` with a synthetic value.

    A new value is requested from ``generate_synthetic_data`` for each
    occurrence, in left-to-right order.

    Returns a 3-tuple:
        * the text with all placeholders replaced,
        * ``(start, end, category)`` for each placeholder, indexed into the
          ORIGINAL text,
        * ``(start, end, category, value)`` for each synthetic value, indexed
          into the REPLACED text.
    """
    placeholder_re = re.compile(r'<([^>]+)>')
    placeholder_locations = []
    synthetic_data = []

    pieces = []      # alternating literal text / synthetic values
    cursor = 0       # read position in the original text
    written = 0      # number of characters emitted into the output so far

    for found in placeholder_re.finditer(text):
        category = found.group(1)
        placeholder_locations.append((found.start(), found.end(), category))
        value = generate_synthetic_data(category)

        # Copy the untouched text that precedes this placeholder.
        literal = text[cursor:found.start()]
        pieces.append(literal)
        written += len(literal)

        # The synthetic value starts exactly where the output currently ends.
        synthetic_data.append((written, written + len(value), category, value))
        pieces.append(value)
        written += len(value)
        cursor = found.end()

    pieces.append(text[cursor:])
    return "".join(pieces), placeholder_locations, synthetic_data

# Function to highlight PII instances in green
def highlight_text(text, locations, is_synthetic=False):
    """Wrap each located span of ``text`` in the green ANSI highlight sequence.

    ``locations`` holds ``(start, end, category)`` tuples, or
    ``(start, end, category, value)`` tuples when ``is_synthetic`` is true.
    Spans are processed in the order given; text between spans is copied
    unchanged.
    """
    pieces = []
    cursor = 0

    for entry in locations:
        # Unpack by arity so a tuple of the wrong shape raises ValueError.
        if is_synthetic:
            begin, finish, _category, _value = entry
        else:
            begin, finish, _category = entry
        pieces.append(text[cursor:begin])
        pieces.append(f'\x1b[6;30;42m{text[begin:finish]}\x1b[0m')
        cursor = finish

    pieces.append(text[cursor:])
    return "".join(pieces)

# Function to highlight original placeholders
def highlight_placeholders(text, placeholder_locations):
    """Highlight the original placeholder spans of ``text``.

    Thin wrapper over ``highlight_text`` for ``(start, end, category)`` tuples.
    """
    highlighted = highlight_text(text, placeholder_locations, is_synthetic=False)
    return highlighted

# Function to highlight synthetic PII
def highlight_synthetic_pii(text, synthetic_data):
    """Highlight the synthetic PII spans of ``text``.

    Thin wrapper over ``highlight_text`` for
    ``(start, end, category, value)`` tuples.
    """
    highlighted = highlight_text(text, synthetic_data, is_synthetic=True)
    return highlighted

# Example text with placeholders; <TEACHER> and <STUDENT> each appear twice
text = """Hi <TEACHER>, please contact <STUDENT> at <EMAIL ADDRESS>. My date of birth is <DOB> and I live at <ADDRESS>. 
You can call me at <TELEPHONE>. <STUDENT> has an appointment with <TEACHER> tomorrow."""

# Example usage: replace_placeholders requests a new synthetic value for every
# placeholder occurrence, so this line issues one chat-completion call per placeholder.
# text = "This is a great effort, <NAME>. <NAME>"
replaced_text, placeholder_locations, synthetic_info = replace_placeholders(text)
# Highlight the placeholders in the original text and the synthetic values in the replaced text
highlighted_original = highlight_placeholders(text, placeholder_locations)
highlighted_text = highlight_synthetic_pii(replaced_text, synthetic_info)

In [57]:
# Print the raw position tuples first, then the highlighted original text and the
# highlighted replaced text, separated by a rule so the two can be compared by eye
print(f"placeholder_locations: {placeholder_locations}")
print(f"synthetic_info: {synthetic_info}\n")
print(highlighted_original)
print("---------------------------------------------------------------------------------")
print(highlighted_text)

placeholder_locations: [(3, 12, 'TEACHER'), (29, 38, 'STUDENT'), (42, 57, 'EMAIL ADDRESS'), (79, 84, 'DOB'), (99, 108, 'ADDRESS'), (130, 141, 'TELEPHONE'), (143, 152, 'STUDENT'), (177, 186, 'TEACHER')]
synthetic_info: [(3, 14, 'TEACHER', 'Mr. Johnson'), (31, 35, 'STUDENT', 'Liam'), (39, 64, 'EMAIL ADDRESS', 'jessica.smith23@email.com'), (86, 98, 'DOB', 'June 12 1989'), (113, 128, 'ADDRESS', '72 Maple Street'), (150, 162, 'TELEPHONE', '555-867-5309'), (164, 169, 'STUDENT', 'Peter'), (194, 207, 'TEACHER', 'Miss Anderson')]

Hi [6;30;42m<TEACHER>[0m, please contact [6;30;42m<STUDENT>[0m at [6;30;42m<EMAIL ADDRESS>[0m. My date of birth is [6;30;42m<DOB>[0m and I live at [6;30;42m<ADDRESS>[0m. 
You can call me at [6;30;42m<TELEPHONE>[0m. [6;30;42m<STUDENT>[0m has an appointment with [6;30;42m<TEACHER>[0m tomorrow.
---------------------------------------------------------------------------------
Hi [6;30;42mMr. Johnson[0m, please contact [6;30;42mLiam[0m at [6;30;42mjess

In [34]:
def check_PII_position(synthetic_info, text=None):
    """Print whether each recorded span of the replaced text equals its synthetic value.

    Args:
        synthetic_info: iterable of ``(start, end, category, value)`` tuples,
            in the form returned by ``replace_placeholders``.
        text: the replaced text that the offsets index into. When omitted, the
            notebook-level ``replaced_text`` variable is used, which keeps
            existing one-argument calls working. Passing it explicitly avoids
            checking against a stale ``replaced_text`` left in the kernel.

    Prints one line per item in the form ``<matches>: <slice> ---- <value>``.
    """
    if text is None:
        text = replaced_text
    for start, end, _category, value in synthetic_info:
        found = text[start:end]
        print(f"{found == value}: {found} ---- {value}")
        
# Sanity check: every recorded (start, end) span in replaced_text should hold its synthetic value
check_PII_position(synthetic_info)

True: Ms. Taylor ---- Ms. Taylor
True: Hannah ---- Hannah
True: bluewave_23@email.com ---- bluewave_23@email.com
True: February 8, 1993 ---- February 8, 1993
True: 42 Willow Street ---- 42 Willow Street
True: (555) 555-5555 ---- (555) 555-5555
True: Alison ---- Alison
True: Ms. Johnson ---- Ms. Johnson


### Another example: teacherstudentchat00006.tsv

In [59]:
# # Load the processed_transcripts here
# with open('original_transcripts.txt', 'r') as f:
#     original_transcripts = json.load(f)

# # Verify the content
# print(original_transcripts[4])

teacher: Hi <STUDENT>, is that you?!
student: Hi <TEACHER>
student: I can see more people here, is it right?
teacher: Hi there, how's it going?
teacher: No it's only us I think..
student: I'm very well, thank you. How about you?
teacher: Yeah fine thanks...not so busy at the moment to be honest. I finished the teacher training course stuff
student: That's good!!
teacher: Yes, I've got to buy a car, which is fine but a bit stressful you know
teacher: I'm trying to decide between the crap tight-fisted option or a newer more xpensive car
student: Ugh, I can imagine. I think it takes time, looking for the one you like and trying different ones... (do you say 'trying' for cars?)
teacher: You say 'try out' for a car - a good phrasal verb!
student: Thank you!
teacher: But you can just say try of course
teacher: It just makes it slightly less formal
teacher: to use the phrasal verb i mean
student: I see
teacher: But very very common in speech of course
teacher: So I tried a few - 'had a test d

In [6]:
# text = original_transcripts[4]

# replaced_text, placeholder_locations, synthetic_info = replace_placeholders(text)
# highlighted_original = highlight_placeholders(text, placeholder_locations)
# highlighted_text = highlight_synthetic_pii(replaced_text, synthetic_info)

NameError: name 'original_transcripts' is not defined

In [66]:
# # Print placeholder_locations, synthetic_info, highlighted original text, highlighted replaced text
# print(f"placeholder_locations: {placeholder_locations}")
# print(f"synthetic_info: {synthetic_info}\n")
# print(highlighted_original)
# print("---------------------------------------------------------------------------------")
# print(highlighted_text)

placeholder_locations: [(12, 21, 'STUDENT'), (49, 58, 'TEACHER'), (2247, 2256, 'STUDENT'), (3664, 3673, 'STUDENT'), (4364, 4373, 'STUDENT'), (6821, 6830, 'STUDENT'), (7857, 7866, 'STUDENT'), (9949, 9958, 'STUDENT')]
synthetic_info: [(12, 17, 'STUDENT', 'Emily'), (45, 58, 'TEACHER', 'Mrs. Thompson'), (2247, 2251, 'STUDENT', 'Emma'), (3659, 3665, 'STUDENT', 'Sophia'), (4356, 4366, 'STUDENT', 'Jacqueline'), (6814, 6819, 'STUDENT', 'Elena'), (7846, 7850, 'STUDENT', 'Liam'), (9933, 9938, 'STUDENT', 'Emily')]

teacher: Hi [6;30;42m<STUDENT>[0m, is that you?!
student: Hi [6;30;42m<TEACHER>[0m
student: I can see more people here, is it right?
teacher: Hi there, how's it going?
teacher: No it's only us I think..
student: I'm very well, thank you. How about you?
teacher: Yeah fine thanks...not so busy at the moment to be honest. I finished the teacher training course stuff
student: That's good!!
teacher: Yes, I've got to buy a car, which is fine but a bit stressful you know
teacher: I'm tryi

In [67]:
# def check_PII_position(synthetic_info):
#     for item in synthetic_info:
#         start, end, category, value = item
#         print(f"{replaced_text[start:end] == value}: {replaced_text[start:end]} ---- {value}")
        
# check_PII_position(synthetic_info)

True: Emily ---- Emily
True: Mrs. Thompson ---- Mrs. Thompson
True: Emma ---- Emma
True: Sophia ---- Sophia
True: Jacqueline ---- Jacqueline
True: Elena ---- Elena
True: Liam ---- Liam
True: Emily ---- Emily


In [19]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy
import json
import re
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Create configuration containing engine name and models
def get_configuration(spaCy_model: str):
    """Return the spaCy-only NLP engine configuration dictionary.

    Spanish is always mapped to ``es_core_news_md``; English is mapped to
    ``spaCy_model``.
    """
    spanish_model = {"lang_code": "es", "model_name": "es_core_news_md"}
    english_model = {"lang_code": "en", "model_name": spaCy_model}
    return {
        "nlp_engine_name": "spacy",
        "models": [spanish_model, english_model],
    }

def get_conf_file(spaCy_model: str, transformer_model: str = None):
    """Return the YAML config path for a spaCy + transformer pairing.

    The pairing is resolved before anything is downloaded, so an unsupported
    combination fails immediately instead of after fetching the model.

    Args:
        spaCy_model: name of the spaCy model, e.g. ``"en_core_web_lg"``.
        transformer_model: Hugging Face repo id of the transformer model.
            Required despite the ``None`` default in the signature.

    Returns:
        The relative path (under ``Config/``) of the matching YAML file.

    Raises:
        ValueError: if ``transformer_model`` is ``None``.
        KeyError: if no config file exists for the requested pairing.
    """
    if transformer_model is None:
        raise ValueError("A transformer model is required to select a config file.")

    config_dict = {
        "en_core_web_lg + obi/deid_roberta_i2b2": "Config/lg+roberta.yaml",
        "en_core_web_lg + StanfordAIMI/stanford-deidentifier-base": "Config/lg+stanford.yaml",
        "en_core_web_trf + obi/deid_roberta_i2b2": "Config/trf+roberta.yaml",
        "en_core_web_trf + StanfordAIMI/stanford-deidentifier-base": "Config/trf+stanford.yaml",
    }

    # Look the pairing up first: a KeyError here costs nothing, whereas the
    # download below can be large.
    conf_file = config_dict[spaCy_model + ' + ' + transformer_model]

    snapshot_download(repo_id=transformer_model)
    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformer_model)
    AutoModelForTokenClassification.from_pretrained(transformer_model)

    return conf_file

# Function to create NLP engine based on configuration
def create_nlp_engine(spaCy_model: str, transformer_model: str = None):
    """Create an NLP engine for a spaCy model, optionally paired with a transformer.

    Args:
        spaCy_model: ``"en_core_web_lg"`` or ``"en_core_web_trf"``.
        transformer_model: ``None`` for a spaCy-only engine, otherwise
            ``"obi/deid_roberta_i2b2"`` or
            ``"StanfordAIMI/stanford-deidentifier-base"``.

    Returns:
        The engine produced by ``NlpEngineProvider.create_engine()``.

    Raises:
        ValueError: if either model name is not supported; the message names
            the rejected value.
    """
    supported_spacy_models = ("en_core_web_lg", "en_core_web_trf")
    supported_transformer_models = (
        "obi/deid_roberta_i2b2",
        "StanfordAIMI/stanford-deidentifier-base",
    )

    if spaCy_model not in supported_spacy_models:
        raise ValueError(f"Input spaCy model is not supported. Got: {spaCy_model!r}")
    if transformer_model is not None and transformer_model not in supported_transformer_models:
        # Report the rejected name in the exception itself rather than printing it.
        raise ValueError(f"Input transformer model is not supported. Got: {transformer_model!r}")

    if transformer_model is None:
        # spaCy model only
        provider = NlpEngineProvider(nlp_configuration=get_configuration(spaCy_model))
    else:
        # spaCy model with transformer
        provider = NlpEngineProvider(conf_file=get_conf_file(spaCy_model, transformer_model))

    return provider.create_engine()

# Using only spaCy model (no transformer_model is passed, so create_nlp_engine
# builds the engine from the get_configuration dictionary)
nlp_engine_spacy_only = create_nlp_engine(spaCy_model = "en_core_web_lg")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine.
# "es" is listed because the spaCy-only configuration also registers a Spanish model.
analyzer = AnalyzerEngine(
    nlp_engine = nlp_engine_spacy_only, # nlp_engine_spacy_only or nlp_engine_with_transformer
    supported_languages=["en", "es"]
)

In [64]:
# Change tutor's and student's names to different fake names.
# !pip install faker
from faker import Faker

# Words passed as allow_list to analyzer.analyze so they are not identified as PII:
# each relative-day word in capitalized and lower-case form.
allow_list = [
    variant
    for word in ("today", "yesterday", "tomorrow")
    for variant in (word.capitalize(), word)
]
    
def de_identify_pii(text_transcript, analyzer=None, anonymizer=None):
    """Replace the PII that Presidio detects in ``text_transcript`` with fake values.

    Replacement rules per entity type:
        * ``PERSON``: each distinct detected name maps to its own fake first name.
        * ``EMAIL_ADDRESS``: each distinct address maps to its own fake address.
        * ``PHONE_NUMBER``: each distinct number maps to a ``555-XXX-XXXX`` number.
        * ``DATE_TIME``: every occurrence is replaced by one random date
          (``%m/%d/%Y``) drawn from the range that starts on 1/1/2008.
        * ``LOCATION``: every occurrence is replaced by one fake city name.

    Args:
        text_transcript: the text to de-identify.
        analyzer: optional, already-configured ``AnalyzerEngine`` to reuse.
            A default ``AnalyzerEngine()`` is created when omitted.
        anonymizer: optional ``AnonymizerEngine`` to reuse. A default
            ``AnonymizerEngine()`` is created when omitted.

    Returns:
        ``(result, results_english)``: the value returned by
        ``anonymizer.anonymize`` and the list returned by ``analyzer.analyze``.
    """
    # Let callers reuse already-configured engines (for example the
    # notebook-level `analyzer`) instead of building new ones on every call.
    if analyzer is None:
        analyzer = AnalyzerEngine()
    if anonymizer is None:
        anonymizer = AnonymizerEngine()

    # Define date range for generating random dates and generate a random date
    d1 = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
    d2 = datetime.strptime('1/1/2009 4:50 AM', '%m/%d/%Y %I:%M %p')
    random_date = (d1 + timedelta(days=random.randint(0, (d2 - d1).days))).strftime('%m/%d/%Y')

    fake = Faker()

    def generate_fake_name(existing_names):
        """Return a fake first name that is not already in ``existing_names``."""
        while True:
            fake_name = names.get_first_name()
            if fake_name not in existing_names:
                return fake_name

    def generate_fake_email(fake_name):
        """Build an email address from ``fake_name`` and a randomly chosen domain."""
        domains = ["gmail.com", "sina.com", "outlook.com"]
        return f"{fake_name.lower()}@{random.choice(domains)}"

    def generate_fake_location():
        """Return a fake city name generated by Faker."""
        return fake.city()

    def generate_fake_phone_number():
        """Return a random phone number of the form ``555-XXX-XXXX``."""
        return f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

    # Analyze the text to find PII
    results_english = analyzer.analyze(text=text_transcript, language="en", return_decision_process=True, allow_list=allow_list)
    # <PERSON>, <DATE_TIME>, ...

    # Create a mapping of original names to unique fake names
    name_mapping = {}
    existing_names = set()
    for result in results_english:
        if result.entity_type == "PERSON":
            original_name = text_transcript[result.start:result.end]
            if original_name not in name_mapping:
                fake_name = generate_fake_name(existing_names)
                name_mapping[original_name] = fake_name
                existing_names.add(fake_name)

    # Email mapping to ensure consistent fake emails
    email_mapping = {}
    for result in results_english:
        if result.entity_type == "EMAIL_ADDRESS":
            original_email = text_transcript[result.start:result.end]
            if original_email not in email_mapping:
                fake_name = generate_fake_name(existing_names)
                # Record the name so two different original addresses can never
                # be mapped to the same fake address.
                existing_names.add(fake_name)
                email_mapping[original_email] = generate_fake_email(fake_name)

    # Phone number mapping to ensure consistent fake phone numbers
    phone_mapping = {}
    for result in results_english:
        if result.entity_type == "PHONE_NUMBER":
            original_phone = text_transcript[result.start:result.end]
            if original_phone not in phone_mapping:
                phone_mapping[original_phone] = generate_fake_phone_number()

    # The custom operators fall back to the original text when no mapping exists.
    operators = {
        "PERSON": OperatorConfig("custom", {"lambda": lambda text: name_mapping.get(text, text)}),
        "DATE_TIME": OperatorConfig("replace", {"new_value": random_date}),
        # Add more categories
        "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda text: email_mapping.get(text, text)}),
        "LOCATION": OperatorConfig("replace", {"new_value": generate_fake_location()}),
        "PHONE_NUMBER": OperatorConfig("custom", {"lambda": lambda text: phone_mapping.get(text, text)})
    }

    # Anonymize the text
    result = anonymizer.anonymize(
        text=text_transcript,
        analyzer_results=results_english,
        operators=operators
    )

    return result, results_english


In [66]:
def highlight_text_presidio(text, entities):
    """Return ``text`` with each entity span bracketed and highlighted in green.

    ``entities`` is an iterable of dicts with ``'start'`` and ``'end'`` keys.
    Spans are handled in ascending ``start`` order, and a span that begins
    before the previous highlighted span ended is skipped. Each kept span is
    emitted as ``[span]`` wrapped in the ANSI highlight sequence.
    """
    pieces = []
    cursor = 0

    for span in sorted(entities, key=lambda item: item['start']):
        begin, finish = span['start'], span['end']
        if begin < cursor:
            # Overlaps the previously highlighted span: leave it out.
            continue
        pieces.append(text[cursor:begin])
        pieces.append('\x1b[6;30;42m' + f"[{text[begin:finish]}]" + '\x1b[0m')
        cursor = finish

    # Whatever follows the last highlighted span is copied unchanged.
    pieces.append(text[cursor:])
    return "".join(pieces)

def highlight_preserving_format(text, entities):
    """Highlight ``entities`` in ``text``, keeping the original line breaks.

    Delegates to ``highlight_text_presidio``. Splitting the highlighted string
    on ``'\\n'`` and joining it again with ``'\\n'`` returns the same string,
    so no extra step is needed to keep the line structure.
    """
    return highlight_text_presidio(text, entities)

def highlight_orig_text(text_transcript, results_english):
    """Highlight the analyzer-detected spans in the original transcript.

    Each item of ``results_english`` is reduced to a ``type``/``start``/``end``
    dict (read from its ``entity_type``, ``start`` and ``end`` attributes), the
    dicts are ordered by ``start``, and the highlighted string is returned.
    """
    spans = sorted(
        (
            {'type': hit.entity_type, 'start': hit.start, 'end': hit.end}
            for hit in results_english
        ),
        key=lambda span: span["start"],
    )
    return highlight_preserving_format(text_transcript, spans)

def hightlight_replaced_text(result):
    """Re-analyze the anonymized text and return it with detected spans highlighted.

    ``result.text`` is analyzed with the notebook-level ``analyzer`` and
    ``allow_list``. Detected spans are highlighted, and afterwards any
    remaining ``<...>`` placeholder is wrapped in a green ANSI sequence as well.
    """
    anonymized_text = result.text
    detections = analyzer.analyze(
        text=anonymized_text,
        language="en",
        return_decision_process=True,
        allow_list=allow_list,
    )

    spans = sorted(
        (
            {'type': hit.entity_type, 'start': hit.start, 'end': hit.end}
            for hit in detections
        ),
        key=lambda span: span["start"],
    )

    marked = highlight_preserving_format(anonymized_text, spans)
    # Colour any angle-bracket placeholder left in the text.
    return re.sub(r'(<[^>]+>)', r'\033[42;30m\1\033[0m', marked)

def insert_space(s, i):
    """Return ``s`` with a single space inserted before index ``i``."""
    head, tail = s[:i], s[i:]
    return head + ' ' + tail

def remove_possible_url(s):
    """Insert a space after every '.' that is immediately followed by a letter.

    This breaks up dotted tokens such as ``example.com`` into ``example. com``.
    A dot followed by anything other than a letter is left untouched.
    """
    gap_positions = [
        idx + 1
        for idx in range(len(s) - 1)
        if s[idx] == '.' and s[idx + 1].isalpha()
    ]

    # Walk right-to-left so earlier insertions do not shift the later indices.
    for position in gap_positions[::-1]:
        s = insert_space(s, position)

    return s

In [61]:
# Show the plain (un-highlighted) text in which the placeholders were replaced by synthetic values
print(replaced_text)
print("---------------------------------------------------------------------------------")

Hi Mr. Johnson, please contact Liam at jessica.smith23@email.com. My date of birth is June 12 1989 and I live at 72 Maple Street. 
You can call me at 555-867-5309. Peter has an appointment with Miss Anderson tomorrow.
---------------------------------------------------------------------------------


In [62]:
# replaced_text2 = remove_possible_url(replaced_text)
# Run Presidio de-identification on the synthetic text, then re-analyze the
# anonymized output and print it with the detected spans highlighted
result, results_english = de_identify_pii(replaced_text)
highlighted_replaced = hightlight_replaced_text(result)
print(highlighted_replaced)

text: Hi Mr. Sheila, please contact Joseph at Kelly. Jacqueline. com. My date of birth is 07/31/2008 and I live at 72 Maple Street. 
You can call me at 555-553-2280. Tammy has an appointment with Miss Alice tomorrow.
items:
[
    {'start': 195, 'end': 200, 'entity_type': 'PERSON', 'text': 'Alice', 'operator': 'custom'},
    {'start': 160, 'end': 165, 'entity_type': 'PERSON', 'text': 'Tammy', 'operator': 'custom'},
    {'start': 146, 'end': 158, 'entity_type': 'PHONE_NUMBER', 'text': '555-553-2280', 'operator': 'custom'},
    {'start': 84, 'end': 94, 'entity_type': 'DATE_TIME', 'text': '07/31/2008', 'operator': 'replace'},
    {'start': 47, 'end': 57, 'entity_type': 'PERSON', 'text': 'Jacqueline', 'operator': 'custom'},
    {'start': 40, 'end': 45, 'entity_type': 'PERSON', 'text': 'Kelly', 'operator': 'custom'},
    {'start': 30, 'end': 36, 'entity_type': 'PERSON', 'text': 'Joseph', 'operator': 'custom'},
    {'start': 7, 'end': 13, 'entity_type': 'PERSON', 'text': 'Sheila', 'operator':

### Start from here:
1. Run Presidio on toy example + one example transcript
2. Get the output start and end indices of Presidio-identified PII (from `results_english`)
3. Highlight them alongside the original replaced_text (see the code chunks above for highlighting and checking)
4. Upload the updated code to GitHub main branch by Tuesday night and notify everyone in the group on Slack.
5. Compare the two start and end indices from original result and presidio-identified result by printing them out and eyeballing them.
6. Calculate the TP, FP, FN, TN, and thus Recall (R), Precision (P), and F1 Score using exact index matching (both start and end indices have to be exactly the same in order to be considered as True Positive).
7. Examine more closely the start and end indices from both results - Consider whether this scenario happens: True PII: "Sam Altman", Presidio-detected: "Sam" and "Altman" separately, or only "Sam", or only "Altman".
8. Think about how to deal with this situation if it happens and think about whether tokenization is necessary.
9. Calculate the TP, FP, FN, TN, and thus Recall (R), Precision (P), and F1 Score again.