In [1]:
# !pip install presidio_analyzer
# !pip install presidio-anonymizer
# !pip install names
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy

# NLP engine configuration: use spaCy with one model per language code
# (Spanish -> es_core_news_md, English -> en_core_web_lg). Both models must
# already be installed in the kernel's environment.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": "en_core_web_lg"}],
}

# Build the NLP engine described by `configuration`.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine_with_spanish = provider.create_engine()

# Create the analyzer on top of that engine, declaring both languages as supported.
# NOTE(review): a later cell rebinds `analyzer` to `AnalyzerEngine()` with no
# arguments, so this bilingual instance is only used by the cells in between.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine_with_spanish, 
    supported_languages=["en", "es"]
)



In [2]:


def random_date(start, end):
    """
    Return a random datetime in the half-open interval [start, end).

    The offset added to ``start`` is drawn in whole seconds, so any
    sub-second part of ``end - start`` is ignored.

    Args:
        start: lower bound (datetime), inclusive.
        end: upper bound (datetime), exclusive.

    Returns:
        A datetime ``start + k seconds``. When the two bounds are less than
        one second apart, ``start`` itself is returned.

    Raises:
        ValueError: if ``end`` is earlier than ``start`` (raised by
            ``randrange`` for a negative range).
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta == 0:
        # randrange(0) raises ValueError ("empty range"); with no whole second
        # between the bounds the only possible value is `start`.
        return start
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

In [3]:


# Bounds of the sampling window: 1 Jan 2008 13:30 up to 1 Jan 2009 04:50.
d1 = datetime(2008, 1, 1, 13, 30)
d2 = datetime(2009, 1, 1, 4, 50)

In [4]:

# !python -m spacy download en_core_web_sm
# Load the small English spaCy pipeline into `nlp`.
# NOTE(review): no visible later cell reads `nlp` (the cells that used it are
# commented out); the analyzer above is configured with its own models.
nlp = spacy.load('en_core_web_sm')  # Example for English


In [5]:
# import spacy
# nlp = spacy.blank("en")
# nlp.add_pipe(
#     "hf_text_pipe",
#     config={"model": "distilbert-base-uncased-finetuned-sst-2-english"},
# )
# doc = nlp("This is great!")
# print(doc.cats)
# # {'POSITIVE': 0.9998694658279419, 'NEGATIVE': 0.00013048505934420973}

In [6]:
# import spacy
# from spacy.language import Language

# # Load an existing spaCy model
# nlp = spacy.load('en_core_web_sm')

# @Language.component("custom_component")
# def custom_component(doc):
#     # Example custom processing: print the text
#     print("Processing:", doc.text)
#     return doc

# # Add the custom component to the pipeline
# nlp.add_pipe("custom_component")

# # Use the model as usual
# doc = nlp("This is a test sentence.")


In [7]:


# # Define which model to use
# model_config = [{"lang_code": "en", "model_name": {
#     "spacy": "en_core_web_sm",  # use a small spaCy model for lemmas, tokens etc.
#     "transformers": "obi/deid_roberta_i2b2"
#     }
# }]

# # Map transformers model labels to Presidio's
# model_to_presidio_entity_mapping = dict(
#     PER="PERSON",
#     PERSON="PERSON",
#     LOC= "LOCATION",
#     LOCATION= "LOCATION",
#     GPE="LOCATION",
#     ORG="ORGANIZATION",
#     ORGANIZATION="ORGANIZATION",
#     NORP="NRP",
#     AGE="AGE",
#     ID="ID",
#     EMAIL="EMAIL",
#     PATIENT="PERSON",
#     STAFF="PERSON",
#     HOSP="ORGANIZATION",
#     PATORG="ORGANIZATION",
#     DATE="DATE_TIME",
#     TIME="DATE_TIME",
#     PHONE="PHONE_NUMBER",
#     HCW="PERSON",
#     HOSPITAL="ORGANIZATION",
#     FACILITY="LOCATION",
# )

# ner_model_configuration = NerModelConfiguration(labels_to_ignore = ["O"], 
#                                                 model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)

# nlp_engine = TransformersNlpEngine(models=model_config,
#                                    ner_model_configuration=ner_model_configuration)

# # Set up the engine, loads the NLP module (spaCy model by default) 
# # and other PII recognizers
# analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)


In [8]:
# Sample tutoring transcript used as the input text for the analyzer,
# anonymizer and highlighting cells. The start/end offsets in the analyzer
# results index into this exact string (including its leading newline), so
# editing the text shifts every reported position.
text_transcript = """
**Tutor:** "Hello, Alex! I'm Jionghao Lin, your math tutor. Today we're going to work on fractions, specifically adding and subtracting them. Are you ready?"

**Student (Alex):** "Yes, I think so. Fractions are a bit confusing, though."

**Tutor:** "No worries, we'll tackle them step by step. Let's start with adding fractions. Do you remember what we need to do first?"

**Alex:** "We need to make sure the denominators are the same, right?"

**Tutor:** "Exactly, Alex! That's very important. If we have 1/4 + 3/4, what would be the sum?"

**Alex:** "Since the denominators are the same, it would be 4/4, which is the same as one whole, right?"

**Tutor:** "Spot on! Now, what if we have different denominators, like 1/2 + 1/3?"

**Alex:** "We need to find a common denominator, right? But I'm not sure how to do that."

**Tutor:** "Correct, we need a common denominator. For 1/2 and 1/3, the smallest number that both 2 and 3 can divide into evenly is 6. So, we convert the fractions to have the denominator of 6. Can you try that?"

**Alex:** "So, 1/2 becomes 3/6, and 1/3 becomes 2/6?"

**Tutor:** "Exactly, well done! Now, what's 3/6 + 2/6?"

**Alex:** "Is it 5/6?"

**Tutor:** "Perfect! You're getting the hang of this. Let's try some subtraction next. Ready?"

**Alex:** "Yes, let's do it!"
"""

In [9]:
# result.text

In [10]:
# Analyze in different languages (the Spanish run is currently disabled).
# results_spanish = analyzer.analyze(text=text_transcript, language="es")
# print(results_spanish)

# Run the analyzer on the transcript as English text and print the detected
# entities (type, start, end, score).
# NOTE(review): return_decision_process=True presumably attaches per-result
# explanation data — confirm against the Presidio docs; the printed output
# does not show it.
results_english = analyzer.analyze(text=text_transcript, language="en", 
                        return_decision_process=True)
print(results_english)

[type: PERSON, start: 20, end: 24, score: 0.85, type: PERSON, start: 30, end: 42, score: 0.85, type: DATE_TIME, start: 61, end: 66, score: 0.85, type: PERSON, start: 171, end: 175, score: 0.85, type: PERSON, start: 376, end: 380, score: 0.85, type: PERSON, start: 467, end: 471, score: 0.85, type: PERSON, start: 545, end: 549, score: 0.85, type: PERSON, start: 735, end: 739, score: 0.85, type: PERSON, start: 1040, end: 1044, score: 0.85, type: PERSON, start: 1152, end: 1156, score: 0.85, type: PERSON, start: 1272, end: 1276, score: 0.85]


In [11]:
# results_english 

# Pick a random whole-day offset between d1 and d2 (both ends inclusive) and
# format the resulting date as MM/DD/YYYY.
# NOTE(review): this rebinds `random_date` — defined as a function in an
# earlier cell — to a string. The operators cell further down reads this
# string as the DATE_TIME replacement value, so it must run before that cell.
random_date = (d1 + timedelta(days=random.randint(0, (d2 - d1).days))).strftime('%m/%d/%Y')
random_date 

# Disabled alternative: a helper drawing from a fixed 2000-2020 range.
# # Function to generate a random date
# def generate_random_date():
#     start_date = datetime(2000, 1, 1)
#     end_date = datetime(2020, 1, 1)
#     return (start_date + (end_date - start_date) * random.random()).strftime("%Y-%m-%d")

'11/23/2008'

In [12]:
# Updated code: change tutor's and student's names to different fake names.

# Initialize the analyzer and anonymizer.
# NOTE(review): this rebinds `analyzer`, replacing the instance built in the
# first cell with an explicit spaCy engine and supported_languages=["en", "es"];
# every later `analyzer.analyze(...)` call uses this argument-less instance.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Date window used by generate_random_date(): 1 Jan 2000 through 31 Dec 2020.
# This rebinds the d1/d2 values assigned in an earlier cell.
d1 = datetime(2000, 1, 1)
d2 = datetime(2020, 12, 31)

# Draw a fake first name that has not been handed out yet
def generate_fake_name(existing_names):
    """Return a first name from `names.get_first_name()` that is not in `existing_names`.

    Keeps drawing until an unused name comes up; `existing_names` itself is
    not modified, so the caller is responsible for recording the result.
    """
    candidate = names.get_first_name()
    while candidate in existing_names:
        candidate = names.get_first_name()
    return candidate

# Random date inside the module-level [d1, d2] window
def generate_random_date():
    """Return a random date between `d1` and `d2` as an 'MM/DD/YYYY' string.

    The offset from `d1` is a whole number of days chosen uniformly from
    0..(d2 - d1).days, both ends included. `d1` and `d2` are read from the
    enclosing module at call time.
    """
    span_days = (d2 - d1).days
    offset = timedelta(days=random.randint(0, span_days))
    return (d1 + offset).strftime('%m/%d/%Y')

# Analyze the text to find PII (only PERSON and DATE_TIME entities are requested).
results_english = analyzer.analyze(text=text_transcript, entities=["PERSON", "DATE_TIME"], language="en")

# Collect each distinct detected PERSON string once, in order of first appearance.
original_names = list(dict.fromkeys(
    text_transcript[detection.start:detection.end]
    for detection in results_english
    if detection.entity_type == "PERSON"
))

# Map every original name to its own fake name so repeated mentions of the
# same person are replaced consistently.
# `existing_names` is seeded with the originals so that a generated fake can
# never equal a real name found in the text (which would leave it un-anonymized
# or make two different people look like the same one).
# NOTE: name_mapping's keys are the original names, i.e. the PII itself. The
# print below is for inspection only — do not ship this dictionary or its
# printed output alongside the anonymized text.
existing_names = set(original_names)
name_mapping = {}
for original_name in original_names:
    fake_name = generate_fake_name(existing_names)
    name_mapping[original_name] = fake_name
    existing_names.add(fake_name)

print(name_mapping)

operators = {
    # Look each detected span up in the mapping; unknown spans are left unchanged.
    "PERSON": OperatorConfig("custom", {"lambda": lambda text: name_mapping.get(text, text)}),
    # Draw the replacement date from this cell's d1..d2 window instead of relying
    # on a `random_date` global left behind by an earlier scratch cell.
    "DATE_TIME": OperatorConfig("replace", {"new_value": generate_random_date()}),
}

# Anonymize the text
result = anonymizer.anonymize(
    text=text_transcript,
    analyzer_results=results_english,
    operators=operators
)

print(result)

{'Alex': 'Francisco', 'Jionghao Lin': 'Mary'}
text: 
**Tutor:** "Hello, Francisco! I'm Mary, your math tutor. 11/23/2008 we're going to work on fractions, specifically adding and subtracting them. Are you ready?"

**Student (Francisco):** "Yes, I think so. Fractions are a bit confusing, though."

**Tutor:** "No worries, we'll tackle them step by step. Let's start with adding fractions. Do you remember what we need to do first?"

**Francisco:** "We need to make sure the denominators are the same, right?"

**Tutor:** "Exactly, Francisco! That's very important. If we have 1/4 + 3/4, what would be the sum?"

**Francisco:** "Since the denominators are the same, it would be 4/4, which is the same as one whole, right?"

**Tutor:** "Spot on! Now, what if we have different denominators, like 1/2 + 1/3?"

**Francisco:** "We need to find a common denominator, right? But I'm not sure how to do that."

**Tutor:** "Correct, we need a common denominator. For 1/2 and 1/3, the smallest number that both

In [13]:
# Old version
# Invoke the anonymize function with the text, 
# analyzer results (potentially coming from presidio-analyzer) and
# Operators to get the anonymization output:
# result = anonymizer.anonymize(
#     text=text_transcript,
#     analyzer_results=results_english ,
#     operators={"PERSON": OperatorConfig("replace", {"new_value": names.get_first_name()}),\
#                "DATE_TIME":OperatorConfig("replace", {"new_value": random_date})},
# )

# print(result)

In [14]:
# Show only the anonymized transcript (the `text` attribute of the anonymizer result).
print(result.text)


**Tutor:** "Hello, Francisco! I'm Mary, your math tutor. 11/23/2008 we're going to work on fractions, specifically adding and subtracting them. Are you ready?"

**Student (Francisco):** "Yes, I think so. Fractions are a bit confusing, though."

**Tutor:** "No worries, we'll tackle them step by step. Let's start with adding fractions. Do you remember what we need to do first?"

**Francisco:** "We need to make sure the denominators are the same, right?"

**Tutor:** "Exactly, Francisco! That's very important. If we have 1/4 + 3/4, what would be the sum?"

**Francisco:** "Since the denominators are the same, it would be 4/4, which is the same as one whole, right?"

**Tutor:** "Spot on! Now, what if we have different denominators, like 1/2 + 1/3?"

**Francisco:** "We need to find a common denominator, right? But I'm not sure how to do that."

**Tutor:** "Correct, we need a common denominator. For 1/2 and 1/3, the smallest number that both 2 and 3 can divide into evenly is 6. So, we convert

In [15]:
# Display the analyzer results computed in the anonymization cell (PERSON and DATE_TIME only).
results_english

[type: PERSON, start: 20, end: 24, score: 0.85,
 type: PERSON, start: 30, end: 42, score: 0.85,
 type: DATE_TIME, start: 61, end: 66, score: 0.85,
 type: PERSON, start: 171, end: 175, score: 0.85,
 type: PERSON, start: 376, end: 380, score: 0.85,
 type: PERSON, start: 467, end: 471, score: 0.85,
 type: PERSON, start: 545, end: 549, score: 0.85,
 type: PERSON, start: 735, end: 739, score: 0.85,
 type: PERSON, start: 1040, end: 1044, score: 0.85,
 type: PERSON, start: 1152, end: 1156, score: 0.85,
 type: PERSON, start: 1272, end: 1276, score: 0.85]

In [16]:
# For each detection, show the matched substring of the transcript with its start/end offsets.
[(text_transcript[res.start:res.end],res.start, res.end) for res in results_english]

[('Alex', 20, 24),
 ('Jionghao Lin', 30, 42),
 ('Today', 61, 66),
 ('Alex', 171, 175),
 ('Alex', 376, 380),
 ('Alex', 467, 471),
 ('Alex', 545, 549),
 ('Alex', 735, 739),
 ('Alex', 1040, 1044),
 ('Alex', 1152, 1156),
 ('Alex', 1272, 1276)]

In [17]:
# Echo every detection first ...
for res in results_english:
    print(res)

# ... then keep just the fields the highlighter needs, one plain dict per detection.
entities_info = [
    {'type': res.entity_type, 'start': res.start, 'end': res.end}
    for res in results_english
]

type: PERSON, start: 20, end: 24, score: 0.85
type: PERSON, start: 30, end: 42, score: 0.85
type: DATE_TIME, start: 61, end: 66, score: 0.85
type: PERSON, start: 171, end: 175, score: 0.85
type: PERSON, start: 376, end: 380, score: 0.85
type: PERSON, start: 467, end: 471, score: 0.85
type: PERSON, start: 545, end: 549, score: 0.85
type: PERSON, start: 735, end: 739, score: 0.85
type: PERSON, start: 1040, end: 1044, score: 0.85
type: PERSON, start: 1152, end: 1156, score: 0.85
type: PERSON, start: 1272, end: 1276, score: 0.85


In [18]:
# entities_info = [
#     {"type": "PERSON", "start": 61, "end": 68, "score": 0.85},
#     {"type": "PERSON", "start": 530, "end": 534, "score": 0.85},
#     {"type": "PERSON", "start": 567, "end": 571, "score": 0.85},
#     {"type": "DATE_TIME", "start": 1868, "end": 1872, "score": 0.85},
#     {"type": "LOCATION", "start": 2912, "end": 2918, "score": 0.85}
# ]

# Ensure entities are sorted by their start position.
# This sorts the list built in the previous cell in place.
entities_info.sort(key=lambda x: x["start"])


def highlight_text(text, entities):
    """Return `text` with every entity span wrapped in brackets and ANSI colour codes.

    Args:
        text: the string the entity offsets refer to.
        entities: iterable of dicts with integer 'start' and 'end' keys
            (end exclusive). Order does not matter and the iterable is not
            modified; other keys are ignored.

    Returns:
        A new string in which each span appears as
        ``'\\x1b[6;30;42m[<span text>]\\x1b[0m'`` and all other text is copied
        through unchanged. Spans that overlap are merged into a single
        highlighted region so no character is emitted twice; spans that merely
        touch are highlighted separately.

    NOTE(review): this function is defined again in a later cell of this
    notebook; keeping a single definition would avoid silent shadowing.
    """
    # Sort locally so callers need not pre-sort, then merge overlapping spans.
    merged = []
    for start, end in sorted((entity['start'], entity['end']) for entity in entities):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    last_end = 0
    for start, end in merged:
        # Plain text between the previous span and this one.
        pieces.append(text[last_end:start])
        # The highlighted span itself.
        pieces.append('\x1b[6;30;42m' + f"[{text[start:end]}]" + '\x1b[0m')
        last_end = end
    # Whatever follows the last span.
    pieces.append(text[last_end:])
    return ''.join(pieces)


def highlight_preserving_format(text, entities):
    """Return `text` with entity spans highlighted, line breaks left as they are.

    Thin wrapper around `highlight_text`: highlighting only inserts markers
    around the spans and never touches newline characters, so the original
    line structure is already preserved. (Splitting the result on '\\n' and
    joining it back with '\\n' reproduces the same string, so that round trip
    is omitted.)

    Args:
        text: the string the entity offsets refer to.
        entities: entity dicts with 'start' and 'end' keys, as accepted by
            `highlight_text`.

    Returns:
        The highlighted string produced by `highlight_text`.
    """
    return highlight_text(text, entities)

# Assume `text` is your long text string



# def generate_html(highlighted_text):
#     return f"""<!DOCTYPE html>
# <html>
# <head>
#     <title>Highlighted Entities</title>
#     <style>
#         mark {{
#             background-color: yellow;
#             color: black;
#         }}
#     </style>
# </head>
# <body>
#     {highlighted_text}
# </body>
# </html>
# """


# highlighted_text = highlight_entities(text_transcript, entities_info)

# Highlight the detected entities in the original transcript.
highlighted = highlight_preserving_format(text_transcript, entities_info)

# Print the result; the ANSI escape codes render as coloured text in a terminal or notebook output.
print(highlighted)

# html_output = generate_html(highlighted_text)
# with open("highlighted_entities.html", "w", encoding="utf-8") as file:
#     file.write(html_output)

# You can now display `html_output` in an HTML viewer, or write it to an HTML file.
# with open("highlighted_text.html", "w") as file:
#     file.write(html_output)


# html_output = f"""
# <html>
# <head>
#     <title>Highlighted Text</title>
#     <style>
#         mark {{
#             background-color: yellow;
#             color: black;
#         }}
#         body {{
#             white-space: pre-wrap; /* Respect text line breaks and spaces */
#         }}
#     </style>
# </head>
# <body>
#     <p>{highlighted_text}</p>
# </body>
# </html>
# """

# with open("highlighted_text.html", "w", encoding="utf-8") as file:
#     file.write(html_output)


**Tutor:** "Hello, [6;30;42m[Alex][0m! I'm [6;30;42m[Jionghao Lin][0m, your math tutor. [6;30;42m[Today][0m we're going to work on fractions, specifically adding and subtracting them. Are you ready?"

**Student ([6;30;42m[Alex][0m):** "Yes, I think so. Fractions are a bit confusing, though."

**Tutor:** "No worries, we'll tackle them step by step. Let's start with adding fractions. Do you remember what we need to do first?"

**[6;30;42m[Alex][0m:** "We need to make sure the denominators are the same, right?"

**Tutor:** "Exactly, [6;30;42m[Alex][0m! That's very important. If we have 1/4 + 3/4, what would be the sum?"

**[6;30;42m[Alex][0m:** "Since the denominators are the same, it would be 4/4, which is the same as one whole, right?"

**Tutor:** "Spot on! Now, what if we have different denominators, like 1/2 + 1/3?"

**[6;30;42m[Alex][0m:** "We need to find a common denominator, right? But I'm not sure how to do that."

**Tutor:** "Correct, we need a common denominato

In [19]:
# Re-run the analyzer on the anonymized transcript to see what is still
# detected after replacement. No `entities` filter is passed here, unlike the
# anonymization cell's analyze call.
results_english_replaced = analyzer.analyze(text=result.text, language="en", 
                        return_decision_process=True)
print(results_english_replaced)

[type: LOCATION, start: 20, end: 29, score: 0.85, type: PERSON, start: 35, end: 39, score: 0.85, type: LOCATION, start: 173, end: 182, score: 0.85, type: LOCATION, start: 479, end: 488, score: 0.85, type: DATE_TIME, start: 58, end: 68, score: 0.6]


In [20]:
# Echo every detection found in the anonymized text ...
for res in results_english_replaced:
    print(res)

# ... then keep just the fields the highlighter needs, one plain dict per detection.
entities_info_replaced = [
    {'type': res.entity_type, 'start': res.start, 'end': res.end}
    for res in results_english_replaced
]

type: LOCATION, start: 20, end: 29, score: 0.85
type: PERSON, start: 35, end: 39, score: 0.85
type: LOCATION, start: 173, end: 182, score: 0.85
type: LOCATION, start: 479, end: 488, score: 0.85
type: DATE_TIME, start: 58, end: 68, score: 0.6


In [21]:
# entities_info = [
#     {"type": "PERSON", "start": 61, "end": 68, "score": 0.85},
#     {"type": "PERSON", "start": 530, "end": 534, "score": 0.85},
#     {"type": "PERSON", "start": 567, "end": 571, "score": 0.85},
#     {"type": "DATE_TIME", "start": 1868, "end": 1872, "score": 0.85},
#     {"type": "LOCATION", "start": 2912, "end": 2918, "score": 0.85}
# ]

# Ensure entities are sorted by their start position.
# This sorts the list built in the previous cell in place; the analyzer output
# it was built from is not ordered by start (the DATE_TIME result comes last).
entities_info_replaced .sort(key=lambda x: x["start"])


def highlight_text(text, entities):
    """Return `text` with every entity span wrapped in brackets and ANSI colour codes.

    NOTE(review): `highlight_text` is already defined in an earlier cell of
    this notebook; this cell re-defines it. Keeping a single definition would
    avoid silent shadowing.

    Args:
        text: the string the entity offsets refer to.
        entities: iterable of dicts with integer 'start' and 'end' keys
            (end exclusive). Order does not matter and the iterable is not
            modified; other keys are ignored.

    Returns:
        A new string in which each span appears as
        ``'\\x1b[6;30;42m[<span text>]\\x1b[0m'`` and all other text is copied
        through unchanged. Spans that overlap are merged into a single
        highlighted region so no character is emitted twice; spans that merely
        touch are highlighted separately.
    """
    # Sort locally so callers need not pre-sort, then merge overlapping spans.
    merged = []
    for start, end in sorted((entity['start'], entity['end']) for entity in entities):
        if merged and start < merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    last_end = 0
    for start, end in merged:
        # Plain text between the previous span and this one.
        pieces.append(text[last_end:start])
        # The highlighted span itself.
        pieces.append('\x1b[6;30;42m' + f"[{text[start:end]}]" + '\x1b[0m')
        last_end = end
    # Whatever follows the last span.
    pieces.append(text[last_end:])
    return ''.join(pieces)


def highlight_preserving_format(text, entities):
    """Return `text` with entity spans highlighted, line breaks left as they are.

    NOTE(review): this function is already defined in an earlier cell of this
    notebook; this cell re-defines it.

    Thin wrapper around `highlight_text`: highlighting only inserts markers
    around the spans and never touches newline characters, so the original
    line structure is already preserved. (Splitting the result on '\\n' and
    joining it back with '\\n' reproduces the same string, so that round trip
    is omitted.)

    Args:
        text: the string the entity offsets refer to.
        entities: entity dicts with 'start' and 'end' keys, as accepted by
            `highlight_text`.

    Returns:
        The highlighted string produced by `highlight_text`.
    """
    return highlight_text(text, entities)

# Assume `text` is your long text string



# def generate_html(highlighted_text):
#     return f"""<!DOCTYPE html>
# <html>
# <head>
#     <title>Highlighted Entities</title>
#     <style>
#         mark {{
#             background-color: yellow;
#             color: black;
#         }}
#     </style>
# </head>
# <body>
#     {highlighted_text}
# </body>
# </html>
# """


# highlighted_text = highlight_entities(text_transcript, entities_info)

# Highlight what the analyzer still detects in the anonymized transcript.
highlighted = highlight_preserving_format(result.text, entities_info_replaced)

# Print the result; the ANSI escape codes render as coloured text in a terminal or notebook output.
print(highlighted)

# html_output = generate_html(highlighted_text)
# with open("highlighted_entities.html", "w", encoding="utf-8") as file:
#     file.write(html_output)

# You can now display `html_output` in an HTML viewer, or write it to an HTML file.
# with open("highlighted_text.html", "w") as file:
#     file.write(html_output)


# html_output = f"""
# <html>
# <head>
#     <title>Highlighted Text</title>
#     <style>
#         mark {{
#             background-color: yellow;
#             color: black;
#         }}
#         body {{
#             white-space: pre-wrap; /* Respect text line breaks and spaces */
#         }}
#     </style>
# </head>
# <body>
#     <p>{highlighted_text}</p>
# </body>
# </html>
# """

# with open("highlighted_text.html", "w", encoding="utf-8") as file:
#     file.write(html_output)


**Tutor:** "Hello, [6;30;42m[Francisco][0m! I'm [6;30;42m[Mary][0m, your math tutor. [6;30;42m[11/23/2008][0m we're going to work on fractions, specifically adding and subtracting them. Are you ready?"

**Student ([6;30;42m[Francisco][0m):** "Yes, I think so. Fractions are a bit confusing, though."

**Tutor:** "No worries, we'll tackle them step by step. Let's start with adding fractions. Do you remember what we need to do first?"

**Francisco:** "We need to make sure the denominators are the same, right?"

**Tutor:** "Exactly, [6;30;42m[Francisco][0m! That's very important. If we have 1/4 + 3/4, what would be the sum?"

**Francisco:** "Since the denominators are the same, it would be 4/4, which is the same as one whole, right?"

**Tutor:** "Spot on! Now, what if we have different denominators, like 1/2 + 1/3?"

**Francisco:** "We need to find a common denominator, right? But I'm not sure how to do that."

**Tutor:** "Correct, we need a common denominator. For 1/2 and 1/3, t