In [17]:
# !pip install presidio_analyzer
# !pip install presidio-anonymizer
# !pip install names
# from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
# from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
# from presidio_anonymizer import AnonymizerEngine
# from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
# import names
# import random
# from random import randrange
# from datetime import timedelta, datetime
# import spacy
# import json
# import re

# # Create configuration containing engine name and models
# def get_configuration(input_model_name : str):
#     configuration = {
#         "nlp_engine_name": "spacy",
#         "models": [{"lang_code": "es", "model_name": "es_core_news_md"},
#                     {"lang_code": "en", "model_name": input_model_name}],
#     }
#     return configuration

# # Create NLP engine based on configuration
# # Possible input_model_name: en_core_web_trf, en_core_web_lg
# provider = NlpEngineProvider(nlp_configuration=get_configuration("en_core_web_trf"))
# nlp_engine_with_spanish = provider.create_engine()

# # Pass the created NLP engine and supported_languages to the AnalyzerEngine
# analyzer = AnalyzerEngine(
#     nlp_engine=nlp_engine_with_spanish, 
#     supported_languages=["en", "es"]
# )

In [1]:
# Install required packages if not already installed
# Change ! to % if you are using VSCode
# !pip install presidio_analyzer
# !pip install "presidio_analyzer[transformers]"
# !pip install presidio-anonymizer
# !pip install names
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy
import json
import re
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Choose one:
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_trf

# Create configuration containing engine name and models
def get_configuration(spaCy_model: str):
    configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": spaCy_model}],
    }

    return configuration

def get_conf_file(spaCy_model: str, transformer_model: str = None):
    snapshot_download(repo_id=transformer_model)
    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformer_model)
    AutoModelForTokenClassification.from_pretrained(transformer_model)

    config_dict = {
        "en_core_web_lg + obi/deid_roberta_i2b2": "Config/lg+roberta.yaml",
        "en_core_web_lg + StanfordAIMI/stanford-deidentifier-base": "Config/lg+stanford.yaml",
        "en_core_web_trf + obi/deid_roberta_i2b2": "Config/trf+roberta.yaml",
        "en_core_web_trf + StanfordAIMI/stanford-deidentifier-base": "Config/trf+stanford.yaml",
    }

    # Create configuration containing engine name and models
    conf_file = config_dict[spaCy_model + ' + ' + transformer_model]

    return conf_file

# Function to create NLP engine based on configuration
def create_nlp_engine(spaCy_model: str, transformer_model: str = None):
    if spaCy_model not in ["en_core_web_lg", "en_core_web_trf"]:
        raise ValueError("Input spaCy model is not supported.")
    if transformer_model is not None:
        if transformer_model not in ["obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base"]:
            print(transformer_model)
            raise ValueError("Input transformer model is not supported.")
    
    # spaCy model only
    if transformer_model is None:
        configuration = get_configuration(spaCy_model)
        provider = NlpEngineProvider(nlp_configuration=configuration)

    # spaCy model with transformer
    else:
        conf_file = get_conf_file(spaCy_model, transformer_model)
        provider = NlpEngineProvider(conf_file=conf_file)
    
    nlp_engine = provider.create_engine()
    return nlp_engine


# Possible spaCy_model: "en_core_web_lg", "en_core_web_trf"
# Possible transformer_model: "obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base"

# Example usage:
# Using only spaCy model
nlp_engine_spacy_only = create_nlp_engine(spaCy_model = "en_core_web_lg")

# Using spaCy model with an additional transformer model
# nlp_engine_with_transformer = create_nlp_engine(spaCy_model = "en_core_web_lg",
#                                                 transformer_model = "obi/deid_roberta_i2b2")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
analyzer = AnalyzerEngine(
    nlp_engine = nlp_engine_spacy_only, # nlp_engine_spacy_only or nlp_engine_with_transformer
    supported_languages=["en", "es"]
)
# Note for future work:
# def anonymizer(transcripts, model_name, tokenizer)
# def anonymizer(transcripts, model_name, tokenizer, flag)
# return redacted_text, position_index, entity_name

In [9]:
# If you use a transformer model.
import openai
from presidio_analyzer import EntityRecognizer, RecognizerResult

# Set your OpenAI API key
openai.api_key = 'your-api-key'

class OpenAIRecognizer(EntityRecognizer):
    def __init__(self):
        super().__init__(supported_entities=["PERSON", "LOCATION", "ORGANIZATION", "DATE", "TIME", "MONEY", "EMAIL", 
                                             "PHONE", "GPE", "TITLE", "MISC", "ID", "AGE"], supported_language="en")

    def load(self):
        pass

    def get_supported_entities(self):
        return self.supported_entities

    def analyze(self, text, entities, nlp_artifacts=None):
        example_text = "John Doe visited the hospital on January 3rd, 2020. His email is john.doe@example.com."
        example_output = {
            "entities": [
                {"entity_type": "PERSON", "start": 0, "end": 8, "entity_text": "John Doe"},
                {"entity_type": "DATE", "start": 33, "end": 50, "entity_text": "January 3rd, 2020"},
                {"entity_type": "EMAIL", "start": 65, "end": 85, "entity_text": "john.doe@example.com"}
            ]
        }

        messages = [
            {"role": "system", "content": "You are a helpful assistant that identifies PII in text and returns the result in JSON format."},
            {"role": "user", "content": "I have some text that might contain sensitive information."},
            {"role": "assistant", "content": "Sure, I can help with that. Please provide the text."},
            {"role": "user", "content": f"Here is an example text: {example_text}"},
            {"role": "assistant", "content": f"The JSON output for the example text is: {example_output}. Notice that the start and end index is exactly where you would extract the PII using string slicing in python, i.e., Text[Start:End]."},
            {"role": "user", "content": f"PII can include any kind of names, date and time, email address, location, address, phone number, links, and anything that you considered having a risk of exposing one's private information. Now, please identify PII in the following text and provide the result in the same JSON format. Text: {text}"}
        ]

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            # model="gpt-4o",
            messages=messages,
            max_tokens=500
        )

        response_text = response.choices[0].message['content'].strip()
        response_json = json.loads(response_text)

        results = []
        for entity in response_json["entities"]:
            results.append(RecognizerResult(
                entity_type=entity["entity_type"],
                start=entity["start"],
                end=entity["end"],
                score=0.85  # Placeholder confidence score
            ))

        return results

# Example usage
text = "Jane Doe visited the school on March 5th, 2021. Her phone number is 123-456-7890."
recognizer = OpenAIRecognizer()
results = recognizer.analyze(text, None)

for result in results:
    print(f"Entity: {result.entity_type}, Text: {text[result.start:result.end]}, Start: {result.start}, End: {result.end}")


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [21]:
# Load the processed_transcripts here
with open('processed_transcripts.txt', 'r') as f:
    text_transcripts = json.load(f)

# Verify the content
print(len(text_transcripts))

260


In [22]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Create the recognizer registry and add the custom recognizer
registry = RecognizerRegistry()
registry.add_recognizer(OpenAIRecognizer())

# Create the AnalyzerEngine with the custom recognizer
analyzer = AnalyzerEngine(registry=registry)

# Example usage with AnalyzerEngine
text = text_transcripts[0] # I only run the first example
results = analyzer.analyze(text=text, language="en")

for result in results:
    print(f"Entity: {result.entity_type}, Text: {text[result.start:result.end]}, Start: {result.start}, End: {result.end}")


['1. `PERSON_NAME', '12', '24', 'Emma Johnson`']
['2. `PERSON_NAME', '952', '964', 'Emma Johnson`']
['3. `PERSON_NAME', '1741', '1753', 'Emma Johnson`']
['4. `PERSON_NAME', '2454', '2466', 'Emma Johnson`']
['5. `PERSON_NAME', '3246', '3258', 'Emma Johnson`']
['6. `PERSON_NAME', '3710', '3722', 'Emma Johnson`']
Entity: 1. `PERSON_NAME, Text: Emma Johnson, Start: 12, End: 24
Entity: 2. `PERSON_NAME, Text:  kind of und, Start: 952, End: 964
Entity: 3. `PERSON_NAME, Text:  can you ple, Start: 1741, End: 1753
Entity: 4. `PERSON_NAME, Text: K this is wh, Start: 2454, End: 2466
Entity: 5. `PERSON_NAME, Text: e Sahara is , Start: 3246, End: 3258
Entity: 6. `PERSON_NAME, Text: n't know muc, Start: 3710, End: 3722


In [None]:
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
from azure.ai.textanalytics import TextAnalyticsClient, AzureKeyCredential, TextAnalyticsApiKeyCredential
from azure.core.credentials import AzureKeyCredential

class AzureNlpEngine:
    def __init__(self, endpoint, api_key):
        self.client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(api_key))

    def process_text(self, text, language):
        documents = [{"id": "1", "language": language, "text": text}]
        response = self.client.recognize_pii_entities(documents)
        result = response[0]

        entities = []
        for entity in result.entities:
            entities.append({
                "start": entity.offset,
                "end": entity.offset + entity.length,
                "entity_type": entity.category
            })
        return entities

# Example usage
endpoint = "YOUR_AZURE_ENDPOINT"
api_key = "YOUR_AZURE_API_KEY"
azure_engine = AzureNlpEngine(endpoint, api_key)

text = "John Doe visited the hospital on January 3rd, 2020."
entities = azure_engine.process_text(text, language="en")

for entity in entities:
    print(f"Entity: {entity['entity_type']}, Text: {text[entity['start']:entity['end']]}, Start: {entity['start']}, End: {entity['end']}")


In [None]:
# Recognizer for Azure AI Language
class AzureRecognizer(EntityRecognizer):
    def __init__(self, azure_engine):
        super().__init__(supported_entities=["PERSON", "LOCATION", "ORGANIZATION", "DATE", "TIME", "MONEY", "EMAIL", 
                                             "PHONE", "GPE", "TITLE", "MISC", "ID", "AGE"], supported_language="en")
        self.azure_engine = azure_engine

    def load(self):
        pass

    def get_supported_entities(self):
        return self.supported_entities

    def analyze(self, text, entities, nlp_artifacts=None):
        azure_entities = self.azure_engine.process_text(text, language="en")

        results = []
        for azure_entity in azure_entities:
            results.append(RecognizerResult(
                entity_type=azure_entity["entity_type"],
                start=azure_entity["start"],
                end=azure_entity["end"],
                score=0.85  # Placeholder confidence score
            ))
        return results

# Initialize the Azure NLP engine
azure_engine = AzureNlpEngine(endpoint="YOUR_AZURE_ENDPOINT", api_key="YOUR_AZURE_API_KEY")

# Create the recognizer registry and add the Azure recognizer
registry = RecognizerRegistry()
registry.add_recognizer(AzureRecognizer(azure_engine))

# Create the AnalyzerEngine with the custom recognizer
analyzer = AnalyzerEngine(registry=registry)

# Example usage with AnalyzerEngine
text = "John Doe visited the hospital on January 3rd, 2020."
results = analyzer.analyze(text=text, language="en")

for result in results:
    print(f"Entity: {result.entity_type}, Text: {text[result.start:result.end]}, Start: {result.start}, End: {result.end}")


In [20]:
# Updated code: change tutor's and student's names to different fake names.
# !pip install faker
from faker import Faker

def de_identify_pii(text_transcript):
    # Initialize the analyzer and anonymizer
    analyzer = AnalyzerEngine()
    anonymizer = AnonymizerEngine()

    # Define date range for generating random dates and generate a random date
    d1 = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
    d2 = datetime.strptime('1/1/2009 4:50 AM', '%m/%d/%Y %I:%M %p')
    random_date = (d1 + timedelta(days=random.randint(0, (d2 - d1).days))).strftime('%m/%d/%Y')

    fake = Faker()

    # Function to generate a unique fake name
    def generate_fake_name(existing_names):
        while True:
            fake_name = names.get_first_name()
            if fake_name not in existing_names:
                return fake_name
    
    # Function to generate a unique fake email
    def generate_fake_email(fake_name):
        domains = ["gmail.com", "sina.com", "outlook.com"]
        return f"{fake_name.lower()}@{random.choice(domains)}"
    
    # Function to generate a unique fake location
    def generate_fake_location():
        return fake.city()  # Generate a fake city name using Faker

    # Function to generate a unique fake phone number
    def generate_fake_phone_number():
        return f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}"
    
    # Create an allow list to exclude words from being identified as PII
    allow_list = [
        "Today",
        "today",
        "Yesterday",
        "yesterday",
        "Tomorrow",
        "tomorrow"
    ]

    # Analyze the text to find PII
    results_english = analyzer.analyze(text=text_transcript, language="en", return_decision_process=True, allow_list=allow_list)
    # <PERSON>, <DATE_TIME>, ...
    
    # Create a mapping of original names to unique fake names
    # Is creating a mapping good idea? User can easily print out the name_mapping dictionary and see the PII...
    # So we cannot release code... IRB will handle this.
    name_mapping = {}
    existing_names = set()
    for result in results_english:
        if result.entity_type == "PERSON":
            original_name = text_transcript[result.start:result.end]
            if original_name not in name_mapping:
                fake_name = generate_fake_name(existing_names)
                name_mapping[original_name] = fake_name
                existing_names.add(fake_name)

    # print(name_mapping)

    # Email mapping to ensure consistent fake emails
    email_mapping = {}
    for result in results_english:
        if result.entity_type == "EMAIL_ADDRESS":
            original_email = text_transcript[result.start:result.end]
            if original_email not in email_mapping:
                fake_name = generate_fake_name(existing_names)
                fake_email = generate_fake_email(fake_name)
                email_mapping[original_email] = fake_email
    
    # Phone number mapping to ensure consistent fake phone numbers
    phone_mapping = {}
    for result in results_english:
        if result.entity_type == "PHONE_NUMBER":
            original_phone = text_transcript[result.start:result.end]
            if original_phone not in phone_mapping:
                fake_phone = generate_fake_phone_number()
                phone_mapping[original_phone] = fake_phone

    operators = {
        "PERSON": OperatorConfig("custom", {"lambda": lambda text : name_mapping.get(text, text)}),
        "DATE_TIME": OperatorConfig("replace", {"new_value": random_date}),
        # Add more categories
        "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda text: email_mapping.get(text, text)}),
        "LOCATION": OperatorConfig("replace", {"new_value": generate_fake_location()}),
        "PHONE_NUMBER": OperatorConfig("custom", {"lambda": lambda text: phone_mapping.get(text, text)})
    }

    # Anonymize the text
    result = anonymizer.anonymize(
        text=text_transcript,
        analyzer_results=results_english,
        operators=operators
    )

    return result, results_english


In [21]:
# def highlight_text(text, entities):
#     highlighted_text = ""
#     last_end = 0
#     for entity in entities:
#         start, end = entity['start'], entity['end']
#         # Add the text up to the current entity
#         highlighted_text += text[last_end:start]
#         # Highlight the current entity
#         highlighted_text += '\x1b[6;30;42m'+ f"[{text[start:end]}]" +'\x1b[0m'
#         last_end = end
#     # Add any remaining text after the last entity
#     highlighted_text += text[last_end:]
#     return highlighted_text

def highlight_text(text, entities):
    highlighted_text = ""
    last_end = 0
    
    # Sort entities by their start position
    entities = sorted(entities, key=lambda x: x['start'])
    
    for entity in entities:
        start, end = entity['start'], entity['end']
        if start >= last_end:  # Ensure there's no overlap
            # Add the text up to the current entity
            highlighted_text += text[last_end:start]
            # Highlight the current entity
            highlighted_text += '\x1b[6;30;42m' + f"[{text[start:end]}]" + '\x1b[0m'
            last_end = end
    
    # Add any remaining text after the last entity
    highlighted_text += text[last_end:]
    return highlighted_text

def highlight_preserving_format(text, entities):
    # Highlight the text
    highlighted_text = highlight_text(text, entities)
    
    # Split the highlighted text into lines to preserve original formatting
    highlighted_lines = highlighted_text.split('\n')
    
    # Join the lines back into a single string with newline characters
    return '\n'.join(highlighted_lines)

def highlight_orig_text(text_transcript, results_english):
    entities_info = []

    for res in results_english:
        entities_info.append({'type':res.entity_type, 'start':res.start, 'end':res.end})

    # Ensure entities are sorted by their start position
    entities_info.sort(key=lambda x: x["start"])

    # Highlight the text
    highlighted_orig = highlight_preserving_format(text_transcript, entities_info)

    # Printing the highlighted text
    return highlighted_orig

def hightlight_replaced_text(result):
    results_english_replaced = analyzer.analyze(text=result.text, language="en", 
                        return_decision_process=True)

    entities_info_replaced = []

    for res in results_english_replaced:
        # print(res)
        entities_info_replaced.append({'type':res.entity_type, 'start':res.start, 'end':res.end})

    # Ensure entities are sorted by their start position
    entities_info_replaced.sort(key=lambda x: x["start"])

    # Highlight the text
    highlighted_replaced = highlight_preserving_format(result.text, entities_info_replaced)

    highlighted_replaced = re.sub(r'(<[^>]+>)', r'\033[42;30m\1\033[0m', highlighted_replaced)

    # Printing the highlighted text
    return highlighted_replaced

def insert_space(s, i):
    return s[:i] + ' ' + s[i:]

def remove_possible_url(s):
    spaces = []
    for i in range(len(s)-1):
        if s[i] == '.' and s[i+1].isalpha():
            spaces.append(i+1)

    for space in reversed(spaces):  # reversed to not mess up indices
        s = insert_space(s, space)

    return s

In [22]:
def run_all(text_transcripts):
    for text_transcript in text_transcripts[:5]:
        text_transcript = remove_possible_url(text_transcript)
        result, results_english = de_identify_pii(text_transcript)
        highlighted_orig = highlight_orig_text(text_transcript, results_english)
        print(highlighted_orig)
        print("----------------------------------------------------------------------------------------------------------------------------")
        highlighted_replaced = hightlight_replaced_text(result)
        print(highlighted_replaced)
        print("----------------------------------------------------------------------------------------------------------------------------")
        print("----------------------------------------------------------------------------------------------------------------------------")

run_all(text_transcripts)

teacher: Hi [6;30;42m[Emma Johnson][0m, hope I didn't get you up too early!
student: Don't worry, my exam is [6;30;42m[next Saturday][0m, so I should get up early.
teacher: Ah OK, so good practice then... is that an IELTS exam?
student: Exactly.
teacher: I've lost track of how many you've done
student: I hope I can do as well as usual at the real exam. haha
teacher: Do you mean get a score you get in practice tests?
student: Yes, I managed to get 37 on reading [6;30;42m[yesterday][0m!!
teacher: Wow that's really good - well done... you've definitely made a lot of progress. And what's your normal score on listening [6;30;42m[these days][0m?
student: Around 30, the listening is harder for me.
teacher: OK that's quite common to be better at reading - but 30 is still more than respectable. I guess maybe speaking is the toughest part?
student: Yeah, because it's difficult to practice...
teacher: Yeah sure - I guess what we're doing right now is a weird mixture of speaking and writin

student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam is 02/22/2008, s...'
student: Don't worry, my exam i

teacher: Hi Myrl, hope I didn't get you up too early!
student: Don't worry, my exam is [6;30;42m[02/22/2008][0m, so I should get up early.
teacher: Ah OK, so good practice then... is that an IELTS exam?
student: Exactly.
teacher: I've lost track of how many you've done
student: I hope I can do as well as usual at the real exam. haha
teacher: Do you mean get a score you get in practice tests?
student: Yes, I managed to get 37 on reading [6;30;42m[02/22/2008][0m!!
teacher: Wow that's really good - well done... you've [6;30;42m[definitely][0m made a lot of progress. And what's your normal score on listening [6;30;42m[02/22/2008][0m?
student: Around 30, the listening is harder for me.
teacher: OK that's quite common to be better at reading - but 30 is still more than respectable. I guess maybe speaking is the toughest part?
student: Yeah, because it's difficult to practice...
teacher: Yeah sure - I guess what we're doing right now is a weird mixture of speaking and writing.... beca

student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student: Yes, here I am :)
teacher: Hi there - good t...'
student: Hi
student:

teacher: Hi [6;30;42m[Andy][0m - are you there?!
student: Hi
student: Yes, here I am :)
teacher: Hi there - good to speak to you!
teacher: [6;30;42m[Andy][0m - can you tell me something about yourself - whatever you like...
student: ok
student: I'm from [6;30;42m[Lake][0m [6;30;42m[Barbara][0m, from [6;30;42m[Lake][0m [6;30;42m[Barbara][0m. I came back home [6;30;42m[06/28/2008][0m and I'm very happy to see the sun again.
teacher: OK yes I can imagine! It's pretty cold here in [6;30;42m[Lake][0m [6;30;42m[Barbara][0m at the moment (sunny though!). Do you travel to the  [6;30;42m[Lake][0m [6;30;42m[Barbara][0m often?
student: At the moment I'm not working so I have a lot of free time to go for walks outside and enjoy the good weather
teacher: What's your normal work?
student: About traveling to the [6;30;42m[Lake][0m [6;30;42m[Barbara][0m, I don't go often. I lived there for [6;30;42m[06/28/2008][0m until [6;30;42m[06/28/2008][0m and then I went again at [

student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak ...'
student: Hello. Yes,

teacher: Hi [6;30;42m[Duane][0m, are you there?!
student: Hello. Yes, I'm here.
teacher: Hi there - good to speak to you again. [6;30;42m[Duane][0m, I thought I'd make this lesson a bit more focused so here goes! Can i ask you to write a sentnece using the following information: Pacific/biggest
student: Tinaside is the biggest ocean in the world.
teacher: Thank you - another one: Tinaside/longest
student: The Tinaside is the longest river in the world.
teacher: OK nearly there! Everest/Tinaside
student: Tinaside is the highest in the Tinaside.
teacher: Thanks - USA/50 states
student: The Tinaside has 50 states.
teacher: OK last one: London/south/ Tinaside/north
student: Tinaside is in the south of the Tinaside while Tinaside is in the north.
teacher: OK [6;30;42m[Duane][0m thanks a lot for that - I was checking your use of articles with geographical features etc as you might have guessed  - you've passed with flying colours! I'll think of some more stuff to challenge you! In the 

student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'
student: Hi Ms. Ronald, yeah I've just arrived home.
teacher...'


teacher: Hi [6;30;42m[Michele][0m - you made it home!
student: Hi Ms. [6;30;42m[Ronald][0m, yeah I've just arrived home.
teacher: I'm jealous... sometimes it takes me [6;30;42m[06/18/2008][0m to drive home in the rush hour...
student: Oh, I'm sorry for you. I can walk to school in [6;30;42m[06/18/2008][0m, haha
teacher: Yes you're 'just round the corner'... [6;30;42m[Michele][0m, I've planned to ask you to do some stuff on [6;30;42m[vocabulary][0m [6;30;42m[06/18/2008][0m - words to describe people's personality. Can you give me one or two(!) to describe yourself?
student: I'm lazy and selfish, haha
teacher: Really?! Maybe most of us are sometimes.... any positive ones?!
student: Ah, I'm sometimes told I'm mature for my age.
teacher: OK, you mean people sometimes say you're mature?
student: Yeah.
teacher: OK right - that's good I guess! You can say: people say.... OR it's sometimes said (that) I'm.... OR 'I'm said to be mature' - the last two are more formal...
teacher: 

student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Darius
student: I can see more people here, is it...'
student: Hi Ms. Dari

In [23]:
from transformers import pipeline
import spacy

# Load spaCy and transformer model
nlp = spacy.load("en_core_web_lg")
transformer_ner = pipeline("ner", model="obi/deid_roberta_i2b2")

# Example text
text = text_transcripts[0]

# Run spaCy tokenizer
doc = nlp(text)

# Run transformer NER
entities = transformer_ner(text)

# Debugging entity spans and token alignment
for ent in entities:
    # print(ent)
    start, end, label, score = ent['start'], ent['end'], ent['entity'], ent['score']
    span = doc.char_span(start, end, alignment_mode="contract")

    if span is None:
        print(f"Skipping annotation, {ent} is overlapping or can't be aligned for doc '{text}'")
    else:
        print(f"Entity: {label}, Text: {span.text}, Start: {start}, End: {end}, Score: {score}")

# Process with Presidio
results = analyzer.analyze(text=text, language="en")
for result in results:
    print(f"Entity: {result.entity_type}, Text: {text[result.start:result.end]}, Start: {result.start}, End: {result.end}")


Entity: B-PATIENT, Text: Emma, Start: 12, End: 16, Score: 0.9982592463493347
Entity: L-PATIENT, Text: Johnson, Start: 17, End: 24, Score: 0.9992827773094177
Entity: U-DATE, Text: Saturday, Start: 100, End: 108, Score: 0.9993002414703369
Entity: B-PATIENT, Text: Emma, Start: 1433, End: 1437, Score: 0.9917696714401245
Entity: L-PATIENT, Text: Johnson, Start: 1438, End: 1445, Score: 0.9948838353157043
Entity: U-LOC, Text: Tokyo, Start: 1558, End: 1563, Score: 0.9988057613372803
Entity: U-LOC, Text: Japan, Start: 1564, End: 1569, Score: 0.9953567385673523
Entity: U-LOC, Text: Tokyo, Start: 1808, End: 1813, Score: 0.9810948371887207
Entity: U-LOC, Text: Japan, Start: 1837, End: 1842, Score: 0.9997360110282898
Entity: U-LOC, Text: Japan, Start: 1898, End: 1903, Score: 0.9996516704559326
Skipping annotation, {'entity': 'U-STAFF', 'score': 0.532796, 'index': 510, 'word': 'Tok', 'start': 1957, 'end': 1960} is overlapping or can't be aligned for doc 'teacher: Hi Emma Johnson, hope I didn't get y

student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
student: Don't worry, my exam is next ...'
