In [92]:
import pandas as pd

def read_file(filepath: str):
    """Load a records-oriented JSON file into a pandas DataFrame.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The DataFrame produced by ``pd.read_json`` with ``orient="records"``.
    """
    records_frame = pd.read_json(filepath, orient="records")
    return records_frame

# Load the dataset into `df`, the DataFrame the later cells read from.
df = read_file("obfuscated_data_06.json")

In [93]:
# Sanity check after loading: dataset dimensions and column names.
print(df.shape)
print(list(df.columns))
# print(df.iloc[80].full_text)

(22688, 5)
['full_text', 'document', 'tokens', 'trailing_whitespace', 'labels']


### Presidio Implementation Here

In [4]:
# !pip3 install presidio_analyzer
# !pip3 install presidio_anonymizer
# !pip3 install names
# !pip3 install names transformers

In [94]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy
import json
import re
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Create configuration containing engine name and models
def get_configuration(spaCy_model: str):
    """Build the spaCy-only NLP engine configuration dictionary.

    Args:
        spaCy_model: Name of the spaCy model to register for English ("en").

    Returns:
        A dict with the engine name "spacy" and two model entries: the fixed
        Spanish model ``es_core_news_md`` followed by the given English model.
    """
    spanish_entry = {"lang_code": "es", "model_name": "es_core_news_md"}
    english_entry = {"lang_code": "en", "model_name": spaCy_model}
    return {
        "nlp_engine_name": "spacy",
        "models": [spanish_entry, english_entry],
    }

def get_conf_file(spaCy_model: str, transformer_model: str = None):
    """Return the YAML config path for a spaCy + transformer model pairing.

    The pairing is resolved to its config file first, so an unsupported
    combination fails immediately instead of after the model download.
    The transformer model is then fetched and instantiated once so that it is
    downloaded up front rather than at analysis time.

    Args:
        spaCy_model: Name of the spaCy model (e.g. "en_core_web_lg").
        transformer_model: Hugging Face repo id of the transformer model.
            Required, despite the ``None`` default kept for signature
            compatibility.

    Returns:
        Path of the configuration file registered for the pairing.

    Raises:
        ValueError: If ``transformer_model`` is ``None``.
        KeyError: If no configuration file is registered for the pairing.
    """
    if transformer_model is None:
        raise ValueError(
            "A transformer model name is required to select a configuration file."
        )

    config_dict = {
        "en_core_web_lg + obi/deid_roberta_i2b2": "Config/lg+roberta.yaml",
        "en_core_web_lg + StanfordAIMI/stanford-deidentifier-base": "Config/lg+stanford.yaml",
        "en_core_web_trf + obi/deid_roberta_i2b2": "Config/trf+roberta.yaml",
        "en_core_web_trf + StanfordAIMI/stanford-deidentifier-base": "Config/trf+stanford.yaml",
    }

    # Resolve the config before any network access; raises KeyError if unsupported.
    conf_file = config_dict[spaCy_model + ' + ' + transformer_model]

    snapshot_download(repo_id=transformer_model)
    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformer_model)
    AutoModelForTokenClassification.from_pretrained(transformer_model)

    return conf_file

# Function to create NLP engine based on configuration
def create_nlp_engine(spaCy_model: str, transformer_model: str = None):
    """Create an NLP engine from a spaCy model, optionally paired with a transformer.

    Args:
        spaCy_model: Either "en_core_web_lg" or "en_core_web_trf".
        transformer_model: ``None`` for a spaCy-only engine, otherwise one of
            "obi/deid_roberta_i2b2" or "StanfordAIMI/stanford-deidentifier-base".

    Returns:
        The engine returned by ``NlpEngineProvider.create_engine()``.

    Raises:
        ValueError: If either model name is not in the supported set.
    """
    supported_spacy = ("en_core_web_lg", "en_core_web_trf")
    supported_transformers = (
        "obi/deid_roberta_i2b2",
        "StanfordAIMI/stanford-deidentifier-base",
    )

    if spaCy_model not in supported_spacy:
        raise ValueError("Input spaCy model is not supported.")

    use_transformer = transformer_model is not None
    if use_transformer and transformer_model not in supported_transformers:
        # Echo the rejected name before failing.
        print(transformer_model)
        raise ValueError("Input transformer model is not supported.")

    if use_transformer:
        # spaCy model with transformer: configured through a YAML file
        provider = NlpEngineProvider(
            conf_file=get_conf_file(spaCy_model, transformer_model)
        )
    else:
        # spaCy model only: configured through an in-memory dict
        provider = NlpEngineProvider(
            nlp_configuration=get_configuration(spaCy_model)
        )

    return provider.create_engine()

# Build the NLP engine from the spaCy model only (no transformer)
nlp_engine_spacy_only = create_nlp_engine(spaCy_model = "en_core_web_lg")

# Using spaCy model with an additional transformer model
# nlp_engine_with_transformer = create_nlp_engine(spaCy_model = "en_core_web_lg",
#                                                 transformer_model = "StanfordAIMI/stanford-deidentifier-base")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine.
# NOTE(review): in the cells visible here, de_identify_pii() is never called with
# this instance, so this configured `analyzer` does not appear to be the engine
# used for detection — confirm which engine is intended.
analyzer = AnalyzerEngine(
    # nlp_engine_spacy_only or nlp_engine_with_transformer
    nlp_engine = nlp_engine_spacy_only,
    # nlp_engine = nlp_engine_with_transformer,
    supported_languages=["en", "es"]
)

In [96]:
# Change tutor's and student's names to different fake names.
# !pip3 install faker
from faker import Faker

# Create an allow list to exclude words from being identified as PII
# allow_list = [
#     "Today",
#     "today",
#     "Yesterday",
#     "yesterday",
#     "Tomorrow",
#     "tomorrow"
# ]
    
def _default_presidio_engines():
    """Return a cached ``(AnalyzerEngine, AnonymizerEngine)`` pair built with default settings.

    The pair is created on first use and stored on the function object, so
    repeated calls to ``de_identify_pii`` reuse it instead of constructing new
    engines for every transcript.
    """
    cache = _default_presidio_engines.__dict__
    if "engines" not in cache:
        cache["engines"] = (AnalyzerEngine(), AnonymizerEngine())
    return cache["engines"]


def de_identify_pii(text_transcript, analyzer=None, anonymizer=None):
    """Detect PII in a transcript and replace it with consistent fake values.

    Within one call, the same original value always maps to the same fake
    value, and distinct original values map to separately generated ones.

    Args:
        text_transcript: The text to analyze and anonymize.
        analyzer: Optional ``AnalyzerEngine`` used for detection (for example
            one built with a custom NLP engine). When omitted, a shared engine
            created with default settings is used.
        anonymizer: Optional ``AnonymizerEngine``. When omitted, a shared
            engine created with default settings is used.

    Returns:
        A tuple ``(results_analyzed, results_anonymized)``: the value returned
        by ``analyzer.analyze`` and the value returned by
        ``anonymizer.anonymize``.
    """
    # Reuse engines across calls; only fall back to the shared defaults when
    # the caller did not supply its own.
    if analyzer is None or anonymizer is None:
        shared_analyzer, shared_anonymizer = _default_presidio_engines()
        if analyzer is None:
            analyzer = shared_analyzer
        if anonymizer is None:
            anonymizer = shared_anonymizer

    # Define date range for generating random dates and generate a random date
    d1 = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
    d2 = datetime.strptime('1/1/2009 4:50 AM', '%m/%d/%Y %I:%M %p')
    random_date = (d1 + timedelta(days=random.randint(0, (d2 - d1).days))).strftime('%m/%d/%Y')

    fake = Faker()
    existing_names = set()

    def generate_fake_name():
        """Return a first name not yet handed out during this call."""
        while True:
            fake_name = names.get_first_name()
            if fake_name not in existing_names:
                existing_names.add(fake_name)
                return fake_name

    def generate_fake_email():
        """Return a fake email address built from a fresh fake first name."""
        domains = ["gmail.com", "sina.com", "outlook.com"]
        return f"{generate_fake_name().lower()}@{random.choice(domains)}"

    def generate_fake_location():
        """Return a fake city name generated by Faker."""
        return fake.city()

    def generate_fake_phone_number():
        """Return a fake phone number in 555-XXX-XXXX form."""
        return f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

    # One generator and one original -> fake mapping per custom-handled entity type.
    generators = {
        "PERSON": generate_fake_name,
        "EMAIL_ADDRESS": generate_fake_email,
        "LOCATION": generate_fake_location,
        "PHONE_NUMBER": generate_fake_phone_number,
    }
    mappings = {entity_type: {} for entity_type in generators}

    def replacement_for(entity_type, original):
        """Return the fake value for ``original``, generating it on first sight."""
        mapping = mappings[entity_type]
        if original not in mapping:
            mapping[original] = generators[entity_type]()
        return mapping[original]

    # define entities that you want Presidio to detect
    entities = ["PERSON", "EMAIL_ADDRESS", "URL", "PHONE_NUMBER", "LOCATION"] # Age, gender ? DATE_TIME deleted

    # Analyze the text to find PII
    results_analyzed = analyzer.analyze(text=text_transcript, entities=entities, language="en",
                                        return_decision_process=True, allow_list=None)

    # Pre-populate the mappings in detection order so every detected span has a
    # replacement before anonymization starts.
    for result in results_analyzed:
        if result.entity_type in generators:
            replacement_for(result.entity_type, text_transcript[result.start:result.end])

    def custom_operator(entity_type):
        """Build a custom operator that swaps a span for its mapped fake value."""
        # The anonymizer may pass a span that differs from the analyzer's span
        # (e.g. after it resolves overlapping results). Generating a replacement
        # on a miss avoids echoing the original text back into the output.
        # Presidio may also invoke the callable with a probe string while
        # validating the operator; that only adds a harmless extra mapping entry.
        return OperatorConfig(
            "custom", {"lambda": lambda text: replacement_for(entity_type, text)}
        )

    operators = {
        "PERSON": custom_operator("PERSON"),
        # DATE_TIME is not in `entities` above, so this entry is currently inactive.
        "DATE_TIME": OperatorConfig("replace", {"new_value": random_date}),
        "EMAIL_ADDRESS": custom_operator("EMAIL_ADDRESS"),
        "LOCATION": custom_operator("LOCATION"),
        "PHONE_NUMBER": custom_operator("PHONE_NUMBER"),
        # URL has no entry here, so it falls back to the anonymizer's default handling.
    }

    # Anonymize the text
    results_anonymized = anonymizer.anonymize(
        text=text_transcript,
        analyzer_results=results_analyzed,
        operators=operators
    )

    return results_analyzed, results_anonymized


In [72]:
# Random eyeballing some transcripts
# 15616, 13965, 20229, 4351, 19110 (ID_NUM), 21988 (ID_NUM)
# input_idx = 21988
# input_text = df.iloc[input_idx].full_text

In [73]:
# results_analyzed, results_anonymized = de_identify_pii(input_text)

In [None]:
# Print results_analyzed and results_anonymized
# results_analyzed is a list
# print(results_analyzed)
# for res in results_analyzed:
#     print(res)
# print("-----------------------------------------------------------------------------------------")
# print(results_anonymized)

In [None]:
# for res in results_analyzed:
#     print(f"PII: {input_text[res.start:res.end]} ---- start: {res.start} ---- end: {res.end} ---- type: {res.entity_type}")

In [83]:
# from typing import List, Tuple

# # Define the type alias for PII entity
# type pii_entity = Tuple[int, str, str, Tuple[int, int]]

# def remove_overlapping_entities(entities: List[pii_entity]) -> List[pii_entity]:
#     # Sort by essay index, start position, and length (descending)
#     entities = sorted(entities, key=lambda x: (x[0], x[3][0], -(x[3][1] - x[3][0])))
#     filtered_entities: List[pii_entity] = []
#     last_end = -1
#     last_index = -1

#     for entity in entities:
#         input_idx, entity_text, entity_type, (start, end) = entity
        
#         # Remove entities that overlap with previously accepted entities in the same essay
#         if input_idx != last_index or start >= last_end:
#             filtered_entities.append(entity)
#             last_end = end
#             last_index = input_idx
#         else:
#             # Check if the overlapping entity is of higher priority
#             if filtered_entities and start < filtered_entities[-1][3][1]:
#                 if end - start > filtered_entities[-1][3][1] - filtered_entities[-1][3][0]:
#                     filtered_entities[-1] = entity
#                 elif end - start == filtered_entities[-1][3][1] - filtered_entities[-1][3][0]:
#                     if entity_type == "EMAIL_ADDRESS" and filtered_entities[-1][2] == "URL":
#                         filtered_entities[-1] = entity

#     return filtered_entities

# # Function to highlight text
# # def highlight_text(text, locations):
# #     highlighted_text = ""
# #     last_end = 0
    
# #     # for start, end, _ in sorted(locations, key=lambda x: x[0]):
# #     for start, end, _ in locations:
# #         highlighted_text += text[last_end:start]
# #         highlighted_text += f'\x1b[6;30;42m{text[start:end]}\x1b[0m'
# #         last_end = end
    
# #     highlighted_text += text[last_end:]
# #     return highlighted_text

# # Extract positions and types from Presidio results
# # def extract_positions(results):
# #     positions = []
# #     for res in results:
# #         # positions.append((res.start, res.end, res.entity_type))
# #         positions.append((input_idx, input_text[res.start:res.end], res.entity_type, (res.start, res.end)))
# #     return positions


In [84]:
# # Extract positions for highlighting
# positions_analyzed = extract_positions(results_analyzed)
# positions_anonymized = extract_positions(results_anonymized.items)

# print("positions_analyzed (before removing overlaps):")
# for pos in positions_analyzed:
#     # print(f"PII: {input_text[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")
#     print(pos)
# print(len(positions_analyzed))
# # Remove overlapping entities from the positions
# positions_analyzed = remove_overlapping_entities(positions_analyzed)
# positions_anonymized = remove_overlapping_entities(positions_anonymized)

# print("-----------------------------------------------------------------------------------------")
# print("positions_analyzed (after removing overlaps) and sorted:")
# for pos in positions_analyzed:
#     # print(f"PII: {input_text[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")
#     print(pos)
# print(len(positions_analyzed))

positions_analyzed (before removing overlaps):
(21988, 'Anisa Hussain - 8743214\n\nNon-Business Use\n\nVisualization\n\nChallenge & Selection', 'PERSON', (33, 112))
(21988, 'David Gray’s', 'PERSON', (1356, 1368))
(21988, '30 minutes', 'DATE_TIME', (2454, 2464))
(21988, 'Anisa Hussain - 8743214\n\nNon-Business Use\n\nInsight & Approach', 'PERSON', (3130, 3191))
4
-----------------------------------------------------------------------------------------
positions_analyzed (after removing overlaps) and sorted:
(21988, 'Anisa Hussain - 8743214\n\nNon-Business Use\n\nVisualization\n\nChallenge & Selection', 'PERSON', (33, 112))
(21988, 'David Gray’s', 'PERSON', (1356, 1368))
(21988, '30 minutes', 'DATE_TIME', (2454, 2464))
(21988, 'Anisa Hussain - 8743214\n\nNon-Business Use\n\nInsight & Approach', 'PERSON', (3130, 3191))
4


In [98]:
from typing import List, Tuple

# Type alias for one detected PII entity:
#   (row index, entity text, entity type, (start offset, end offset))
# NOTE: the `type` statement requires Python 3.12 or newer.
type pii_entity = Tuple[int, str, str, Tuple[int, int]]

# Function to analyze text with Presidio and return PII entities
def analyze_texts_with_presidio(df: pd.DataFrame) -> List[pii_entity]:
    """Run ``de_identify_pii`` on each row's ``full_text`` and collect the detections.

    Args:
        df: DataFrame with a ``full_text`` column.

    Returns:
        A flat list of ``(row index, entity text, entity type, (start, end))``
        tuples, in row order and then in the order the analyzer reported them.
    """
    collected: List[pii_entity] = []

    for row_idx, record in df.iterrows():
        print(f"Precessing Row {row_idx} ----------")
        essay = record["full_text"]
        # Only the analyzer results are used here; the anonymized text is discarded.
        detections, _ = de_identify_pii(essay)

        # Presidio's end index is exclusive, so a plain slice yields the entity text.
        collected.extend(
            (row_idx, essay[hit.start:hit.end], hit.entity_type, (hit.start, hit.end))
            for hit in detections
        )

    return collected

# Function to save PII entities to a file
def save_entities_to_file(entities: "List[pii_entity]", file_path: str, indent: int = 4):
    """Write each entity tuple on its own indented line of a UTF-8 text file.

    Args:
        entities: PII entity tuples to write; each is rendered with ``str()``.
        file_path: Destination path. An existing file is overwritten.
        indent: Number of spaces placed before each line.
    """
    indent_space = ' ' * indent
    # Entity text can contain non-ASCII characters (e.g. typographic quotes),
    # so fix the encoding instead of relying on the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{indent_space}{entity}\n" for entity in entities)

# Get PII entities for the first 500 rows of the dataframe only (df[:500])
pii_entities_detected = analyze_texts_with_presidio(df[:500])

# Save the detected PII entities to a text file, one entity tuple per line
output_file = "pii_entities_detected.txt"
save_entities_to_file(pii_entities_detected, output_file)

print(f"Detected PII entities saved to {output_file}")

Precessing Row 0 ----------
Precessing Row 1 ----------
Precessing Row 2 ----------
Precessing Row 3 ----------
Precessing Row 4 ----------
Precessing Row 5 ----------
Precessing Row 6 ----------
Precessing Row 7 ----------
Precessing Row 8 ----------
Precessing Row 9 ----------
Precessing Row 10 ----------
Precessing Row 11 ----------
Precessing Row 12 ----------
Precessing Row 13 ----------
Precessing Row 14 ----------
Precessing Row 15 ----------
Precessing Row 16 ----------
Precessing Row 17 ----------
Precessing Row 18 ----------
Precessing Row 19 ----------
Precessing Row 20 ----------
Precessing Row 21 ----------
Precessing Row 22 ----------
Precessing Row 23 ----------
Precessing Row 24 ----------
Precessing Row 25 ----------
Precessing Row 26 ----------
Precessing Row 27 ----------
Precessing Row 28 ----------
Precessing Row 29 ----------
Precessing Row 30 ----------
Precessing Row 31 ----------
Precessing Row 32 ----------
Precessing Row 33 ----------
Precessing Row 34 ------