In [9]:
import pandas as pd

def read_file(filepath: str):
    """Load a records-oriented JSON file into a pandas DataFrame.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The DataFrame built by ``pd.read_json`` with ``orient="records"``.
    """
    frame = pd.read_json(filepath, orient="records")
    return frame

# Load the dataset once; later cells read it through the module-level `df`.
df = read_file("data/obfuscated_data_06.json")

In [10]:
# Quick sanity check of the loaded frame: its dimensions, its column names,
# and the token list of one sample row (positional index 80).
print(df.shape)
print(list(df.columns))
print(df.iloc[80].tokens)

(22688, 5)
['full_text', 'document', 'tokens', 'trailing_whitespace', 'labels']
['1', 'Karol', 'Ferreira', '\n\n', 'REFLECTION', '-', 'VISUALIZATION', '\n\n', 'Challenge', '\n\n', 'Working', 'in', 'the', 'private', 'sector', 'one', 'of', 'the', 'biggest', 'challenges', 'is', 'to', 'sell', 'the', 'idea', 'to', ' ', 'potential', 'clients', 'or', 'financier', 'and', 'it', 'is', 'even', 'bigger', 'challenge', 'when', 'the', 'concept', 'is', 'new', ' ', 'in', 'the', 'country', 'and', 'there', 'is', 'no', 'prototypes', '.', '\n\n', 'Having', 'been', 'trained', 'in', 'Tourism', 'and', 'working', 'in', 'an', 'IT', 'company', ',', 'I', 'had', 'contact', 'with', ' ', 'those', 'two', 'sectors', ',', 'which', 'allowed', 'me', 'to', 'realize', 'that', 'despite', 'the', 'high', 'tourist', 'demand', ',', ' ', 'small', 'tourism', ' ', 'operators', 'in', 'Cape', 'Verde', 'had', 'great', 'difficulties', 'in', 'selling', 'their', 'services', ' ', 'because', 'most', 'were', 'not', 'online', '.', 'To', 'fi

### Presidio Implementation Here

In [4]:
# !pip3 install presidio_analyzer
# !pip3 install presidio_anonymizer
# !pip3 install names
# !pip3 install names transformers

In [38]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider, TransformersNlpEngine, NerModelConfiguration
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
import names
import random
from random import randrange
from datetime import timedelta, datetime
import spacy
import json
import re
import transformers
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Create configuration containing engine name and models
def get_configuration(spaCy_model: str):
    """Build the NLP-engine configuration dict for a spaCy-only setup.

    Args:
        spaCy_model: Name of the spaCy model to register for English ("en").

    Returns:
        A dict with the engine name ("spacy") and a one-element model list.
    """
    english_model = {"lang_code": "en", "model_name": spaCy_model}
    return {
        "nlp_engine_name": "spacy",
        "models": [english_model],
    }

def get_conf_file(spaCy_model: str, transformer_model: str = None):
    """Return the NLP-engine YAML config path for a spaCy + transformer pair.

    The model combination is validated *before* anything is downloaded, so an
    unsupported pair fails immediately instead of after fetching a model that
    has no config file.

    Args:
        spaCy_model: Name of the spaCy model (e.g. "en_core_web_lg").
        transformer_model: Hugging Face repo id of the transformer model.

    Returns:
        Relative path of the YAML configuration file for the combination.

    Raises:
        ValueError: If ``transformer_model`` is None (nothing to download).
        KeyError: If no config file is registered for the combination.
    """
    if transformer_model is None:
        raise ValueError("A transformer model is required to select a config file.")

    config_dict = {
        "en_core_web_lg + obi/deid_roberta_i2b2": "Config/lg+roberta.yaml",
        "en_core_web_lg + StanfordAIMI/stanford-deidentifier-base": "Config/lg+stanford.yaml",
        "en_core_web_trf + obi/deid_roberta_i2b2": "Config/trf+roberta.yaml",
        "en_core_web_trf + StanfordAIMI/stanford-deidentifier-base": "Config/trf+stanford.yaml",
    }

    # Resolve the config file first: this lookup is cheap, the downloads are not.
    combination = spaCy_model + ' + ' + transformer_model
    if combination not in config_dict:
        raise KeyError(f"No config file registered for model combination: {combination}")
    conf_file = config_dict[combination]

    snapshot_download(repo_id=transformer_model)
    # Instantiate to make sure it's downloaded during installation and not runtime
    AutoTokenizer.from_pretrained(transformer_model)
    AutoModelForTokenClassification.from_pretrained(transformer_model)

    return conf_file

# Function to create NLP engine based on configuration
def create_nlp_engine(spaCy_model: str, transformer_model: str = None):
    """Create an NLP engine from a spaCy model, optionally with a transformer.

    Args:
        spaCy_model: "en_core_web_lg" or "en_core_web_trf".
        transformer_model: None for a spaCy-only engine, otherwise
            "obi/deid_roberta_i2b2" or "StanfordAIMI/stanford-deidentifier-base".

    Returns:
        The engine produced by ``NlpEngineProvider.create_engine()``.

    Raises:
        ValueError: If either model name is not in the supported set.
    """
    supported_spacy = ("en_core_web_lg", "en_core_web_trf")
    supported_transformers = (
        "obi/deid_roberta_i2b2",
        "StanfordAIMI/stanford-deidentifier-base",
    )

    if spaCy_model not in supported_spacy:
        raise ValueError("Input spaCy model is not supported.")

    wants_transformer = transformer_model is not None
    if wants_transformer and transformer_model not in supported_transformers:
        print(transformer_model)
        raise ValueError("Input transformer model is not supported.")

    if wants_transformer:
        # spaCy + transformer: the provider is driven by a YAML config file.
        provider = NlpEngineProvider(conf_file=get_conf_file(spaCy_model, transformer_model))
    else:
        # spaCy only: the provider is driven by an in-memory configuration dict.
        provider = NlpEngineProvider(nlp_configuration=get_configuration(spaCy_model))

    return provider.create_engine()

# Using only spaCy model.
# Built once at module level: de_identify_pii reads this global when it
# constructs its AnalyzerEngine.
nlp_engine_spacy_only = create_nlp_engine(spaCy_model = "en_core_web_trf")

# Using spaCy model with an additional transformer model
# nlp_engine_with_transformer = create_nlp_engine(spaCy_model = "en_core_web_lg",
#                                                 transformer_model = "StanfordAIMI/stanford-deidentifier-base")

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
# analyzer = AnalyzerEngine(
#     # nlp_engine_spacy_only or nlp_engine_with_transformer
#     nlp_engine = nlp_engine_spacy_only,
#     # nlp_engine = nlp_engine_with_transformer,
#     supported_languages=["en"]
# )

In [77]:
# Change tutor's and student's names to different fake names.
# !pip3 install faker
from faker import Faker

# Create an allow list to exclude words from being identified as PII
# allow_list = [
#     "Today",
#     "today",
#     "Yesterday",
#     "yesterday",
#     "Tomorrow",
#     "tomorrow"
# ]
    
def de_identify_pii(text_transcript):
    """Detect PII in a text with Presidio and replace it with consistent fakes.

    PERSON, EMAIL_ADDRESS, URL and PHONE_NUMBER entities are detected. Within
    one call, identical original strings always map to the same fake value and
    distinct originals get independently generated fakes.

    Args:
        text_transcript: The text to analyze and anonymize.

    Returns:
        A ``(results_analyzed, results_anonymized)`` tuple: the analyzer
        results, and the value returned by ``AnonymizerEngine.anonymize``.
    """
    # Initialize the analyzer and anonymizer
    analyzer = AnalyzerEngine(
        nlp_engine = nlp_engine_spacy_only,
        supported_languages=["en"]
    )
    anonymizer = AnonymizerEngine()
    fake = Faker()

    # Every fake first name handed out in this call (persons and emails alike),
    # so two different originals never share a replacement name.
    used_names = set()

    def new_fake_name():
        """Return a fake first name not yet used in this call."""
        while True:
            candidate = names.get_first_name()
            if candidate not in used_names:
                used_names.add(candidate)
                return candidate

    # Email provider domains, loaded lazily on the first email and then reused.
    email_domains = []

    def new_fake_email():
        """Return a fake address built from a fresh fake name and a random domain."""
        if not email_domains:
            # Citation: https://gist.github.com/ammarshah/f5c2624d767f91a7cbdc4e54db8dd0bf#file-all_email_provider_domains-txt
            with open('data/all_email_provider_domains.txt', 'r') as file:
                # Skip blank lines so an address never ends in a bare "@".
                email_domains.extend(line.strip() for line in file if line.strip())
        return f"{new_fake_name().lower()}@{random.choice(email_domains)}"

    def new_fake_phone_number():
        """Return a fake phone number in the 555-XXX-XXXX form."""
        return f"555-{random.randint(100, 999)}-{random.randint(1000, 9999)}"

    def consistent_replacer(make_fake):
        """Wrap a generator so each distinct original gets exactly one fake."""
        mapping = {}

        def replace(original):
            # Generate on a miss instead of echoing the input: the anonymizer
            # may hand over a span that differs from the analyzer's span (for
            # example after resolving overlapping or adjacent results), and
            # returning it unchanged would leave the PII in the output.
            if original not in mapping:
                mapping[original] = make_fake()
            return mapping[original]

        return replace

    replacers = {
        "PERSON": consistent_replacer(new_fake_name),
        "EMAIL_ADDRESS": consistent_replacer(new_fake_email),
        "URL": consistent_replacer(fake.url),
        "PHONE_NUMBER": consistent_replacer(new_fake_phone_number),
    }

    # define entities that you want Presidio to detect
    entities = ["PERSON", "EMAIL_ADDRESS", "URL", "PHONE_NUMBER"]

    # Analyze the text to find PII
    results_analyzed = analyzer.analyze(text=text_transcript, language="en", entities=entities,
                                        score_threshold=None, return_decision_process=True)

    # NOTE(review): Presidio appears to validate a custom operator by calling
    # it with a probe string, so each callable must always return a str.
    operators = {
        entity_type: OperatorConfig("custom", {"lambda": replacer})
        for entity_type, replacer in replacers.items()
    }

    # Anonymize the text
    results_anonymized = anonymizer.anonymize(
        text=text_transcript,
        analyzer_results=results_analyzed,
        operators=operators
    )

    return results_analyzed, results_anonymized


In [5]:
# Random eyeballing some transcripts
# 15616, 13965, 20229, 4351, 19110 (ID_NUM), 21988 (ID_NUM), 4777 (PHONE_NUM), 609 (ID_NUM)
# Positional row index of the essay to inspect by hand.
input_idx = 22599
# print(df.iloc[input_idx].tokens[36])
# print(df.iloc[input_idx].labels[36])
# print("\n")
# Pull the raw essay text of that row and display it.
input_text = df.iloc[input_idx].full_text
print(input_text)

Tool: Visualization (Visual Thinking) – Module 1

Challenge & Selection:

The tool that I have used and selected for challenge is Visualization that I feel plays the major role in  the entire process of design thinking tools. I work in a creative domain where the bulk of my work  depends on the visual representation of any product or approach or business deal. We work on logos,  presentation, graphic designs, illustration etc., To outline the challenge, we’ve started getting more of  negative surveys ratings on the output which we have shared in the recent days. Then I called for a  stakeholder meeting and wanted to use the visual thinking approach to discuss regarding the  challenge and to come up with different solutions. To get those solutions, I use this approach to  make everyone to contribute and think outside box and come up with a new suggestions based on  the data which is available and apart from the data what are other human centric reasons for the dip  in the survey. To ela

In [23]:
# input_tokens = df.iloc[input_idx].tokens
# input_tokens

In [22]:
# input_labels = df.iloc[input_idx].labels
# print(input_labels[709])
# print(input_tokens[709])

In [24]:
# results_analyzed, results_anonymized = de_identify_pii(input_text)

In [21]:
# Print results_analyzed and results_anonymized
# results_analyzed is a list
# print(results_analyzed)
# for res in results_analyzed:
#     print(res)
# print("-----------------------------------------------------------------------------------------")
# print(results_anonymized)

In [25]:
# for res in results_analyzed:
#     print(f"PII: {input_text[res.start:res.end]} ---- start: {res.start} ---- end: {res.end} ---- type: {res.entity_type}")

In [83]:
# from typing import List, Tuple

# # Define the type alias for PII entity
# type pii_entity = Tuple[int, str, str, Tuple[int, int]]

# def remove_overlapping_entities(entities: List[pii_entity]) -> List[pii_entity]:
#     # Sort by essay index, start position, and length (descending)
#     entities = sorted(entities, key=lambda x: (x[0], x[3][0], -(x[3][1] - x[3][0])))
#     filtered_entities: List[pii_entity] = []
#     last_end = -1
#     last_index = -1

#     for entity in entities:
#         input_idx, entity_text, entity_type, (start, end) = entity
        
#         # Remove entities that overlap with previously accepted entities in the same essay
#         if input_idx != last_index or start >= last_end:
#             filtered_entities.append(entity)
#             last_end = end
#             last_index = input_idx
#         else:
#             # Check if the overlapping entity is of higher priority
#             if filtered_entities and start < filtered_entities[-1][3][1]:
#                 if end - start > filtered_entities[-1][3][1] - filtered_entities[-1][3][0]:
#                     filtered_entities[-1] = entity
#                 elif end - start == filtered_entities[-1][3][1] - filtered_entities[-1][3][0]:
#                     if entity_type == "EMAIL_ADDRESS" and filtered_entities[-1][2] == "URL":
#                         filtered_entities[-1] = entity

#     return filtered_entities

# # Function to highlight text
# # def highlight_text(text, locations):
# #     highlighted_text = ""
# #     last_end = 0
    
# #     # for start, end, _ in sorted(locations, key=lambda x: x[0]):
# #     for start, end, _ in locations:
# #         highlighted_text += text[last_end:start]
# #         highlighted_text += f'\x1b[6;30;42m{text[start:end]}\x1b[0m'
# #         last_end = end
    
# #     highlighted_text += text[last_end:]
# #     return highlighted_text

# # Extract positions and types from Presidio results
# # def extract_positions(results):
# #     positions = []
# #     for res in results:
# #         # positions.append((res.start, res.end, res.entity_type))
# #         positions.append((input_idx, input_text[res.start:res.end], res.entity_type, (res.start, res.end)))
# #     return positions


In [20]:
# # Extract positions for highlighting
# positions_analyzed = extract_positions(results_analyzed)
# positions_anonymized = extract_positions(results_anonymized.items)

# print("positions_analyzed (before removing overlaps):")
# for pos in positions_analyzed:
#     # print(f"PII: {input_text[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")
#     print(pos)
# print(len(positions_analyzed))
# # Remove overlapping entities from the positions
# positions_analyzed = remove_overlapping_entities(positions_analyzed)
# positions_anonymized = remove_overlapping_entities(positions_anonymized)

# print("-----------------------------------------------------------------------------------------")
# print("positions_analyzed (after removing overlaps) and sorted:")
# for pos in positions_analyzed:
#     # print(f"PII: {input_text[pos[0]:pos[1]]} ---- start: {pos[0]} ---- end: {pos[1]} ---- type: {pos[2]}")
#     print(pos)
# print(len(positions_analyzed))

In [19]:
# Do NOT run this code chunk!
from typing import List, Tuple

# Define the type alias for PII entity
type pii_entity = Tuple[int, str, str, Tuple[int, int]]

# Function to analyze text with Presidio and return PII entities
def analyze_texts_with_presidio(df: pd.DataFrame) -> List[pii_entity]:
    """Run de_identify_pii on every row's ``full_text`` and collect detections.

    Args:
        df: Frame whose rows expose a ``full_text`` attribute.

    Returns:
        One ``(row label, matched text, entity type, (start, end))`` tuple per
        analyzer result, in row order.
    """
    collected: List[pii_entity] = []

    for row_label, record in df.iterrows():
        print(f"Processing Row {row_label} ----------")
        essay = record.full_text
        detections, _anonymized = de_identify_pii(essay)
        # hit.end is used directly as the slice end (exclusive).
        collected.extend(
            (row_label, essay[hit.start:hit.end], hit.entity_type, (hit.start, hit.end))
            for hit in detections
        )

    return collected

# Function to append PII entities to an existing file
def append_entities_to_file(entities: List[pii_entity], file_path: str, indent: int = 4):
    """Append each entity's text representation to a file, one per line.

    Args:
        entities: Entities to write; each is formatted with an f-string.
        file_path: File opened in append mode.
        indent: Number of spaces written before every entity.
    """
    with open(file_path, 'a') as sink:
        sink.writelines(' ' * indent + f"{item}\n" for item in entities)

# Analyze the next N rows
# NOTE: the slice df[len(df):len(df)] is empty, so as written this call
# processes no rows; change the slice bounds to choose the batch to analyze.
pii_entities_detected = analyze_texts_with_presidio(df[len(df):len(df)])

# Append the new detected PII entities to the existing file
output_file = "output/pii_detected_trf.txt"
append_entities_to_file(pii_entities_detected, output_file)

print(f"Appended detected PII entities to {output_file}")


Processing Row 12000 ----------
Processing Row 12001 ----------
Processing Row 12002 ----------
Processing Row 12003 ----------
Processing Row 12004 ----------
Processing Row 12005 ----------
Processing Row 12006 ----------
Processing Row 12007 ----------
Processing Row 12008 ----------
Processing Row 12009 ----------
Processing Row 12010 ----------
Processing Row 12011 ----------
Processing Row 12012 ----------
Processing Row 12013 ----------
Processing Row 12014 ----------
Processing Row 12015 ----------
Processing Row 12016 ----------
Processing Row 12017 ----------
Processing Row 12018 ----------
Processing Row 12019 ----------
Processing Row 12020 ----------
Processing Row 12021 ----------
Processing Row 12022 ----------
Processing Row 12023 ----------
Processing Row 12024 ----------
Processing Row 12025 ----------
Processing Row 12026 ----------
Processing Row 12027 ----------
Processing Row 12028 ----------
Processing Row 12029 ----------
Processing Row 12030 ----------
Processi

In [81]:
import ast

def get_url_indices(file_path: str) -> list:
    """Return the sorted, de-duplicated row indices that have a URL entity.

    Each non-blank line of the file must be a Python tuple literal whose first
    element is the row index and whose third element is the entity category.
    Any further elements (span, optional score) are ignored, so both the plain
    and the scored entity files can be read.

    Args:
        file_path: Path of the entity file to scan.

    Returns:
        Sorted list of unique row indices with at least one 'URL' entity.
    """
    url_indices = set()  # a set keeps each row index only once

    with open(file_path, 'r') as file:
        for line in file:
            record = line.strip()
            if not record:
                # ast.literal_eval raises SyntaxError on an empty string.
                continue
            idx, _entity_text, category, *_rest = ast.literal_eval(record)

            if category == 'URL':
                url_indices.add(idx)

    return sorted(url_indices)

# Example usage
# `url_indices` is kept at module level: a later cell re-analyzes exactly
# these rows via df.iloc[url_indices].
file_path = 'output/pii_detected_trf.txt'
url_indices = get_url_indices(file_path)
print(url_indices)
print(len(url_indices))


[5, 9, 27, 28, 38, 80, 92, 123, 126, 132, 136, 154, 171, 194, 200, 206, 209, 226, 242, 243, 263, 277, 317, 330, 343, 371, 379, 384, 407, 429, 432, 438, 466, 472, 480, 496, 499, 514, 538, 540, 568, 576, 589, 593, 630, 646, 659, 660, 671, 674, 681, 704, 721, 735, 738, 752, 761, 769, 770, 773, 775, 801, 805, 820, 822, 836, 858, 860, 877, 891, 894, 895, 897, 937, 942, 962, 976, 1002, 1003, 1006, 1007, 1020, 1027, 1043, 1049, 1057, 1058, 1059, 1066, 1138, 1159, 1162, 1193, 1199, 1208, 1211, 1226, 1237, 1243, 1296, 1307, 1309, 1312, 1324, 1356, 1361, 1367, 1376, 1385, 1390, 1399, 1408, 1430, 1432, 1440, 1447, 1449, 1478, 1489, 1495, 1512, 1524, 1540, 1550, 1583, 1587, 1590, 1592, 1596, 1614, 1640, 1659, 1663, 1694, 1702, 1717, 1732, 1742, 1760, 1762, 1779, 1792, 1798, 1799, 1813, 1833, 1835, 1851, 1852, 1871, 1876, 1882, 1896, 1897, 1899, 1900, 1912, 1917, 1934, 1943, 1951, 1952, 1955, 1962, 1982, 1991, 2013, 2032, 2063, 2066, 2072, 2077, 2080, 2102, 2113, 2129, 2131, 2144, 2148, 2164, 2209,

In [82]:
# Do NOT run this code chunk!
from typing import List, Tuple

# Define the type alias for PII entity
type pii_entity = Tuple[int, str, str, Tuple[int, int]]

# Function to analyze text with Presidio and return PII entities
def analyze_texts_with_presidio(df: pd.DataFrame) -> Tuple[
    List[Tuple[int, str, str, Tuple[int, int]]],
    List[Tuple[int, str, str, Tuple[int, int], float]],
]:
    """Run de_identify_pii on every row and collect detections with scores.

    Args:
        df: Frame whose rows expose a ``full_text`` attribute.

    Returns:
        A pair of parallel lists, in row order:
        * ``(row label, matched text, entity type, (start, end))`` tuples;
        * the same tuples with the analyzer score appended as a fifth element.
    """
    pii_entities = []
    pii_entities_w_score = []

    for i, row in df.iterrows():
        print(f"Processing Row {i} ----------")
        text = row.full_text
        # Only the analyzer results are needed here; the anonymized text is unused.
        results_analyzed, _results_anonymized = de_identify_pii(text)

        for result in results_analyzed:
            # result.end is used directly as the slice end (exclusive).
            entity = (i, text[result.start:result.end], result.entity_type, (result.start, result.end))
            pii_entities.append(entity)
            pii_entities_w_score.append(entity + (result.score,))

    return pii_entities, pii_entities_w_score

# Function to write PII entities to a file (overwrites by default)
def append_entities_to_file(entities: List[tuple], file_path: str, indent: int = 4, mode: str = 'w'):
    """Write each entity's text representation to a file, one per line.

    Despite its name, this definition replaces the file's previous content by
    default, because the file is opened in 'w' mode. Pass ``mode='a'`` to
    append instead.

    Args:
        entities: Entity tuples (with or without a trailing score).
        file_path: Destination file.
        indent: Number of spaces written before every entity.
        mode: Mode passed to ``open``; 'w' overwrites, 'a' appends.
    """
    with open(file_path, mode) as f:
        f.writelines(' ' * indent + f"{entity}\n" for entity in entities)

# Re-analyze only the rows previously flagged as containing URL entities,
# this time keeping the analyzer score of every detection.
# NOTE(review): url_indices holds the row labels recorded by the earlier run,
# while df.iloc treats them as positions — equivalent only for a default
# RangeIndex; confirm before using on a re-indexed frame.
pii_entities_detected, pii_entities_detected_w_score = analyze_texts_with_presidio(df.iloc[url_indices])

# Write the scored entities; the file is opened in 'w' mode by
# append_entities_to_file in this cell, so previous content is replaced.
output_file = "output/pii_detected_try.txt"
# append_entities_to_file(pii_entities_detected, output_file)
scored_output_file = "others/pii_detected_url_w_score.txt"
append_entities_to_file(pii_entities_detected_w_score, scored_output_file)

# Report the file that was actually written.
print(f"Wrote detected PII entities to {scored_output_file}")


Processing Row 5 ----------


  with torch.cuda.amp.autocast(self._mixed_precision):


Processing Row 9 ----------
Processing Row 27 ----------
Processing Row 28 ----------
Processing Row 38 ----------
Processing Row 80 ----------
Processing Row 92 ----------
Processing Row 123 ----------
Processing Row 126 ----------
Processing Row 132 ----------
Processing Row 136 ----------
Processing Row 154 ----------
Processing Row 171 ----------
Processing Row 194 ----------
Processing Row 200 ----------
Processing Row 206 ----------
Processing Row 209 ----------
Processing Row 226 ----------
Processing Row 242 ----------
Processing Row 243 ----------
Processing Row 263 ----------
Processing Row 277 ----------
Processing Row 317 ----------
Processing Row 330 ----------
Processing Row 343 ----------
Processing Row 371 ----------
Processing Row 379 ----------
Processing Row 384 ----------
Processing Row 407 ----------
Processing Row 429 ----------
Processing Row 432 ----------
Processing Row 438 ----------
Processing Row 466 ----------
Processing Row 472 ----------
Processing Row 48

In [83]:
# Do NOT run this code chunk!
import ast

# Copy only the 'URL' entities from the scored detections into a new file.
# The handles get their own names so the module-level `output_file` path
# string used by other cells is not rebound to a file object.
with open('others/pii_detected_url_w_score.txt', 'r') as scored_in, open('others/pii_detected_url_w_score_filtered.txt', 'w') as filtered_out:
    for line in scored_in:
        record = line.strip()
        if not record:
            # ast.literal_eval raises SyntaxError on an empty string.
            continue
        # Convert the line to a Python object
        entity = ast.literal_eval(record)
        # Keep the original line (indent and newline included) for URL entities
        if entity[2] == 'URL':
            filtered_out.write(line)

print("Filtered 'URL' entities saved to others/pii_detected_url_w_score_filtered.txt.")


Filtered 'URL' entities saved to others/pii_detected_url_w_score_filtered.txt.


In [84]:
# Do NOT run this code chunk!
import ast

def remove_low_score_urls(detected_file, score_filtered_file, output_file, score_threshold=0.6):
    """Copy the detected-entity file, dropping URL entities with a low score.

    Args:
        detected_file: File of ``(idx, text, category, (start, end))`` tuple
            literals, one per line.
        score_filtered_file: File of the same tuples with a trailing score.
        output_file: Destination; receives every line of ``detected_file``
            whose entity is not a low-score URL. Blank lines are dropped.
        score_threshold: URL entities scoring strictly below this are removed.
    """
    # Collect the low-score URLs in a set so each membership test is O(1).
    low_score_urls = set()
    with open(score_filtered_file, 'r') as file:
        for line in file:
            record = line.strip()
            if not record:
                # ast.literal_eval raises SyntaxError on an empty string.
                continue
            idx, entity_text, category, (start, end), score = ast.literal_eval(record)
            if category == 'URL' and score < score_threshold:
                low_score_urls.add((idx, entity_text, category, (start, end)))

    # Read everything before writing, so output_file may equal detected_file.
    kept_lines = []
    with open(detected_file, 'r') as file:
        for line in file:
            record = line.strip()
            if not record:
                continue
            if ast.literal_eval(record) not in low_score_urls:
                kept_lines.append(line)

    # Save the surviving lines unchanged (indent and newline included).
    with open(output_file, 'w') as file:
        file.writelines(kept_lines)

# Example usage
# Reads the full detection file plus the URL-only scored file and writes a
# new filtered file; the input detection file itself is left untouched.
detected_file = 'output/pii_detected_trf.txt'
score_filtered_file = 'others/pii_detected_url_w_score_filtered.txt'
output_file = 'output/pii_detected_trf_filtered.txt'

remove_low_score_urls(detected_file, score_filtered_file, output_file)
