#### Importing Libraries

In [61]:
# 1️⃣ Setup & Imports
import re
import random
import pandas as pd
import json

##### Creating empty dictionary

In [20]:
# The ICU data dictionary is assembled incrementally, one category and field at a time.

# 1️⃣ Begin with an empty mapping that later cells populate
icu_data_dict_granular = dict()

##### .2. Build Patterns

##### 1.2. Stress-Test

##### 1.3. Add Identifiers to Dict

## 1. Patient Identifiers

### 1.1. Names (Full Matches)

##### 1.1.2. Build Pattern

## 4. Clinical & Medical Data

### 4.1. Diagnósticos

### 4.2. História Clínica

### 4.3. Exame Físico

### 4.4. Impressão Evolutiva

##### 1.1.2. Build Pattern

In [21]:

# 2️⃣–4️⃣ Create the "Patient_Identifiers" category, its "Fields" sub-category and an
# empty "Complete_Name" field in one nested literal.
icu_data_dict_granular["Patient_Identifiers"] = {
    "Fields": {
        "Complete_Name": {},
    },
}


In [None]:
# Labels that may introduce a full name: "paciente" (which has no gender inflection),
# plain "nome", or the explicit "nome completo".
# Each pattern must be its own list element: a missing comma between two adjacent raw
# strings makes Python silently concatenate them into a single (wrong) pattern.
Field_Identifier = [
    r"\b(?:[Nn]ome)\b",
    r"[nN]ome\s{0,3}[cC]ompleto",
    r"\b[Nn]ome\s{0,3}(?:d[oa])?\s{0,3}[Pp]aciente\b",
    r"[Pp]aciente",
]

# Design notes for the full-name content pattern:
# 1. It targets names that are minimally complete (at least one first name and one
#    surname) and is meant to be used as a FULL match over the widest range of shapes.
# 2. The pattern was written largely as a regex exercise; a simpler approach to name
#    detection is probably preferable in production.
# 3. Brazilian given names are very creative (unlike Portugal, where the law restricts
#    names to an approved list), so many variations must be tolerated.
# 4. Portuguese possessive particles ("de", "do", "da", ...) are accepted, together with
#    the equivalent particles from Arabic, Dutch, Italian, German and Spanish, reflecting
#    the large immigration from those groups to Brazil.
# 5. Accented letters used in Portuguese (e.g. circumflex "^" and tilde "~") are accepted.
# 6. An optional hyphen may separate the parts of a compound name.
# 7. Case matters here: every name part must start with an uppercase letter, so the
#    pattern must not be compiled with re.IGNORECASE.
#
# NOTE: this must be a plain string. A trailing comma after the literal would turn it
# into a 1-tuple, which re.compile() rejects.
Content_Identifier = r"\b[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|de la|al|bin|ibn|el)?\s{0,3}[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|\bde la\b|al|bin|ibn|el)?\s{0,3}-?[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3})*\b"


#### 1.1.2. Stress-Test

In [23]:
# Building blocks for synthetic Portuguese full names
first_names = ["João", "Maria", "Carlos", "Fernanda", "Antônio", "Ênio", "Beatriz", "Luís", "Ana", "Paulo", "Júlia", "Eduardo"]
middle_names = ["José", "Clara", "Júnior", "Paulo", "Pedro", "Silva", "Jara", "Miguel", "Henrique", "Neves", "Vales"]
last_names = ["Santos", "Costa", "Souza", "Nascimento", "Pereira", "Oliveira", "Vales", "Ribeiro", "Mendes", "Rodrigues"]
particles = ["da", "de", "do", "dos", "das", "di", "von der","al", "ibn", "von"]


def generate_portuguese_names(n, seed=None):
    """Generate ``n`` synthetic full names with random structural variation.

    Each name is ``first [middle] [particle] last``; the middle name may be joined
    with a hyphen, and occasionally every "a", "o" or "e" in the name is replaced by
    an accented variant to stress the accent handling of the regex.

    Args:
        n: Number of names to generate.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)`` is used
            so the same names are produced on every run. When ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        A list of ``n`` generated name strings.
    """
    rng = random if seed is None else random.Random(seed)
    names = []
    for _ in range(n):
        first = rng.choice(first_names)
        middle = rng.choice(middle_names) if rng.random() > 0.5 else ""
        particle = rng.choice(particles) if rng.random() > 0.3 else ""
        last = rng.choice(last_names)
        hyphen = "-" if rng.random() > 0.8 else " "

        # Assemble the name; the hyphenated form is only used for some middle names
        full_name = first
        if middle:
            full_name += hyphen + middle if rng.random() > 0.7 else " " + middle
        if particle:
            full_name += " " + particle
        full_name += " " + last

        # Occasionally swap plain vowels for accented ones
        full_name = full_name.replace("a", "à") if rng.random() > 0.9 else full_name
        full_name = full_name.replace("o", "ô") if rng.random() > 0.9 else full_name
        full_name = full_name.replace("e", "ê") if rng.random() > 0.9 else full_name

        names.append(full_name)
    return names


# Fixed seed so that "Restart & Run All" reproduces the same test dataset
test_names = generate_portuguese_names(100, seed=42)

# Full-name pattern under test
name_regex_test = re.compile(
    r"\b[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|de la|al|bin|ibn|el)?\s{0,3}[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|\bde la\b|al|bin|ibn|el)?\s{0,3}-?[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3})*\b"
)

# Names the pattern matches in full
matches_test = [name for name in test_names if name_regex_test.fullmatch(name)]

# One row per generated name; a set keeps the membership test cheap
_matched_names = set(matches_test)
df_results_test = pd.DataFrame({
    "Generated Name": test_names,
    "Matched": ["✅" if name in _matched_names else "❌" for name in test_names],
})


In [24]:
# The names that failed are not "real names", so the identifier seems to have passed the stress test.
df_results_test

Unnamed: 0,Generated Name,Matched
0,Ênio Paulo Souza,✅
1,Luís al Nascimento,✅
2,Ênio Vales von Rodrigues,✅
3,Maria Ribeiro,✅
4,Fêrnanda Jara das Rodriguês,✅
...,...,...
95,Carlos Neves von Rodrigues,✅
96,Anà Pedro von der Ribeiro,✅
97,Carlôs Silva de Mendes,✅
98,Antônio al Rodrigues,✅


#### 1.1.3. Add identifiers to dict

In [None]:

# 5️⃣ Fill the "Complete_Name" field with its three sub-keys:
#    "Field_Identifier", "Content_Identifier" and "DLP_Strategy".

# Label patterns. Every pattern is a separate list element; without the comma after
# the "nome completo" pattern Python would merge it with the next raw string.
icu_data_dict_granular["Patient_Identifiers"]["Fields"]["Complete_Name"]["Field_Identifier"] = [
    r"\b(?:[Nn]ome)\b",
    r"[nN]ome\s{0,3}[cC]ompleto",
    r"\b[Nn]ome\s{0,3}(?:d[oa])?\s{0,3}[Pp]aciente\b",
    r"[Pp]aciente",
]

# Value pattern: capitalised name parts, optional particles and optional hyphens
icu_data_dict_granular["Patient_Identifiers"]["Fields"]["Complete_Name"]["Content_Identifier"] = r"\b[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|de la|al|bin|ibn|el)?\s{0,3}[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|\bde la\b|al|bin|ibn|el)?\s{0,3}-?[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3})*\b"

# Human-readable description of the detection strategy
icu_data_dict_granular["Patient_Identifiers"]["Fields"]["Complete_Name"]["DLP_Strategy"] = "Detects full names using regex-based entity recognition."

### 1.2. Name and Surname (Partial Matches)

##### 1.2. Build Patterns

In [None]:
# The label can be "Nome" / "Primeiro nome" (first name) or "Sobrenome" (surname).
# In Brazil there is no separate notion of middle names: they are all called
# "sobrenomes", hence the optional plural.
Field_Identifier = [
    r"\b(?:[Nn]ome)\b",
    r"\b[Pp]rimeiro\s{0,3}[nN]ome",
    r"\b[Ss]obrenomes?\b",
]

# A single given name or a single surname: optional leading particle, one capitalised
# word, optionally followed by a hyphen and a second capitalised word.
Content_Identifier = r"\b(?:da|de|do|dos|das|di|del|della|degli|dei|von|van|\bvon der\b|\bvan der\b|\bvan den\b|du|de la|al|bin|ibn|el)?\s{0,3}[A-ZÀ-Ÿ][a-zà-ÿ]+\s{0,3}-?(?:[A-ZÀ-Ÿ][a-zà-ÿ]+)?\b"


##### 1.2.1. Stress-Test

In [48]:
# Single names / surnames to probe; the last entry is a multi-part name
test_names = ["João", "Fernanda", "Ana-Clara", "Silva", "Mendes", "dos Santos", "Medeiros", "von Dorf","van Dorf", "João das Marias"]
name_regex_test = re.compile(Content_Identifier)

# Keep only the candidates that the pattern matches in full
matches_test = list(filter(name_regex_test.fullmatch, test_names))

# Tabulate every candidate next to a pass/fail marker
df_results_test = pd.DataFrame(
    {
        "Generated Name": test_names,
        "Matched": list(map(lambda name: "✅" if name in matches_test else "❌", test_names)),
    }
)
df_results_test

Unnamed: 0,Generated Name,Matched
0,João,✅
1,Fernanda,✅
2,Ana-Clara,✅
3,Silva,✅
4,Mendes,✅
5,dos Santos,✅
6,Medeiros,✅
7,von Dorf,✅
8,van Dorf,✅
9,João das Marias,❌


##### 1.3. Add Identifiers to Dict

In [59]:
# Store the single-name / single-surname patterns as the "Name_or_Surname" field
icu_data_dict_granular["Patient_Identifiers"]["Fields"].update(
    Name_or_Surname=dict(
        Field_Identifier=Field_Identifier,
        Content_Identifier=Content_Identifier,
        DLP_Strategy="Detects individual names or surnames when stored in separate fields.",
    )
)


##### 1.3. Build Patterns

In [54]:
# Label variations for "Número de Atendimento".
# - First letters accept both cases, so the capitalised form "Número de Atendimento"
#   matches as well as the all-lowercase one.
# - "#" sits outside the word-boundary assertions: "\b" next to "#" only succeeds when a
#   word character is adjacent, which would reject a free-standing "#" as in
#   "# atendimento".
Field_Identifier = [
    # qualifier first: "número de atendimento", "Nº atendimento", "# atendimento", ...
    r"(?:\b(?:[Nn][úu]mero|N\.?|Nº|I[Dd]|[Cc][óo]digo|[Ii]dentifica(?:ção|cao))|#)\s{0,3}(?:de)?\s{0,3}[Aa]tendimento\b",
    # qualifier last: "atendimento número", "atendimento Nº", "atendimento #", ...
    r"\b[Aa]tendimento\s{0,3}(?:(?:[Nn][úu]mero|Nº|I[Dd]|[Cc][óo]digo|[Ii]dentifica(?:ção|cao))\b|N\.?(?!\w)|#)",
]

# Patient ID is usually a 5 to 10-digit number
Content_Identifier = r"\b\d{5,10}\b"


##### 1.2. Stress-Test

In [56]:
# Candidate IDs: three numeric strings and one alphabetic string
test_patient_ids = ["12345", "9876543210", "00001", "ABCDE"]
id_regex_test = re.compile(Content_Identifier)

# Keep only the IDs the pattern matches in full
matches_test = list(filter(id_regex_test.fullmatch, test_patient_ids))

# Tabulate every candidate next to a pass/fail marker
df_results_test = pd.DataFrame(
    {
        "Generated ID": test_patient_ids,
        "Matched": list(map(lambda pid: "✅" if pid in matches_test else "❌", test_patient_ids)),
    }
)
df_results_test

Unnamed: 0,Generated ID,Matched
0,12345,✅
1,9876543210,✅
2,00001,✅
3,ABCDE,❌


##### 1.3. Add Identifiers to Dict

In [None]:
# Store the "Número de Atendimento" patterns under their own field key.
# Writing them to "Name_or_Surname" would overwrite the name/surname entry with
# numeric-ID patterns and leave the attendance number without an entry.
icu_data_dict_granular["Patient_Identifiers"]["Fields"]["Attendance_Number"] = {
    "Field_Identifier": Field_Identifier,
    "Content_Identifier": Content_Identifier,
    "DLP_Strategy": "Detects numerical patient identifiers."
}


##### .2. Build Patterns

In [None]:
# Labels for the medical record number ("Nº Prontuário" / "Número do Prontuário")
Field_Identifier = [
    r"\b(?:Nº Prontuário|Número do Prontuário)\b",
]

# Record numbers are usually 5 to 10 digits long
Content_Identifier = r"\b\d{5,10}\b"


##### 1.2. Stress-Test

In [None]:
# Candidate record numbers: three numeric strings and one alphanumeric string
test_record_numbers = ["54321", "1234567890", "98765", "A123B"]
record_regex_test = re.compile(Content_Identifier)

# Keep only the record numbers the pattern matches in full
matches_test = [rec for rec in test_record_numbers if record_regex_test.fullmatch(rec)]

# Tabulate every candidate next to a pass/fail marker
df_results_test = pd.DataFrame({"Generated Record Number": test_record_numbers, "Matched": ["✅" if rec in matches_test else "❌" for rec in test_record_numbers]})

# Show the frame as the cell output, like the other stress-test cells do.
# (`tools` is not defined anywhere in this notebook, so calling it raises NameError.)
df_results_test


##### 1.3. Add Identifiers to Dict

In [None]:
                "Field_Regex": [r"\b(?:n[úu]mero|N\.?|#|Nº|I[Dd]|C[óo]digo|identifica(?:ção|cao))\s{0,3}(?:de)?\s{0,3}atendimento\b",
                                r"\batendimento\s{0,3}(?:n[úu]mero|N\.?|#|Nº|I[Dd]|C[óo]digo|identifica(?:ção|cao))\b"], 
                "Content_Regex": r"\b\d{5,10}\b",
                "DLP_Strategy": "Detects numerical patient identifiers."

In [None]:

# 3️⃣ Hospitalization Data
icu_data_dict_granular["Hospitalization_Data"] = {"Fields": {}}

# Build pattern: "DIH" (Data da Internação Hospitalar) holds a dd/mm/yyyy date
icu_data_dict_granular["Hospitalization_Data"]["Fields"]["DIH"] = {
    "Field_Regex": r"\b(?:DIH|Data da Internação Hospitalar)\b",
    "Content_Regex": r"\b\d{2}/\d{2}/\d{4}\b",
    "DLP_Strategy": "Detects hospital admission dates."
}

# Stress test: every sample date should match the content pattern in full
hospitalization_dates = ["12/05/2023", "03/10/2019", "29/07/2022"]
dih_regex = re.compile(icu_data_dict_granular["Hospitalization_Data"]["Fields"]["DIH"]["Content_Regex"])
matches = [date for date in hospitalization_dates if dih_regex.fullmatch(date)]
df_results = pd.DataFrame({"Generated Date": hospitalization_dates, "Matched": ["✅" if date in matches else "❌" for date in hospitalization_dates]})
# `display` is provided by the IPython kernel and renders the frame even though this is
# not the last expression of the cell. (`tools` is not defined in this notebook.)
display(df_results)

# 4️⃣ Insurance & Financial Data
icu_data_dict_granular["Insurance_Financial_Data"] = {"Fields": {}}

# Build pattern: "Convênio" (health insurance) holds a 10 to 16-digit number
icu_data_dict_granular["Insurance_Financial_Data"]["Fields"]["Convênio"] = {
    "Field_Regex": r"\b(?:Convênio|Seguro de Saúde)\b",
    "Content_Regex": r"\b\d{10,16}\b",
    "DLP_Strategy": "Detects insurance numbers to prevent fraud."
}

# Stress test: samples of 10, 16 and 15 digits should all match in full
insurance_numbers = ["1234567890", "9876543210123456", "012345678912345"]
insurance_regex = re.compile(icu_data_dict_granular["Insurance_Financial_Data"]["Fields"]["Convênio"]["Content_Regex"])
matches = [num for num in insurance_numbers if insurance_regex.fullmatch(num)]
df_results = pd.DataFrame({"Generated Insurance Numbers": insurance_numbers, "Matched": ["✅" if num in matches else "❌" for num in insurance_numbers]})
# `display` is provided by the IPython kernel and renders the frame even though this is
# not the last expression of the cell. (`tools` is not defined in this notebook.)
display(df_results)

# 5️⃣ Clinical & Medical Data
icu_data_dict_granular["Clinical_Medical_Data"] = {"Fields": {}}

# Build pattern: "Diagnósticos" content is matched against a fixed list of conditions
icu_data_dict_granular["Clinical_Medical_Data"]["Fields"]["Diagnósticos"] = {
    "Field_Regex": r"\b(?:Diagnóstico|CID-10|Doença)\b",
    "Content_Regex": r"(?:Sepsis|Pneumonia|Hipertensão|Diabetes)",
    "DLP_Strategy": "Uses NLP-based classification to recognize medical conditions."
}

# Stress test: each listed condition should match the content pattern in full
medical_conditions = ["Sepsis", "Diabetes", "Hipertensão", "Pneumonia"]
condition_regex = re.compile(icu_data_dict_granular["Clinical_Medical_Data"]["Fields"]["Diagnósticos"]["Content_Regex"])
matches = [cond for cond in medical_conditions if condition_regex.fullmatch(cond)]
df_results = pd.DataFrame({"Generated Medical Conditions": medical_conditions, "Matched": ["✅" if cond in matches else "❌" for cond in medical_conditions]})
# `display` is provided by the IPython kernel and renders the frame even though this is
# not the last expression of the cell. (`tools` is not defined in this notebook.)
display(df_results)

# 6️⃣ Medication & Treatment
# Create the category together with its single "Prescrição" (prescription) field.
icu_data_dict_granular["Medication_Treatment"] = {
    "Fields": {
        "Prescrição": {
            "Field_Regex": r"\b(?:Prescrição|Receita Médica)\b",
            "Content_Regex": r"\b(?:Paracetamol|Ibuprofeno|Morfina|Fentanil)\b",
            "DLP_Strategy": "Flags prescription of controlled substances.",
        },
    },
}

# 7️⃣ Final step: serialise the dictionary to a UTF-8 JSON file.
# ensure_ascii=False keeps accented characters readable instead of \u-escaping them.
with open("icu_dlp_dictionary.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(icu_data_dict_granular, indent=4, ensure_ascii=False))

print("ICU DLP Dictionary saved as JSON ✅")
