In [1]:
import spacy
import random
import pandas as pd
from faker import Faker



In [2]:
#!python -m spacy download en_core_web_sm

### __NER to extract patient names from Clinical Notes dataset__

In [3]:
df = pd.read_csv('clinical_notes_dataset.csv')
clinical_note_list = df['Clinical_Note'].to_list()

nlp = spacy.load("en_core_web_sm")
# spacy.require_gpu()

patient_name_list = []
for patient_text in clinical_note_list:
    doc = nlp(patient_text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            patient_name_list.append(ent.text)    

# print(patient_name_list)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
len(patient_name_list)

99

### __Generate pool of Surrogate names for Substitution__

In [5]:
fake = Faker()

# generating names that are not in our dataset
pool = set()

while len(pool) < 500:
    name = fake.name()
    if name not in patient_name_list:
        pool.add(name)

surrogate_pool_of_patient_names = list(pool)

In [6]:
surrogate_pool_of_patient_names

['Frank Peters',
 'Madeline Ryan',
 'Sara Huang',
 'Taylor Dalton',
 'Marco Mcdaniel',
 'Cory Williams',
 'Louis Miles',
 'Mrs. Tamara Martinez',
 'David Gray',
 'Crystal Cabrera',
 'Amber Harris',
 'Rodney Rodriguez',
 'Tracy Briggs',
 'James Guzman',
 'Sara Gonzalez',
 'Antonio Wilson',
 'Michael Peterson',
 'Mrs. Mary Hendrix',
 'Jason Ruiz',
 'Marvin Thomas',
 'Samantha Sweeney',
 'Kimberly Atkinson',
 'David Hatfield',
 'Gabriel Montgomery',
 'Hannah Li',
 'Patricia Johnson',
 'Cheryl Barr',
 'Benjamin Williams',
 'Kimberly Hoover',
 'Jerry Valdez',
 'Ivan Brown',
 'Rebecca Griffith',
 'Nicholas Ball',
 'Dennis Hall',
 'Andrew Holloway',
 'Eric Harris',
 'Lindsey Green',
 'Heather Reeves',
 'Sean Perry',
 'Christine Richards',
 'Shelley Martinez',
 'Timothy Fuentes',
 'Corey Fitzpatrick',
 'Angela Curtis',
 'John Flores',
 'Justin Fowler',
 'Thomas Wheeler',
 'Sherry Reilly',
 'Sean Wiggins',
 'Maria Ryan',
 'Matthew Townsend',
 'Benjamin Perkins',
 'Rebecca Steele',
 'David Dean'

###  __Algorithm for Markov Chain Based Surrogate Substitution__

In [7]:
def markov_chain_substitution(patient_name_list: list, surrogate_pool: list, transition_prob = 0.5) -> list:
    """
    The function `markov_chain_substitution` generates a list of surrogate names based on a Markov chain
    model with a given transition probability.
    
    :return: List of surrogates generated based on a Markov chain substitution algorithm applied to the
    input `patient_name_list`.
    """
    surrogates = [] # to store the surrogates 
    previous_surrogate = None

    for _ in patient_name_list:
        if previous_surrogate is None or random.random() < transition_prob:
            #choose a new surrogate
            new_surrogate = random.choice(surrogate_pool)
            previous_surrogate = new_surrogate
        
        else:
            #reuse the previous surrogate
            new_surrogate = previous_surrogate
        

        surrogates.append(new_surrogate)
    
    return surrogates

This method 

### __Simulating False Negatives (FNER)__

In [8]:
def simulate_fner(original_patient_names: list, surrogate_names: list, fner: float) -> list:
    """
    The function `simulate_fner` simulates the occurrence of false negatives in replacing patient names
    with surrogate names based on a given false negative error rate.
    
    :return: List of final names, where each name is either the surrogate name or the original name
    based on a simulated false negative error rate (`fner`).
    """
    final_names = []
    
    for original_name, surrogate_name in zip(original_patient_names, surrogate_names):
        if random.random() < fner:
            # simulate a false negative (leave the original name unchanged)
            final_names.append(original_name)
        else:
            # use the surrogate name
            final_names.append(surrogate_name)
    
    return final_names


### __Calculate the PHI leakage__

In [9]:
def calculate_phi_leakage(original_names: list, final_names: list) -> tuple:
    """
    The function `calculate_phi_leakage` calculates the number and percentage of leaked names between
    two lists of original and final names.
    
    :return: Tuple containing two values: the number of leaked names and the leakage rate as a percentage.
    """
    leaked_names = [orig for orig, final in zip(original_names, final_names) if orig == final]
    leakage_rate = len(leaked_names) / len(original_names) * 100
    
    return len(leaked_names), leakage_rate


### __Measure Surrogate Repeat Size__

In [10]:
from collections import Counter

def measure_surrogate_repeat_size(substituted_names: list) -> int:
    """
    The function `measure_surrogate_repeat_size` calculates and returns the maximum number of times a
    name appears in a list of substituted names.
    
    :return: The maximum number of times a name appears in the list of substituted names.
    """
    name_counts = Counter(substituted_names)
    max_repeat_size = max(name_counts.values())
    
    return max_repeat_size

***
## __Evaluation__


The evaluation of the Markov Chain Substition method is done over different values of FNER. Hence, the function below `fner_evaluation` is a `wrapper` function that makes use of the previously defined functions (i.e `markov_chain_substitution`, `simulate_fner`, `calculate_phi_leakage` and `measure_surrogate_repeat_size` ) to assess the PHI leakage and surrogte repeat size under different levels of FNER.

In [11]:
def fner_evaluation(fner_values: list, patient_name_list: list, surrogate_pool_of_patient_names: list) -> pd.DataFrame:
    """
    The function `fner_evaluation` takes a list of FNER values, patient names, and a pool of surrogate
    names to evaluate the PHI leakage and surrogate repeat size for each FNER value.
    
    :return: A pandas DataFrame containing the evaluation results for each FNER value in the input list `fner_values`. 
    The DataFrame includes columns for FNER value, Leakage Count, PHI Leakage Rate (%), and Maximum Surrogate Repeat Size.
    """
    results = []

    for fner in fner_values:

        markov_substituted_names = markov_chain_substitution(patient_name_list, surrogate_pool_of_patient_names)
        
        simulated_names = simulate_fner(patient_name_list, markov_substituted_names, fner=fner)

        leaked_count, leakage_rate = calculate_phi_leakage(patient_name_list, simulated_names)
        
        max__surrogate_repeat_size = measure_surrogate_repeat_size(simulated_names)

        results.append(
            {
                "FNER" : fner, 
                "Leakage Count" : leaked_count, 
                "PHI Leakage Rate (%)" : leakage_rate, 
                "Maximum Surrogate Repeat Size" : max__surrogate_repeat_size


            }   
        )

    
    df = pd.DataFrame(results)
    return df

In [12]:
# Range of FNER values for this research
fner_values=[0.01, 0.05, 0.1]

In [13]:
markov_results = fner_evaluation(fner_values=fner_values, patient_name_list=patient_name_list, surrogate_pool_of_patient_names=surrogate_pool_of_patient_names)


In [14]:
markov_results

Unnamed: 0,FNER,Leakage Count,PHI Leakage Rate (%),Maximum Surrogate Repeat Size
0,0.01,0,0.0,7
1,0.05,2,2.020202,5
2,0.1,7,7.070707,7



***


## __Now, let's compare the effectiveness of the Markov approach with the `consistent` and `random` strategies__

### __A: Consistent Substitution Strategy__

In [15]:
def consistent_substitution(patient_name_list: list, consistent_name: str) -> list:
    """
    The function `consistent_substitution` takes a list of patient names and a consistent name, and
    returns a list where each element is the consistent name.
    
    :return: A list where each element is the `consistent_name` string, repeated for each element in the
    `patient_name_list`.
    """
    return [consistent_name for _ in patient_name_list]


In [16]:
def consistent_fner_evaluation(fner_values: list, patient_name_list: list, consistent_name: str) -> pd.DataFrame:
    """
    The function `consistent_fner_evaluation` takes a list of FNER values, a list of patient names, and
    a consistent name, then evaluates the FNER impact on PHI leakage and surrogate repeat size for each
    FNER value.
    
    :return: A pandas DataFrame containing evaluation results for different FNER values, including the
    FNER value itself, leakage count, PHI leakage rate, and maximum surrogate repeat size.
    """ 
    results = []

    for fner in fner_values:

        consistent_names = consistent_substitution(patient_name_list, consistent_name)
       
        simulated_names = simulate_fner(patient_name_list, consistent_names, fner=fner)

        leaked_count, leakage_rate = calculate_phi_leakage(patient_name_list, simulated_names)

        max__surrogate_repeat_size = measure_surrogate_repeat_size(simulated_names)

        results.append(
            {
                "FNER" : fner, 
                "Leakage Count" : leaked_count, 
                "PHI Leakage Rate (%)" : leakage_rate, 
                "Maximum Surrogate Repeat Size" : max__surrogate_repeat_size
            }
        )

    df = pd.DataFrame(results)
    return df


##### __Consistent Substitution Evaluation Summary__

In [17]:
# Evaluation for Consistent Substitution
consistent_name = "Adarsha Pandey"  # Consistent name to use for all replacements
consistent_results = consistent_fner_evaluation(fner_values, patient_name_list, consistent_name)


print("Consistent Substitution Results:")
consistent_results


Consistent Substitution Results:


Unnamed: 0,FNER,Leakage Count,PHI Leakage Rate (%),Maximum Surrogate Repeat Size
0,0.01,0,0.0,99
1,0.05,6,6.060606,93
2,0.1,15,15.151515,84


### __B: Random Substitution Strategy__

In [18]:
def random_substitution(patient_name_list: list, surrogate_pool: list) -> list:
    """
    The function `random_substitution` takes a list of patient names and a pool of surrogate names, and
    returns a list of randomly selected surrogate names for each patient name.
    
    :return: A list of randomly selected elements from the surrogate_pool list, with the same length as
    the patient_name_list.
    """
    return [random.choice(surrogate_pool) for _ in patient_name_list]


In [19]:
def random_fner_evaluation(fner_values: list, patient_name_list: list, surrogate_patient_names: list) -> pd.DataFrame:
    """
    The function `random_fner_evaluation` takes a list of FNER values, patient names, and surrogate
    patient names, simulates FNER with random substitutions, calculates leakage metrics, and returns a
    DataFrame with evaluation results.

    :return: A pandas DataFrame containing the evaluation results for different False Name Error Rate (FNER) values. 
    The DataFrame includes columns for FNER, Leakage Count, PHI Leakage Rate (%), and Maximum Surrogate Repeat Size.
    """    
    results = []

    for fner in fner_values:

        random_names = random_substitution(patient_name_list, surrogate_patient_names)
        
        simulated_names = simulate_fner(patient_name_list, random_names, fner=fner)

        leaked_count, leakage_rate = calculate_phi_leakage(patient_name_list, simulated_names)

        max__surrogate_repeat_size = measure_surrogate_repeat_size(simulated_names)

        results.append(
            {
                "FNER" : fner, 
                "Leakage Count" : leaked_count, 
                "PHI Leakage Rate (%)" : leakage_rate, 
                "Maximum Surrogate Repeat Size" : max__surrogate_repeat_size

            }
        )

    df = pd.DataFrame(results)
    return df


##### __Random Substitution Evaluation Summary__

In [20]:
# Evaluation for Random Substitution
random_results = random_fner_evaluation(fner_values, patient_name_list, surrogate_pool_of_patient_names)

print("Random Substition Results:")
random_results

Random Substition Results:


Unnamed: 0,FNER,Leakage Count,PHI Leakage Rate (%),Maximum Surrogate Repeat Size
0,0.01,0,0.0,2
1,0.05,5,5.050505,2
2,0.1,9,9.090909,3


***

## __Final Comparison of the Markov, Consistent, and Random Substitution Strategies__

- The final comparison of 3 strategies focuses on the PHI leakage rate in percentage accross different levels of FNER 

In [21]:
eval_summary_dict = {
    "FNER" : fner_values, 
    "Consistent" : consistent_results["PHI Leakage Rate (%)"].tolist(), 
    "Random": random_results["PHI Leakage Rate (%)"].tolist(), 
    "Markov": markov_results["PHI Leakage Rate (%)"].tolist(), 
}

In [22]:
df_eval = pd.DataFrame(eval_summary_dict)
print("----PHI Leakage Rate (%) Comparison----")
df_eval

----PHI Leakage Rate (%) Comparison----


Unnamed: 0,FNER,Consistent,Random,Markov
0,0.01,0.0,0.0,0.0
1,0.05,6.060606,5.050505,2.020202
2,0.1,15.151515,9.090909,7.070707
