In [1]:
import math
import os
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from faker import Faker
from sklearn.metrics import precision_score, recall_score, f1_score

# Reading Narratives

In [None]:
# Load the first narrative workbook. The file name is a placeholder that has
# to be replaced with a real path before running.
narrative_1 = pd.read_excel('CHANGE_FOR_YOUR_DIRECTORY.xlsx')
# NOTE(review): Jupyter only renders a bare expression when it is the last
# statement of a cell, so this line shows nothing if it shares a cell with
# the code below.
narrative_1

# Load the second narrative workbook. NOTE(review): the path literal is
# identical to the one used for narrative_1 -- confirm each points at the
# intended file once the placeholders are filled in.
narrative_2 = pd.read_excel('CHANGE_FOR_YOUR_DIRECTORY.xlsx')
narrative_2

# Data Processing

In [None]:
# Re-read the second workbook without treating any row as a header, attach
# the column names listed in `new_headers`, and save the result as a copy.
new_headers = ["HEADERS THAT YOU CAN CHANGE IN A LIST"]
anonymized_file_path = 'CHANGE_FOR_YOUR_DIRECTORY.xlsx'
output_path = 'NEW_OUTPUT_FILE_AFTER_MOD.xlsx'

narrative_2 = pd.read_excel(anonymized_file_path, header=None)
# Assigning to .columns requires exactly one name per column in the sheet.
narrative_2.columns = new_headers
narrative_2.to_excel(output_path, index=False)

# Preview of the relabelled frame.
narrative_2.head()


# Filling Real Narratives from the 1st Dataset with Fake Data


In [None]:
# Shared Faker instance used by generate_fake_data().
fake = Faker()

def generate_fake_data():
    """Build one set of synthetic replacement values.

    Returns:
        dict: maps each placeholder key ('LOCATION', 'NAME', 'DATE', 'EMAIL',
        'PHONE', 'ADDRESS', 'RACE', 'RELIGION') to a freshly generated value.
        The first six come from the module-level ``fake`` Faker instance; the
        last two are drawn with ``np.random.choice`` (RELIGION uses explicit
        weights, RACE is uniform).
    """
    race_options = ['Caucasian', 'Hispanic', 'African-American', 'Asian', 'Mixed']
    religion_options = ['Christian', 'Muslim', 'Jewish', 'Buddhist', 'Atheist', 'None']
    religion_weights = [0.3, 0.2, 0.1, 0.1, 0.2, 0.1]

    # Values are generated in the same order as the keys are listed so the
    # sequence of random draws is unchanged.
    record = {}
    record['LOCATION'] = fake.city()
    record['NAME'] = fake.name()
    record['DATE'] = fake.date()
    record['EMAIL'] = fake.email()
    record['PHONE'] = fake.phone_number()
    record['ADDRESS'] = fake.address()
    record['RACE'] = np.random.choice(race_options)
    record['RELIGION'] = np.random.choice(religion_options, p=religion_weights)
    return record

def deanonymize_narrative(anonymized_text, fake_data):
    """Substitute placeholder tokens such as ``(NAME)`` with fake values.

    Args:
        anonymized_text: cell value to process. Missing values (as judged by
            ``pd.isna``) are returned unchanged; anything else is converted
            to ``str`` first.
        fake_data: mapping from placeholder key (``'NAME'``, ``'LOCATION'``,
            ...) to the replacement string. A key is only looked up when its
            token actually occurs in the text.

    Returns:
        The text with every occurrence of each known token replaced, or the
        original missing value.
    """
    if pd.isna(anonymized_text):
        return anonymized_text

    result = str(anonymized_text)
    for key in ('NAME', 'LOCATION', 'DATE', 'EMAIL', 'PHONE', 'ADDRESS', 'RACE', 'RELIGION'):
        token = f'({key})'
        if token in result:
            result = result.replace(token, fake_data[key])
    return result

# Replace the placeholder tokens in dataset I with synthetic values and save
# the result to a new workbook.
anonymized_file_path = 'ORIGINAL_DATA.xlsx'
output_path_deanonymized = 'ORIGINAL_DATA_DEANONYMIZED.xlsx'

anonymized_data = pd.read_excel(anonymized_file_path, header=0)

columns_to_deanonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem']

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(anonymized_data)):
    # One set of fake values per row, so every column of the same record
    # receives the same substitute name, location, and so on.
    fake_data = generate_fake_data()
    row_changed = False
    for column in columns_to_deanonymize:
        original_text = anonymized_data.at[i, column]
        if pd.isna(original_text):
            # Missing cells stay untouched. Without this guard the NaN that
            # deanonymize_narrative returns compares unequal to the original
            # NaN, so every row with an empty cell was reported as changed.
            continue
        new_text = deanonymize_narrative(original_text, fake_data)
        # Compare against the string form so a non-string cell that holds no
        # placeholder is neither rewritten nor counted as a change.
        if new_text != str(original_text):
            anonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

anonymized_data.to_excel(output_path_deanonymized, index=False)

print(f"Rows changed: {rows_changed}")

print(anonymized_data.head())


# Filling Real Narratives from the 2nd Dataset with Fake Data

In [None]:
# Faker instance backing generate_fake_data() in this cell.
fake = Faker()

def generate_fake_data():
    """Return a dict of synthetic values keyed by placeholder name.

    The keys are 'LOCATION', 'NAME', 'DATE', 'EMAIL', 'PHONE', 'ADDRESS',
    'RACE' and 'RELIGION'. Faker supplies the first six; RACE is a uniform
    draw and RELIGION a weighted draw via ``np.random.choice``.

    NOTE: a function with the same name and body is also defined in an
    earlier cell of this notebook; this definition replaces it when run.
    """
    # Keyword arguments are evaluated left to right, which keeps the order of
    # the random draws the same as in a dict literal.
    return dict(
        LOCATION=fake.city(),
        NAME=fake.name(),
        DATE=fake.date(),
        EMAIL=fake.email(),
        PHONE=fake.phone_number(),
        ADDRESS=fake.address(),
        RACE=np.random.choice(
            ['Caucasian', 'Hispanic', 'African-American', 'Asian', 'Mixed']
        ),
        RELIGION=np.random.choice(
            ['Christian', 'Muslim', 'Jewish', 'Buddhist', 'Atheist', 'None'],
            p=[0.3, 0.2, 0.1, 0.1, 0.2, 0.1],
        ),
    )

def deanonymize_narrative(anonymized_text, fake_data):
    """Fill ``(KEY)`` style placeholders in a narrative with fake values.

    Args:
        anonymized_text: value read from a narrative cell. Missing values
            (``pd.isna``) are passed through; other values are stringified.
        fake_data: dict providing a replacement string for each placeholder
            key that appears in the text.

    Returns:
        The narrative with all known placeholders substituted, or the
        untouched missing value.
    """
    if pd.isna(anonymized_text):
        return anonymized_text

    filled = str(anonymized_text)
    fields = ['NAME', 'LOCATION', 'DATE', 'EMAIL', 'PHONE', 'ADDRESS', 'RACE', 'RELIGION']
    for field in fields:
        marker = '(' + field + ')'
        # Only look the key up when the marker is present, so fake_data may
        # omit keys that the text never uses.
        if marker in filled:
            filled = filled.replace(marker, fake_data[field])
    return filled

# Replace the placeholder tokens in dataset II with synthetic values and save
# the result to a new workbook. Dataset II has an extra 'Dispatch' column.
anonymized_file_path = 'ORIGINAL_DATA_2.xlsx'
output_path_deanonymized = 'ORIGINAL_DATA_2_DEANONYMIZED.xlsx'

anonymized_data = pd.read_excel(anonymized_file_path, header=0)

columns_to_deanonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem', 'Dispatch']

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(anonymized_data)):
    # One set of fake values per row keeps the substitutes consistent across
    # all narrative columns of the same record.
    fake_data = generate_fake_data()
    row_changed = False
    for column in columns_to_deanonymize:
        original_text = anonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = deanonymize_narrative(original_text, fake_data)
        # Compare against the string form so a non-string cell that holds no
        # placeholder is neither rewritten nor counted as a change.
        if new_text != str(original_text):
            anonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

anonymized_data.to_excel(output_path_deanonymized, index=False)

print(f"Rows changed: {rows_changed}")

print(anonymized_data.head())


# Anonymization of Dataset I and Dataset II

In [None]:
# spaCy pipeline used for named-entity detection by anonymize_narrative().
nlp = spacy.load("en_core_web_sm")

def anonymize_narrative(text):
    """Mask named entities, dates, e-mail addresses and phone numbers.

    Args:
        text: cell value to anonymize. Missing values (``pd.isna``) are
            returned unchanged; any other value is converted to ``str`` so
            numeric or timestamp cells are handled instead of raising.

    Returns:
        str with every entity found by the module-level ``nlp`` pipeline
        replaced by ``[LABEL]``, followed by regex passes that insert
        ``[DATE]``, ``[EMAIL]`` and ``[PHONE]``; or the original missing
        value.
    """
    if pd.isna(text):
        return text

    # Work on the string form throughout; the raw cell value may be an int,
    # float or timestamp, which has no .replace and is rejected by re.sub.
    anonymized_text = str(text)
    doc = nlp(anonymized_text)

    # Keep the first label seen for each distinct entity string.
    entity_labels = {}
    for ent in doc.ents:
        entity_labels.setdefault(ent.text, ent.label_)

    # Replace longer entities first. Otherwise "John" would be masked before
    # "John Smith", turning it into "[PERSON] Smith" and leaking the surname.
    for ent_text in sorted(entity_labels, key=len, reverse=True):
        anonymized_text = anonymized_text.replace(ent_text, f"[{entity_labels[ent_text]}]")

    # Regex fallbacks for patterns the NER pass may have missed.
    anonymized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', '[DATE]', anonymized_text)
    anonymized_text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', anonymized_text)

    phone_pattern = r'(\(\d{3}\)\d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+?\(?\d{1,3}\)?[-\s\.]?\d{1,3}[-\s\.]?\d{3}[-\s\.]?\d{4})'
    anonymized_text = re.sub(phone_pattern, '[PHONE]', anonymized_text)

    return anonymized_text

# Anonymize the narrative columns of the de-anonymized dataset I workbook and
# save the result.
deanonymized_file_path = 'DEANONYMIZED_ORIGINAL_DATA.xlsx'
anonymized_again_file_path = 'ANONYMIZED_ORIGINAL_DATA.xlsx'

deanonymized_data = pd.read_excel(deanonymized_file_path)

columns_to_anonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem']

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(deanonymized_data)):
    row_changed = False
    for column in columns_to_anonymize:
        original_text = deanonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = anonymize_narrative(original_text)
        # Compare against the string form so an untouched non-string cell is
        # not counted as a change.
        if new_text != str(original_text):
            deanonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

deanonymized_data.to_excel(anonymized_again_file_path, index=False)

print(f"Rows changed: {rows_changed}")
print(deanonymized_data.head())


In [None]:
# spaCy pipeline used for named-entity detection by anonymize_narrative().
nlp = spacy.load("en_core_web_sm")

def anonymize_narrative(text):
    """Mask named entities, dates, e-mail addresses and phone numbers.

    NOTE: this function is defined in several cells of this notebook; keep
    the copies in sync when editing.

    Args:
        text: cell value to anonymize. Missing values (``pd.isna``) are
            returned unchanged; any other value is converted to ``str`` so
            numeric or timestamp cells are handled instead of raising.

    Returns:
        str with every entity found by the module-level ``nlp`` pipeline
        replaced by ``[LABEL]``, followed by regex passes that insert
        ``[DATE]``, ``[EMAIL]`` and ``[PHONE]``; or the original missing
        value.
    """
    if pd.isna(text):
        return text

    # Work on the string form throughout; the raw cell value may be an int,
    # float or timestamp, which has no .replace and is rejected by re.sub.
    anonymized_text = str(text)
    doc = nlp(anonymized_text)

    # Keep the first label seen for each distinct entity string.
    entity_labels = {}
    for ent in doc.ents:
        entity_labels.setdefault(ent.text, ent.label_)

    # Replace longer entities first. Otherwise "John" would be masked before
    # "John Smith", turning it into "[PERSON] Smith" and leaking the surname.
    for ent_text in sorted(entity_labels, key=len, reverse=True):
        anonymized_text = anonymized_text.replace(ent_text, f"[{entity_labels[ent_text]}]")

    # Regex fallbacks for patterns the NER pass may have missed.
    anonymized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', '[DATE]', anonymized_text)
    anonymized_text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', anonymized_text)

    phone_pattern = r'(\(\d{3}\)\d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+?\(?\d{1,3}\)?[-\s\.]?\d{1,3}[-\s\.]?\d{3}[-\s\.]?\d{4})'
    anonymized_text = re.sub(phone_pattern, '[PHONE]', anonymized_text)

    return anonymized_text

# Anonymize the narrative columns of the de-anonymized dataset II workbook
# (which includes a 'Dispatch' column) and save the result.
deanonymized_file_path = 'Project_Data/New Header 2023 DATA DEANONYMIZED 2.xlsx'
anonymized_again_file_path = 'Project_Data/New Header 2023 DATA REANONYMIZED 2.xlsx'

try:
    deanonymized_data = pd.read_excel(deanonymized_file_path)
except Exception as e:
    # Report the problem, then re-raise so the cell still fails visibly.
    print(f"Error loading the file: {e}")
    raise

columns_to_anonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem', 'Dispatch']

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(deanonymized_data)):
    row_changed = False
    for column in columns_to_anonymize:
        original_text = deanonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = anonymize_narrative(original_text)
        # Compare against the string form so an untouched non-string cell is
        # not counted as a change.
        if new_text != str(original_text):
            deanonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

try:
    deanonymized_data.to_excel(anonymized_again_file_path, index=False)
    print(f"Anonymized data saved to {anonymized_again_file_path}")
except Exception as e:
    print(f"Error saving the file: {e}")
    raise

print(f"Rows changed: {rows_changed}")
print(deanonymized_data.head())

# Data Metrics for Dataset I

In [None]:
# spaCy pipeline used by anonymize_narrative() and evaluate_anonymization().
nlp = spacy.load("en_core_web_sm")

def anonymize_narrative(text):
    """Mask named entities, dates, e-mail addresses and phone numbers.

    NOTE: this function is defined in several cells of this notebook; keep
    the copies in sync when editing.

    Args:
        text: cell value to anonymize. Missing values (``pd.isna``) are
            returned unchanged; any other value is converted to ``str`` so
            numeric or timestamp cells are handled instead of raising.

    Returns:
        str with every entity found by the module-level ``nlp`` pipeline
        replaced by ``[LABEL]``, followed by regex passes that insert
        ``[DATE]``, ``[EMAIL]`` and ``[PHONE]``; or the original missing
        value.
    """
    if pd.isna(text):
        return text

    # Work on the string form throughout; the raw cell value may be an int,
    # float or timestamp, which has no .replace and is rejected by re.sub.
    anonymized_text = str(text)
    doc = nlp(anonymized_text)

    # Keep the first label seen for each distinct entity string.
    entity_labels = {}
    for ent in doc.ents:
        entity_labels.setdefault(ent.text, ent.label_)

    # Replace longer entities first. Otherwise "John" would be masked before
    # "John Smith", turning it into "[PERSON] Smith" and leaking the surname.
    for ent_text in sorted(entity_labels, key=len, reverse=True):
        anonymized_text = anonymized_text.replace(ent_text, f"[{entity_labels[ent_text]}]")

    # Regex fallbacks for patterns the NER pass may have missed.
    anonymized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', '[DATE]', anonymized_text)
    anonymized_text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', anonymized_text)

    phone_pattern = r'(\(\d{3}\)\d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+?\(?\d{1,3}\)?[-\s\.]?\d{1,3}[-\s\.]?\d{3}[-\s\.]?\d{4})'
    anonymized_text = re.sub(phone_pattern, '[PHONE]', anonymized_text)

    return anonymized_text

def evaluate_anonymization(original_texts, anonymized_texts):
    """Score anonymization by comparing entity strings before and after.

    Each (original, anonymized) pair is run through the module-level ``nlp``
    pipeline; pairs where either side is missing are skipped. For every
    distinct entity string in the original, the true label is 1 and the
    prediction is 1 if that string no longer appears as an entity in the
    anonymized text, else 0. Every entity string that appears only in the
    anonymized text adds a (true=0, pred=1) sample.

    Args:
        original_texts: iterable of source texts.
        anonymized_texts: iterable of the corresponding anonymized texts.

    Returns:
        dict with keys "precision", "recall" and "f1_score" computed by
        scikit-learn with ``zero_division=0``.
    """
    y_true, y_pred = [], []

    for original, anonymized in zip(original_texts, anonymized_texts):
        if pd.isna(original) or pd.isna(anonymized):
            continue

        original_entities = {ent.text for ent in nlp(str(original)).ents}
        anonymized_entities = {ent.text for ent in nlp(str(anonymized)).ents}

        # Entities present in the source: removed -> 1, still present -> 0.
        for entity in original_entities:
            y_true.append(1)
            y_pred.append(0 if entity in anonymized_entities else 1)

        # Entity strings that only exist after anonymization.
        for entity in anonymized_entities:
            if entity in original_entities:
                continue
            y_true.append(0)
            y_pred.append(1)

    return {
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1_score": f1_score(y_true, y_pred, zero_division=0),
    }

# Anonymize dataset I, save it, and score the anonymization.
deanonymized_file_path = 'Project_Data/2023 DATA SCRUBBED DEANONYMIZED.xlsx'
anonymized_again_file_path = 'Project_Data/2023 DATA SCRUBBED REANONYMIZED.xlsx'

deanonymized_data = pd.read_excel(deanonymized_file_path)

columns_to_anonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem']

# Snapshot the source texts BEFORE the loop below overwrites the frame in
# place. Taking this after the loop would make the "original" texts already
# anonymized, so the metrics would compare anonymized text with itself.
# flatten() returns a copy, so later writes to the frame do not affect it.
original_texts = deanonymized_data[columns_to_anonymize].values.flatten()

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(deanonymized_data)):
    row_changed = False
    for column in columns_to_anonymize:
        original_text = deanonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = anonymize_narrative(original_text)
        # Compare against the string form so an untouched non-string cell is
        # not counted as a change.
        if new_text != str(original_text):
            deanonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

deanonymized_data.to_excel(anonymized_again_file_path, index=False)

print(f"Rows changed: {rows_changed}")
print(deanonymized_data.head())

# The frame now holds the anonymized texts, in the same row/column order as
# the snapshot, so they can be reused instead of running spaCy a second time.
anonymized_texts = list(deanonymized_data[columns_to_anonymize].values.flatten())

results = evaluate_anonymization(original_texts, anonymized_texts)
print(results)


In [None]:
def calculate_entropy(text):
    """Return the character-level Shannon entropy of ``text`` in bits.

    Requires ``math`` and ``collections.Counter`` to be imported at the top
    of the notebook.

    Args:
        text: string (or other sized iterable of hashable symbols) to
            measure.

    Returns:
        float: ``sum(-p * log2(p))`` over the relative frequency ``p`` of
        each distinct character; ``0.0`` for empty input.
    """
    if not text:
        return 0.0

    total_chars = len(text)
    char_count = Counter(text)

    # Negate each term rather than the finished sum: a text made of a single
    # repeated character then yields 0.0 instead of -0.0.
    return sum(
        -(count / total_chars) * math.log2(count / total_chars)
        for count in char_count.values()
    )

# Anonymize dataset I, score the anonymization, and compare the average
# character entropy of the texts before and after.
# Uses anonymize_narrative and evaluate_anonymization from an earlier cell.
deanonymized_file_path = 'Project_Data/2023 DATA SCRUBBED DEANONYMIZED.xlsx'
anonymized_again_file_path = 'Project_Data/2023 DATA SCRUBBED REANONYMIZED.xlsx'

deanonymized_data = pd.read_excel(deanonymized_file_path)

columns_to_anonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem']

# Snapshot the source texts BEFORE the loop below overwrites the frame in
# place. Taking this after the loop would make the "original" texts already
# anonymized, so both the metrics and the entropy comparison would be skewed.
# flatten() returns a copy, so later writes to the frame do not affect it.
original_texts = deanonymized_data[columns_to_anonymize].values.flatten()

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(deanonymized_data)):
    row_changed = False
    for column in columns_to_anonymize:
        original_text = deanonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = anonymize_narrative(original_text)
        # Compare against the string form so an untouched non-string cell is
        # not counted as a change.
        if new_text != str(original_text):
            deanonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

deanonymized_data.to_excel(anonymized_again_file_path, index=False)

print(f"Rows changed: {rows_changed}")
print(deanonymized_data.head())

# The frame now holds the anonymized texts, in the same row/column order as
# the snapshot, so they can be reused instead of running spaCy a second time.
anonymized_texts = list(deanonymized_data[columns_to_anonymize].values.flatten())

results = evaluate_anonymization(original_texts, anonymized_texts)
print("Evaluation Results:", results)

# Entropy is only defined for real strings; drop NaN and other non-str cells.
filtered_original_texts = [text for text in original_texts if isinstance(text, str)]
filtered_anonymized_texts = [text for text in anonymized_texts if isinstance(text, str)]

original_entropy = [calculate_entropy(text) for text in filtered_original_texts]
anonymized_entropy = [calculate_entropy(text) for text in filtered_anonymized_texts]

# Guard against a sheet with no string cells (would divide by zero).
avg_original_entropy = sum(original_entropy) / len(original_entropy) if original_entropy else 0.0
avg_anonymized_entropy = sum(anonymized_entropy) / len(anonymized_entropy) if anonymized_entropy else 0.0

print(f"Average entropy of original texts: {avg_original_entropy}")
print(f"Average entropy of anonymized texts: {avg_anonymized_entropy}")

# Data Metrics for Dataset II

In [None]:
# spaCy pipeline used by anonymize_narrative() and evaluate_anonymization().
nlp = spacy.load("en_core_web_sm")

def anonymize_narrative(text):
    """Mask named entities, dates, e-mail addresses and phone numbers.

    NOTE: this function is defined in several cells of this notebook; keep
    the copies in sync when editing.

    Args:
        text: cell value to anonymize. Missing values (``pd.isna``) are
            returned unchanged; any other value is converted to ``str`` so
            numeric or timestamp cells are handled instead of raising.

    Returns:
        str with every entity found by the module-level ``nlp`` pipeline
        replaced by ``[LABEL]``, followed by regex passes that insert
        ``[DATE]``, ``[EMAIL]`` and ``[PHONE]``; or the original missing
        value.
    """
    if pd.isna(text):
        return text

    # Work on the string form throughout; the raw cell value may be an int,
    # float or timestamp, which has no .replace and is rejected by re.sub.
    anonymized_text = str(text)
    doc = nlp(anonymized_text)

    # Keep the first label seen for each distinct entity string.
    entity_labels = {}
    for ent in doc.ents:
        entity_labels.setdefault(ent.text, ent.label_)

    # Replace longer entities first. Otherwise "John" would be masked before
    # "John Smith", turning it into "[PERSON] Smith" and leaking the surname.
    for ent_text in sorted(entity_labels, key=len, reverse=True):
        anonymized_text = anonymized_text.replace(ent_text, f"[{entity_labels[ent_text]}]")

    # Regex fallbacks for patterns the NER pass may have missed.
    anonymized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', '[DATE]', anonymized_text)
    anonymized_text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', anonymized_text)
    phone_pattern = r'(\(\d{3}\)\d{3}-\d{4}|\d{3}-\d{3}-\d{4}|\+?\(?\d{1,3}\)?[-\s\.]?\d{1,3}[-\s\.]?\d{3}[-\s\.]?\d{4})'
    anonymized_text = re.sub(phone_pattern, '[PHONE]', anonymized_text)

    return anonymized_text

def evaluate_anonymization(original_texts, anonymized_texts):
    """Compute precision, recall and F1 for entity removal.

    Both texts of each pair are passed through the module-level ``nlp``
    pipeline and reduced to sets of entity strings; a pair is ignored when
    either text is missing. An entity from the original counts as a positive
    that was predicted correctly (1) when it is gone from the anonymized
    text and incorrectly (0) when it is still there. Entity strings that
    exist only in the anonymized text are recorded as (true=0, pred=1).

    Args:
        original_texts: iterable of texts before anonymization.
        anonymized_texts: iterable of texts after anonymization, aligned
            with ``original_texts``.

    Returns:
        dict: {"precision": ..., "recall": ..., "f1_score": ...} from
        scikit-learn, each computed with ``zero_division=0``.
    """
    y_true = []
    y_pred = []

    for original, anonymized in zip(original_texts, anonymized_texts):
        if pd.isna(original) or pd.isna(anonymized):
            continue

        found_before = {ent.text for ent in nlp(str(original)).ents}
        found_after = {ent.text for ent in nlp(str(anonymized)).ents}

        for entity in found_before:
            y_true.append(1)
            y_pred.append(int(entity not in found_after))

        # All samples for newly appearing entities are identical (0, 1), so
        # they can be added in bulk.
        introduced = found_after - found_before
        y_true.extend([0] * len(introduced))
        y_pred.extend([1] * len(introduced))

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return {"precision": precision, "recall": recall, "f1_score": f1}


# Anonymize dataset II (which includes a 'Dispatch' column), save it, and
# score the anonymization.
deanonymized_file_path = 'Project_Data/New Header 2023 DATA DEANONYMIZED 2.xlsx'
anonymized_again_file_path = 'Project_Data/New Header 2023 DATA REANONYMIZED 2.xlsx'

try:
    deanonymized_data = pd.read_excel(deanonymized_file_path)
except Exception as e:
    # Report the problem, then re-raise so the cell still fails visibly.
    print(f"Error loading the file: {e}")
    raise

columns_to_anonymize = ['Assessment', 'History', 'Homeless', 'Intervention', 'MSEBehaviorDescription', 'Presenting Problem', 'Dispatch']

# Snapshot the source texts BEFORE the loop below overwrites the frame in
# place. Taking this after the loop would make the "original" texts already
# anonymized, so the metrics would compare anonymized text with itself.
# flatten() returns a copy, so later writes to the frame do not affect it.
original_texts = deanonymized_data[columns_to_anonymize].values.flatten()

rows_changed = []

# .at[i, column] with a positional i relies on the default 0..n-1 index that
# read_excel produces when no index column is requested.
for i in range(len(deanonymized_data)):
    row_changed = False
    for column in columns_to_anonymize:
        original_text = deanonymized_data.at[i, column]
        if pd.isna(original_text):
            # NaN != NaN is always True, so without this guard every row
            # containing an empty cell was reported as changed.
            continue
        new_text = anonymize_narrative(original_text)
        # Compare against the string form so an untouched non-string cell is
        # not counted as a change.
        if new_text != str(original_text):
            deanonymized_data.at[i, column] = new_text
            row_changed = True
    if row_changed:
        # +2: presumably maps the 0-based frame position to the spreadsheet
        # row number (1-based, plus one header row) -- confirm.
        rows_changed.append(i + 2)

try:
    deanonymized_data.to_excel(anonymized_again_file_path, index=False)
    print(f"Anonymized data saved to {anonymized_again_file_path}")
except Exception as e:
    print(f"Error saving the file: {e}")
    raise

# The frame now holds the anonymized texts, in the same row/column order as
# the snapshot, so they can be reused instead of running spaCy a second time.
anonymized_texts = list(deanonymized_data[columns_to_anonymize].values.flatten())

results = evaluate_anonymization(original_texts, anonymized_texts)
print(results)


In [None]:
def calculate_entropy(text):
    """Return the character-level Shannon entropy of ``text`` in bits.

    Requires ``math`` and ``collections.Counter`` to be imported at the top
    of the notebook.

    NOTE: a function with the same name is also defined in an earlier cell
    of this notebook; keep the two in sync when editing.

    Args:
        text: string (or other sized iterable of hashable symbols) to
            measure.

    Returns:
        float: ``sum(-p * log2(p))`` over the relative frequency ``p`` of
        each distinct character; ``0.0`` for empty input.
    """
    if not text:
        return 0.0

    total_chars = len(text)
    char_count = Counter(text)

    # Negate each term rather than the finished sum: a text made of a single
    # repeated character then yields 0.0 instead of -0.0.
    return sum(
        -(count / total_chars) * math.log2(count / total_chars)
        for count in char_count.values()
    )


# Average character entropy before and after anonymization for dataset II.
# `original_texts` and `anonymized_texts` are not defined in this cell; they
# must already exist from the metrics cell that precedes it.

# Entropy is only defined for real strings; drop NaN and other non-str cells.
filtered_original_texts = [text for text in original_texts if isinstance(text, str)]
filtered_anonymized_texts = [text for text in anonymized_texts if isinstance(text, str)]

original_entropy = [calculate_entropy(text) for text in filtered_original_texts]
anonymized_entropy = [calculate_entropy(text) for text in filtered_anonymized_texts]

# Guard against a sheet with no string cells (would divide by zero).
avg_original_entropy = sum(original_entropy) / len(original_entropy) if original_entropy else 0.0
avg_anonymized_entropy = sum(anonymized_entropy) / len(anonymized_entropy) if anonymized_entropy else 0.0

print(f"Average entropy of original texts: {avg_original_entropy}")
print(f"Average entropy of anonymized texts: {avg_anonymized_entropy}")