In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Removes Personably Identifiable Pieces of Information

In [2]:
file_path = 'data/Fake_Dispatch_Data.csv'
data = pd.read_csv(file_path)

def anonymize_narrative(text):
    name_pattern = r'\b[A-Z][a-z]*[-\'\.]?[A-Z]?[a-z]+ [A-Z][a-z]*[-\'\.]?[A-Z]?[a-z]+\b'
    text = re.sub(name_pattern, '[Name]', text)
    
    date_pattern = r'\b(?:DOB:? ?|born on:? ?)?(0?[1-9]|[12][0-9]|3[01])[-/](0?[1-9]|1[012])[-/](19|20)?\d\d\b'
    text = re.sub(date_pattern, '[DOB]', text)

    return text

data['Anonymized Narrative'] = data['Narrative'].apply(anonymize_narrative)
data.drop(columns=['Narrative'], inplace=True)

anonymized_file_path = 'data/Fake_Dispatch_Data_first_pass.csv'
data.to_csv(anonymized_file_path, index=False)

data.head()

Unnamed: 0,Date,Time of Call,Age,Gender,Language,City,Reason for Dispatch,Anonymized Narrative
0,2024-01-03,03:39:41,46,Female,French,West Brittany,Water rescue,"Alert received regarding [Name], born 11/16/19..."
1,2024-01-08,14:55:13,58,Female,French,Port Michelle,Electrical hazard,"Emergency involving [Name], born [DOB] in [Nam..."
2,2024-02-25,02:17:29,61,Male,English,Lake Mary,Fire,"A report was made for [Name], who was born 03/..."
3,2024-05-02,19:06:44,54,Male,French,West Angelicashire,Medical emergency,"Emergency involving [Name], born [DOB] in [Nam..."
4,2024-01-26,15:37:53,68,Female,Spanish,Lake Andrew,Heart attack,"A report was made for [Name], who was born 05/..."


In [3]:
pd.read_csv('data/Fake_Dispatch_Data_first_pass.csv')

Unnamed: 0,Date,Time of Call,Age,Gender,Language,City,Reason for Dispatch,Anonymized Narrative
0,2024-01-03,03:39:41,46,Female,French,West Brittany,Water rescue,"Alert received regarding [Name], born 11/16/19..."
1,2024-01-08,14:55:13,58,Female,French,Port Michelle,Electrical hazard,"Emergency involving [Name], born [DOB] in [Nam..."
2,2024-02-25,02:17:29,61,Male,English,Lake Mary,Fire,"A report was made for [Name], who was born 03/..."
3,2024-05-02,19:06:44,54,Male,French,West Angelicashire,Medical emergency,"Emergency involving [Name], born [DOB] in [Nam..."
4,2024-01-26,15:37:53,68,Female,Spanish,Lake Andrew,Heart attack,"A report was made for [Name], who was born 05/..."
...,...,...,...,...,...,...,...,...
995,2024-03-01,23:25:55,45,Female,English,Nicholsfort,Theft,"[Name], a resident of Jonesside (born 02/15/19..."
996,2024-01-29,04:01:10,68,Female,English,Port Shari,Heart attack,"Incident report: [Name] (07/27/1955, born in [..."
997,2024-02-08,02:13:19,50,Female,French,East Greggtown,Water rescue,"[Name], born on 07/20/1973 at Jasmineshire, ca..."
998,2024-03-23,17:46:56,36,Male,French,Baileytown,Theft,"[Name], born on 10/29/1987 at Hurleyside, call..."


# Lets Try spaCY

In [4]:
anonymized_file_path = 'data/Fake_Dispatch_Data_first_pass.csv'
data.to_csv(anonymized_file_path, index=False)
anonymized_file_path

'data/Fake_Dispatch_Data_first_pass.csv'

In [5]:
nlp = spacy.load("en_core_web_sm")

def anonymize_narrative(text):
    doc = nlp(text)
    anonymized_text = text

    for ent in doc.ents:
        anonymized_text = anonymized_text.replace(ent.text, f"[{ent.label_}]")

    anonymized_text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', '[DATE]', anonymized_text)
    anonymized_text = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w{2,4}\b', '[EMAIL]', anonymized_text)
    phone_pattern = r'(\(\d{3}\)\d{3}-\d{4}|\d{3}-\d{3}-\[[A-Z]+\]-\d{4}(x\d+)?|\[[A-Z]+\]-\d{4}x\d+|\d{3}-\d{3}-\[[A-Z]+\]-\d{4}x\d+|\+?\(?\d{1,3}\)?[-\s\.]?\d{1,3}[-\s\.]?\d{3}[-\s\.]?\d{4}|(?<=\])-\d{3}-\d{4})(x\d+)?'

    anonymized_text = re.sub(phone_pattern, '[PHONE]', anonymized_text)

    return anonymized_text

file_path = 'Fake_Data/Dispatch_Data_More_Varied.csv'
data = pd.read_csv(file_path)

data['Anonymized Narrative'] = data['Narrative'].apply(anonymize_narrative)
data.drop(columns=['Narrative'], inplace=True)

anonymized_file_path = 'data/Fake_Dispatch_Data_first_pass.csv'
data.to_csv(anonymized_file_path, index=False)

data.head()

Unnamed: 0,Date,Time of Call,Age,Gender,Language,City,Reason for Dispatch,Anonymized Narrative
0,2024-03-18,18:31:07,72,Male,French,Jefferybury,Domestic disturbance,"[PERSON], [ORG] in [GPE], required help for a ..."
1,2024-03-03,05:41:24,69,Male,English,New Caroltown,Fire,"[PERSON], born on [DATE] at [ORG], called due ..."
2,2024-01-13,05:27:33,30,Male,French,Jeffreyborough,Theft,"[PERSON] received regarding [PERSON], born [DA..."
3,2024-02-02,12:52:39,44,Male,French,Knightburgh,Water rescue,"[PERSON] received regarding [PERSON], born [DA..."
4,2024-04-28,12:50:26,26,Female,English,Devinfort,Fire,"[PERSON] received regarding [PERSON], born [DA..."


In [6]:
original_data = pd.read_csv('data/Fake_Dispatch_Data.csv')
anonymized_data = pd.read_csv('data/Fake_Dispatch_Data_first_pass.csv')

tfidf_vectorizer = TfidfVectorizer()
original_tfidf = tfidf_vectorizer.fit_transform(original_data['Narrative'])
anonymized_tfidf = tfidf_vectorizer.transform(anonymized_data['Anonymized Narrative'])

similarity_scores = cosine_similarity(original_tfidf, anonymized_tfidf)


average_similarity_score = similarity_scores.mean()

print("Average Cosine Similarity Score:", average_similarity_score)

Average Cosine Similarity Score: 0.09151148947839445


# Continuing my approach

In [7]:
anonymized_file_path = 'data/Fake_Dispatch_Data_first_pass.csv'
data = pd.read_csv(anonymized_file_path)

def contains_external_numbers(text):
    cleaned_text = re.sub(r'\[[^\]]+\]', '', text)
    print(f"Cleaned text: '{cleaned_text}'")
    contains_digits = any(char.isdigit() for char in cleaned_text)
    print(f"Contains digits: {contains_digits}") 
    return contains_digits

data['Contains Numbers'] = data['Anonymized Narrative'].apply(contains_external_numbers)

def highlight_numbers(text):
    return re.sub(r'\d+', '[NUMBER]', text)

data['Highlighted Anonymized Narrative'] = data.apply(
    lambda row: highlight_numbers(row['Anonymized Narrative']) if row['Contains Numbers'] else row['Anonymized Narrative'],
    axis=1
)

print(data[['Anonymized Narrative', 'Highlighted Anonymized Narrative', 'Contains Numbers']].head())

Cleaned text: ',  in , required help for a domestic disturbance. Contact via phone: ( or email: . Race: , follows .'
Contains digits: False
Cleaned text: ', born on  at , called due to fire. Reach at , . Caucasian, adheres to .'
Contains digits: False
Cleaned text: ' received regarding , born  in . Situation: theft. Call  or mail to . Profile: , religious belief: .'
Contains digits: False
Cleaned text: ' received regarding , born  in . Situation: water rescue. Call ( or mail to . Profile: , religious belief: .'
Contains digits: False
Cleaned text: ' received regarding , born  in . Situation: fire. Call  or mail to . Profile: , religious belief: .'
Contains digits: False
Cleaned text: ', born on  at , called due to traffic accident. Reach at , . , adheres to .'
Contains digits: False
Cleaned text: ', DOB:  in , required help for a fire. Contact via phone: ( or email: . Race: , follows .'
Contains digits: False
Cleaned text: ', :  in , required help for a theft. Contact via phone:  or em

In [8]:
anonymized_file_path = 'data/Fake_Dispatch_Data_first_pass.csv'
data = pd.read_csv(anonymized_file_path)

data.index += 2

def contains_external_numbers(text):
    cleaned_text = re.sub(r'\[[^\]]+\]', '', text)
    return any(char.isdigit() for char in cleaned_text)

data['Contains Numbers'] = data['Anonymized Narrative'].apply(contains_external_numbers)

def highlight_numbers(text):
    return re.sub(r'\d+', '[NUMBER]', text)

data['Highlighted Anonymized Narrative'] = data.apply(
    lambda row: highlight_numbers(row['Anonymized Narrative']) if row['Contains Numbers'] else row['Anonymized Narrative'],
    axis=1
)

rows_with_numbers = data[data['Contains Numbers']]

print(rows_with_numbers[['Anonymized Narrative', 'Highlighted Anonymized Narrative']])

                                  Anonymized Narrative  \
37   [PERSON], [ORG]: [DATE] in [GPE], required hel...   
94   [PERSON], [ORG]: [DATE] in [GPE], required hel...   
767  [PERSON], a resident of Robertview (born [DATE...   
821  Incident report: [PERSON] ([DATE], born in [PE...   
903  A report was made for [PERSON], who was born [...   
913  [PERSON] received regarding [ORG], born [ORG] ...   

                      Highlighted Anonymized Narrative  
37   [PERSON], [ORG]: [DATE] in [GPE], required hel...  
94   [PERSON], [ORG]: [DATE] in [GPE], required hel...  
767  [PERSON], a resident of Robertview (born [DATE...  
821  Incident report: [PERSON] ([DATE], born in [PE...  
903  A report was made for [PERSON], who was born [...  
913  [PERSON] received regarding [ORG], born [ORG] ...  
