## Check named entities
Name; Email; Address; Phone number; URL; Date and time; Numbers (Phone numbers, Policy numbers, Zip code)

In [68]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("anonymization")
import re

from anonymization.Anonymization import Anonymization, AnonymizerChain
from anonymization.anonymizers import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, MacAddressAnonymizer, CreditCardAnonymizer, IbanAnonymizer, SignatureAnonymizer, NumberAnonymizer
from collections import defaultdict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
NamedEntitiesAnonymizer('en_core_web_lg'), 

(<function anonymization.anonymizers.spacyAnonymizers.NamedEntitiesAnonymizer.<locals>.<lambda>(anonymization)>,)

In [69]:
text = 'Hello Allen can you send the documents to Mengkai.Xu@libertymutual.com? My phone number is 6178607353, DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. \
My personal website is https://iyf.tv/. Here is my policy ## 1232343123232. Name driver license number is S39273383. My social security number is 822-32-5434. My TPI is 932-71-3232. \
Emily please let me know if 03/23/2023 works for you.'

# This example covers all possible entities and listed info from Katie
text1 = 'Hello I am Steve Curry, here is my email steve.curry@gmail.com. My address is 10 Greenwood St., Los Angeles, CA 01234. My number is 6178424332. I will arrive in Boston \
on 04/23/2023. My credit card number is 1232-3343-3443-4343. My medical information is SP32343232343433 and health insurance is 9SP00254099. My password is LM@MX1323223. My TPI is 932-71-3232. \
I can help you today, it will take hours to finish and is supposed to be done by 05/13/2023.'

text2 = "That's correct. The policy was canceled effective 4/3/2022. Here is my social 933-42-1212."
print(text1)

Hello I am Steve Curry, here is my email steve.curry@gmail.com. My address is 10 Greenwood St., Los Angeles, CA 01234. My number is 6178424332. I will arrive in Boston on 04/23/2023. My credit card number is 1232-3343-3443-4343. My medical information is SP32343232343433 and health insurance is 9SP00254099. My password is LM@MX1323223. My TPI is 932-71-3232. I can help you today, it will take hours to finish and is supposed to be done by 05/13/2023.


In [70]:
anon = AnonymizerChain(Anonymization('en_US'))
anon.add_anonymizers(EmailAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, CreditCardAnonymizer, IbanAnonymizer, NamedEntitiesAnonymizer('en_core_web_lg'), NumberAnonymizer)

In [72]:
# %%time
post_text = anon.anonymize(text1)
print(post_text)

Hello I am [name_removed], here is my email [email_removed]. My address is [number_removed] [address_removed], [address_removed], [address_removed] [number_removed]. My number is [number_removed]. I will arrive in [address_removed] on [name_removed]. My credit card number is [number_removed]. My medical information is [number_removed]and health insurance is [date_removed]. My password is LM@[number_removed]. My TPI is [number_removed]-[number_removed]-[number_removed]. I can help you today, it will take hours to finish and is supposed to be done by [number_removed]/[number_removed]/[number_removed].


In [161]:
# subject = "I am still 434343"
result = re.sub(
    r"""(?x) # verbose regex
    \b    # Start of word
    (?=   # Look ahead to ensure that this word contains...
     \w*  # (after any number of alphanumeric characters)
     \d   # ...at least one digit.
    )     # End of lookahead
    \w+   # Match the alphanumeric word
    \s*   # Match any following whitespace""", 
    "[number_mx]", subject)
result

'Hello [name_removed] can you send the documents to [email_removed]? My phone number is [number_removed], DOB is [date_removed]. Address is [number_removed] [address_removed], [address_removed], MA [number_mx]. My personal website is [uri_removed] Here is my policy [number_removed]. Name driver license number is [number_mx]. My social security number is [number_removed]-[number_mx]-[number_mx]. My TPI is [number_removed]-[number_mx]-[number_mx]. [name_removed] please let me what if [number_mx]/[number_mx]/[number_mx]works for you.'

## Test section
#### Test spaCy normalization: NamedEntitiesAnonymizer

In [5]:
import spacy
processor = spacy.load("en_core_web_lg")

In [6]:
text = 'Hello Allen can you send the documents to [email_removed]? My phone number is [phone_number_removed], DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. \
My personal website is [uri_removed] Here is my policy ## 1232343123232. I am twenty five'
doc = processor(text)
print(doc)
# remove whitespace entities and trim the entities
ents = [ent.text.strip() for ent in doc.ents if not ent.text.isspace()]
labels = [ent.label_ for ent in doc.ents if not ent.text.isspace()] # Refer to difference labels here https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy
print(ents, labels)

Hello Allen can you send the documents to [email_removed]? My phone number is [phone_number_removed], DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. My personal website is [uri_removed] Here is my policy ## 1232343123232. I am twenty five
['Allen', 'DOB', '09/02/1990', '10', 'Overlook Ridge', 'Malden', '## 1232343123232', 'twenty five'] ['PERSON', 'ORG', 'DATE', 'CARDINAL', 'GPE', 'GPE', 'MONEY', 'CARDINAL']


In [140]:
doc.ents

tuple

#### Test fake module

In [13]:
from collections import defaultdict
from faker import Factory
import time
anonDicts = {}
faker = Factory.create('en_US')

def getFake(provider: str, match: str) -> str:
    '''
    Return the fake equivalent of match using a Faker provider

    Example:
            getFake(provider="date", match="09/02/1990") -> "2023-01-06"        
    '''
    print(anonDicts, provider)
    if not provider in anonDicts:
        anonDicts[provider] = defaultdict(getattr(faker, provider))
        a = getattr(faker, provider)
        anonDicts['test'] = defaultdict(a)
    
    print(anonDicts, anonDicts[provider][match], anonDicts['test'][match])
    print(anonDicts['test'][match])
    return anonDicts, anonDicts[provider][match]

In [14]:
dic, string = getFake('date', 'PERSON')

{} date
{'date': defaultdict(<bound method Provider.date of <faker.providers.date_time.en_US.Provider object at 0x7f43e5de2880>>, {'PERSON': '2022-05-25'}), 'test': defaultdict(<bound method Provider.date of <faker.providers.date_time.en_US.Provider object at 0x7f43e5de2880>>, {'PERSON': '1983-09-17'})} 2022-05-25 1983-09-17
1983-09-17


In [None]:
dir(fake)

In [73]:
import pandas as pd

In [74]:
class TextMask:
    
    def __init__(self, anonymizers: list):
        self.anon = AnonymizerChain(Anonymization('en_US'))
        for ano in anonymizers:
            self.anon.add_anonymizers(ano)
            
    def anonymize(self, text: str):
        return self.anon.anonymize(text)
    
anonymizers = [EmailAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, CreditCardAnonymizer, IbanAnonymizer, NamedEntitiesAnonymizer('en_core_web_lg'), NumberAnonymizer]
anon = TextMask(anonymizers)

In [75]:
df = pd.read_csv('anonymization_test_data.csv')[['messageList.body']]

In [76]:
print(df['messageList.body'].shape)
df.head(10)

(50,)


Unnamed: 0,messageList.body
0,"Hello, I regret to hear you are looking to can..."
1,5
2,You would want to pay at lease the past due am...
3,Maine
4,We do not process payments on the weekend. Whe...
5,Liberty Mutual: How can I help you today? By t...
6,"Hi there! To help get you to the right place, ..."
7,Text us your customer e questions here! No tha...
8,782 at the end not 182
9,"Thank you for texting! For future needs, pleas..."


In [77]:
df['messageList.body'].unique().shape

(37,)

In [78]:
df = pd.DataFrame({'original': df['messageList.body'].unique()})

In [79]:
df['anonymized'] = df['original'].apply(lambda x: anon.anonymize(x))

In [80]:
df.to_csv('anonymized_test_data.csv', index=False)

In [81]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)
not has_numbers("hour")

True