## Check named entities
Name; Email; Address; Phone number; URL; Date and time; Numbers (Phone numbers, Policy numbers, Zip code)

In [189]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("anonymization")
import re

from anonymization.Anonymization import Anonymization, AnonymizerChain
from anonymization.anonymizers import EmailAnonymizer, NamedEntitiesAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, MacAddressAnonymizer, CreditCardAnonymizer, IbanAnonymizer, SignatureAnonymizer, NumberAnonymizer
from collections import defaultdict

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
NamedEntitiesAnonymizer('en_core_web_lg'), 

(<function anonymization.anonymizers.spacyAnonymizers.NamedEntitiesAnonymizer.<locals>.<lambda>(anonymization)>,)

In [186]:
text = 'Hello Allen can you send the documents to Mengkai.Xu@libertymutual.com? My phone number is 6178607353, DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. \
My personal website is https://iyf.tv/. Here is my policy ## 1232343123232. Name driver license number is S39273383. My social security number is 822-32-5434. My TPI is 932-71-3232. \
Emily please let me know if 03/23/2023 works for you.'

# This example covers all possible entities and listed info from Katie
text1 = 'Hello I am Steve Curry, here is my email steve.curry@gmail.com. My address is 10 Greenwood St., Los Angeles, CA 01234. My number is 6178424332. I will arrive in Boston \
on 04/23/2023. My credit card number is 1232-3343-3443-4343. My medical information is SP32343232343433 and health insurance is 9SP00254099. My password is LM@MX1323223. My TPI is 932-71-3232.'

text2 = "That's correct. The policy was canceled effective 4/3/2022."
print(text)

Hello Allen can you send the documents to Mengkai.Xu@libertymutual.com? My phone number is 6178607353, DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. My personal website is https://iyf.tv/. Here is my policy ## 1232343123232. Name driver license number is S39273383. My social security number is 822-32-5434. My TPI is 932-71-3232. Emily please let me know if 03/23/2023 works for you.


In [194]:
anon = AnonymizerChain(Anonymization('en_US'))
anon.add_anonymizers(EmailAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, CreditCardAnonymizer, IbanAnonymizer, NamedEntitiesAnonymizer('en_core_web_lg'), NumberAnonymizer)

In [195]:
# %%time
post_text = anon.anonymize(text1)
print(post_text)

email - steve.curry@gmail.com
phone_number - 6178424332
['Steve Curry', '10', 'Greenwood St.', 'Los Angeles', 'CA', 'Boston', '04/23/2023', '1232-3343-3443-4343', 'SP32343232343433', '9SP00254099', 'LM@MX1323223', 'TPI', '932']
['PERSON', 'CARDINAL', 'GPE', 'GPE', 'GPE', 'GPE', 'PERSON', 'CARDINAL', 'PRODUCT', 'DATE', 'ORG', 'ORG', 'CARDINAL']
name - Steve Curry
random_number - 10
address - Greenwood St.
address - Los Angeles
address - CA
address - Boston
name - 04/23/2023
random_number - 1232-3343-3443-4343
date - 9SP00254099
random_number - 932
random_number - 01234
random_number - SP32343232343433 
random_number - MX1323223
random_number - 71
random_number - 3232
Hello I am [name_removed], here is my email [email_removed]. My address is [number_removed] [address_removed], [address_removed], [address_removed] [number_removed]. My number is [number_removed]. I will arrive in [address_removed] on [name_removed]. My credit card number is [number_removed]. My medical information is [numb

In [159]:
subject = 'Hello [name_removed] can you send the documents to [email_removed]? My phone number is [number_removed], DOB is [date_removed]. Address is [number_removed] [address_removed], [address_removed], MA 02148. My personal website is [uri_removed] Here is my policy [number_removed]. Name driver license number is S39273383. My social security number is [number_removed]-32-5434. My TPI is [number_removed]-71-3232. [name_removed] please let me what if 03/23/2023 works for you.'

In [161]:
# subject = "I am still 434343"
result = re.sub(
    r"""(?x) # verbose regex
    \b    # Start of word
    (?=   # Look ahead to ensure that this word contains...
     \w*  # (after any number of alphanumeric characters)
     \d   # ...at least one digit.
    )     # End of lookahead
    \w+   # Match the alphanumeric word
    \s*   # Match any following whitespace""", 
    "[number_mx]", subject)
result

'Hello [name_removed] can you send the documents to [email_removed]? My phone number is [number_removed], DOB is [date_removed]. Address is [number_removed] [address_removed], [address_removed], MA [number_mx]. My personal website is [uri_removed] Here is my policy [number_removed]. Name driver license number is [number_mx]. My social security number is [number_removed]-[number_mx]-[number_mx]. My TPI is [number_removed]-[number_mx]-[number_mx]. [name_removed] please let me what if [number_mx]/[number_mx]/[number_mx]works for you.'

## Test section
#### Test spaCy normalization: NamedEntitiesAnonymizer

In [5]:
import spacy
processor = spacy.load("en_core_web_lg")

In [6]:
text = 'Hello Allen can you send the documents to [email_removed]? My phone number is [phone_number_removed], DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. \
My personal website is [uri_removed] Here is my policy ## 1232343123232. I am twenty five'
doc = processor(text)
print(doc)
# remove whitespace entities and trim the entities
ents = [ent.text.strip() for ent in doc.ents if not ent.text.isspace()]
labels = [ent.label_ for ent in doc.ents if not ent.text.isspace()] # Refer to difference labels here https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy
print(ents, labels)

Hello Allen can you send the documents to [email_removed]? My phone number is [phone_number_removed], DOB is 09/02/1990. Address is 10 Overlook Ridge, Malden, MA 02148. My personal website is [uri_removed] Here is my policy ## 1232343123232. I am twenty five
['Allen', 'DOB', '09/02/1990', '10', 'Overlook Ridge', 'Malden', '## 1232343123232', 'twenty five'] ['PERSON', 'ORG', 'DATE', 'CARDINAL', 'GPE', 'GPE', 'MONEY', 'CARDINAL']


In [140]:
doc.ents

tuple

#### Test fake module

In [13]:
from collections import defaultdict
from faker import Factory
import time
anonDicts = {}
faker = Factory.create('en_US')

def getFake(provider: str, match: str) -> str:
    '''
    Return the fake equivalent of match using a Faker provider

    Example:
            getFake(provider="date", match="09/02/1990") -> "2023-01-06"        
    '''
    print(anonDicts, provider)
    if not provider in anonDicts:
        anonDicts[provider] = defaultdict(getattr(faker, provider))
        a = getattr(faker, provider)
        anonDicts['test'] = defaultdict(a)
    
    print(anonDicts, anonDicts[provider][match], anonDicts['test'][match])
    print(anonDicts['test'][match])
    return anonDicts, anonDicts[provider][match]

In [14]:
dic, string = getFake('date', 'PERSON')

{} date
{'date': defaultdict(<bound method Provider.date of <faker.providers.date_time.en_US.Provider object at 0x7f43e5de2880>>, {'PERSON': '2022-05-25'}), 'test': defaultdict(<bound method Provider.date of <faker.providers.date_time.en_US.Provider object at 0x7f43e5de2880>>, {'PERSON': '1983-09-17'})} 2022-05-25 1983-09-17
1983-09-17


In [59]:
dir(fake)

['_Generator__config',
 '_Generator__format_token',
 '_Generator__random',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'add_provider',
 'address',
 'am_pm',
 'ascii_company_email',
 'ascii_email',
 'ascii_free_email',
 'ascii_safe_email',
 'bank_country',
 'bban',
 'binary',
 'boolean',
 'bothify',
 'bs',
 'building_number',
 'catch_phrase',
 'century',
 'chrome',
 'city',
 'city_prefix',
 'city_suffix',
 'color_name',
 'company',
 'company_email',
 'company_suffix',
 'coordinate',
 'country',
 'country_code',
 'credit_card_expire',
 'credit_card_full',
 'credit_card_number',
 'credit_card_provider',
 'credit_card_security_code',
 'cryptocurrency',
 'cryptocurrency

In [76]:
import pandas as pd

In [111]:
class TextMask:
    
    def __init__(self, anonymizers: list):
        self.anon = AnonymizerChain(Anonymization('en_US'))
        for ano in anonymizers:
            self.anon.add_anonymizers(ano)
            
    def anonymize(self, text: str):
        return self.anon.anonymize(text)
    
anonymizers = [EmailAnonymizer, PhoneNumberAnonymizer, UriAnonymizer, DateAnonymizer, CreditCardAnonymizer, IbanAnonymizer, NamedEntitiesAnonymizer('en_core_web_lg')]
anon = TextMask(anonymizers)

In [112]:
df = pd.read_csv('anonymization_test_data.csv')[['messageList.body']]

In [113]:
print(df['messageList.body'].shape)
df.head(10)

(1306,)


Unnamed: 0,messageList.body
0,Liberty Mutual: Thank you for quoting! Finish ...
1,Stop
2,Liberty Mutual: You have opted out of this mes...
3,Liberty Mutual: Thank you for quoting! Finish ...
4,Stop
5,Liberty Mutual: You have opted out of this mes...
6,Liberty Mutual: Thank you for quoting! Finish ...
7,Stop
8,Liberty Mutual: You have opted out of this mes...
9,Liberty Mutual: Thank you for quoting! Finish ...


In [114]:
df['messageList.body'].unique().shape

(129,)

In [115]:
df = pd.DataFrame({'original': df['messageList.body'].unique()})

In [117]:
df['anonymized'] = df['original'].apply(lambda x: anon.anonymize(x))

In [120]:
df.to_csv('anonymized_test_data.csv', index=False)