In [1]:
import pandas as pd
import numpy as np
import re
import phonenumbers
import time

In [2]:
class PIIFilter:
    '''This class uses regex to tag different categories of PII from the data'''
    
    
    def identify_email(self, text):
        '''Function to tag emails from the data'''
        
        regex = "^\S+@\S+\.\S+$"
        email_check=[]
        for each_entry in text: 
            
            email = re.findall(regex,str(each_entry))
            if email:
                email_check.append(each_entry)
        
        if len(email_check)>=(len(text)*0.7):
            return " contains PII (email)."
        
        return ""
    
    
    def identify_ssn(self,text):
        '''Function to tag ssn from the data'''
        
        regex = "^(?!666|000|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0{4})\\d{4}$"
        ssn_check = []
        for each_entry in text:
            ssn = re.findall(regex,str(each_entry))
            if ssn:
                ssn_check.append(ssn)
        
        # if 70% data belongs to this category, put a label
       
        if len(ssn_check)>=(len(text)*0.7):
            return " contains PII (SSN)."
        return ""
    
    
    def identify_phone_numbers(self,text):
        '''Function to tag phone numbers from the data'''
        
        phone_list = []
        regex = "^\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*$"
        
        for each_entry in text:
            phone = re.findall(regex,str(each_entry))
            if phone:
                phone_list.append(each_entry)
            if len(phone_list)>=(len(text)*0.7):
                return " contains PII (phone number)."
        
        return ""
    
    def identify_postal_codes(self,text):
        '''Function to tag postal codes from the data'''
        
        postal_list = []
        regex = "^[0-9]{5}(?:-[0-9]{4})?$"
        
        for each_entry in text:
            postal = re.findall(regex,str(each_entry))
            if postal:
                postal_list.append(each_entry)
            if len(postal_list)>=(len(text)*0.7):
                return " contains PII (postal code)."
        return ""
    
    def identify_credit_card_number(self,text):
        '''Function to tag credit card numbers from the data'''
        
        cc_list = []
        regex = "(^4[0-9]{12}(?:[0-9]{3})?$)|(^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$)|(3[47][0-9]{13})|(^3(?:0[0-5]|[68][0-9])[0-9]{11}$)|(^6(?:011|5[0-9]{2})[0-9]{12}$)|(^(?:2131|1800|35\d{3})\d{11}$)"
        for each_entry in text:
            each_entry = str(each_entry)
            each_entry = each_entry.replace("-","")
            cc = re.findall(regex,str(each_entry))
            if cc:
                cc_list.append(each_entry)
            if len(cc_list)>=(len(text)*0.7):
                return " contains PII (credit card number)."
        return ""
    
    def identify_drivers_license(self,text):
        '''Function to tag driving license from the data'''
        
        drivers_list = []
        regex = "^[A-Z](?:\d[- ]*){14}$"
        for each_entry in text:
            driver = re.findall(regex,str(each_entry))
            if driver:
                drivers_list.append(each_entry)
            if len(drivers_list)>=(len(text)*0.7):
                return " contains PII (Driving License number)."
        return ""
    
    def nlp_model(self,new_sample):
        counts = dict()

        for each_entry in new_sample:

            doc_stanza = nlp_stanza(str(each_entry)) 

            for ent in doc_stanza.ents:
                
                counts[ent.type] = counts.get(ent.type, 0) + 1
            dict(sorted(counts.items(), key=lambda item: item[1]))

        if counts:
            counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
            sortdict = dict(counts2)

            # get first key value pair from the dictionary
            first_pair = next(iter((sortdict.items())))

            #print('First Key: ', first_pair[0])
            #print('First Value: ', first_pair[1])
            if first_pair[1]>=(len(sortdict)*0.7):
                print(" contains PII " + first_pair[0] + ".")
    
    def filterPii(self, inputtext):
        analysis_text = ""
        analysis_text += self.identify_email(inputtext)
        analysis_text += self.identify_ssn(inputtext)
        analysis_text += self.identify_phone_numbers(inputtext)
        analysis_text += self.identify_postal_codes(inputtext)
        analysis_text += self.identify_credit_card_number(inputtext)
        analysis_text += self.identify_drivers_license(inputtext)
        
        #if analysis_text == "":
            #print("This may not contain PII.")
            #return
        return analysis_text
        


## Load Datasets

In [4]:
patients_data = pd.read_csv ('Patients_clean.csv')
provider_data = pd.read_csv ('provider_clean.csv')
sale_data = pd.read_csv ('sale_clean.csv')
dummy_data = pd.read_csv ('dummydata_clean.csv')

### We would be using sales data  

In [4]:
sale_data.head

<bound method NDFrame.head of          order_id  order_date      status  item_id                    sku  \
0       100354678   2020/10/1    received   574772     oasis_Oasis-064-36   
1       100354678   2020/10/1    received   574774        Fantastic_FT-48   
2       100354680   2020/10/1    complete   574777        mdeal_DMC-610-8   
3       100354680   2020/10/1    complete   574779     oasis_Oasis-061-36   
4       100367357  2020/11/13    received   595185    MEFNAR59C38B6CA08CD   
...           ...         ...         ...      ...                    ...   
286387  100562365   2021/9/30        paid   905179    APPCHA5AF14939B8F8A   
286388  100562376   2021/9/30         cod   905191    MEFCOT5A8D1E973B886   
286389  100562383   2021/9/30         cod   905200  WOFVAL59D5EA84167F9-M   
286390  100562384   2021/9/30         cod   905202  WOFNIG5B4D7EB0E9FDD-L   
286391  100562386   2021/9/30  processing   905205    MATHUA5AF70A7D1E50A   

        qty_ordered   price   value  discount

In [5]:
a = PIIFilter()

In [6]:
random_sample = sale_data['SSN'].sample(n=1000, random_state = 101)
a.filterPii(random_sample)

' contains PII (SSN).'

In [7]:
?random_sample

In [8]:
columns = list(sale_data)
random_sample = sale_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = a.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
    else:
        print("The column " + column + str(result))

The column order_id may not contain PII.
The column order_date may not contain PII.
The column status may not contain PII.
The column item_id may not contain PII.
The column sku may not contain PII.
The column qty_ordered may not contain PII.
The column price may not contain PII.
The column value may not contain PII.
The column discount_amount may not contain PII.
The column total may not contain PII.
The column category may not contain PII.
The column payment_method may not contain PII.
The column bi_st may not contain PII.
The column cust_id contains PII (postal code).
The column year may not contain PII.
The column month may not contain PII.
The column ref_num may not contain PII.
The column Name Prefix may not contain PII.
The column First Name may not contain PII.
The column Middle Initial may not contain PII.
The column Last Name may not contain PII.
The column Gender may not contain PII.
The column age may not contain PII.
The column full_name may not contain PII.
The column E M

## Dummy dataset 

In [9]:
b = PIIFilter()

In [10]:
columns = list(dummy_data)
random_sample = dummy_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = b.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
    else:
        print("The column " + column + str(result))

The column Full_Name may not contain PII.
The column SSN contains PII (SSN).
The column Credit_Card_Number contains PII (credit card number).


## patients dataset

In [11]:
patients_data.head

<bound method NDFrame.head of                                           Id   BIRTHDATE          SSN  \
0       1ff7f10f-a204-4bb1-aa72-dd763fa99482   8/24/2017  999-68-6630   
1       9bcf6ed5-d808-44af-98a0-7d78a29ede72    8/1/2016  999-15-5895   
2       5163c501-353c-4a82-b863-a3f1df2d6cf1    1/9/2004  999-73-2461   
3       cc3c806f-4a09-4a89-a990-4286450956be  11/15/1996  999-60-7372   
4       bd1c4ffc-7f1d-4590-adbb-1d6533fb623e   6/12/2019  999-81-4349   
...                                      ...         ...          ...   
124145  1ecfda69-7afc-4417-8a6f-c00be1be96dc  12/26/1934  999-42-9136   
124146  8cf835a7-f161-4fe9-a559-350c97a3450e   9/15/1944  999-62-9011   
124147  503d768f-481c-46e2-bcdb-a6116686351a  12/26/1934  999-19-6698   
124148  2599e9d9-ca59-44ec-a28c-9eae219f162d  12/26/1934  999-89-4620   
124149  f8d85cff-037c-4313-9448-14ac57d586a8  12/26/1934  999-45-7864   

          DRIVERS    PASSPORT      FIRST       LAST MARITAL   RACE  \
0             NaN      

In [12]:
pd.isna(patients_data).sum()
patients_data.fillna('unknown', inplace=True)

In [13]:
pd.isna(patients_data).sum()
pd.isna(sale_data).sum()

order_id            0
order_date          0
status              0
item_id             0
sku                 0
qty_ordered         0
price               0
value               0
discount_amount     0
total               0
category            0
payment_method      0
bi_st               0
cust_id             0
year                0
month               0
ref_num             0
Name Prefix         0
First Name          0
Middle Initial      0
Last Name           0
Gender              0
age                 0
full_name           0
E Mail              0
Customer Since      0
SSN                 0
Phone No.           0
Place Name          0
County              0
City                0
State               0
Zip                 0
Region              0
User Name           0
Discount_Percent    0
dtype: int64

## spacy

In [18]:
!pip install spacy
!python -m spacy download en_core_web_sm



2022-11-09 13:53:28.055931: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2022-11-09 13:53:28.056315: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-09 13:53:28.058987: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: LAPTOP-6CPLOTQ1
2022-11-09 13:53:28.059052: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: LAPTOP-6CPLOTQ1



[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
import spacy 
nlp = spacy.load('en_core_web_sm')

In [11]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [21]:
#columns = list(sale_data)
random_sample = sale_data.sample(n=1000, random_state = 121)


In [22]:
result = ""
new_sample = random_sample["Last Name"]
counts = dict()
?counts
for each_entry in new_sample:
    doc = nlp(each_entry)
    
    for ent in doc.ents:
        #print(ent.text, "|", ent.label_)
        counts[ent.label_] = counts.get(ent.label_, 0) + 1
    #dict(sorted(counts.items(), key=lambda item: item[1]))
    #print(counts)
    
counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
sortdict = dict(counts2)
print(sortdict)

# get first key value pair from the dictionary
first_pair = next(iter((sortdict.items())))

print('First Key: ', first_pair[0])
print('First Value: ', first_pair[1])

{'ORG': 227, 'GPE': 90, 'PERSON': 89, 'PRODUCT': 12, 'NORP': 7, 'DATE': 3, 'FAC': 2, 'WORK_OF_ART': 1}
First Key:  ORG
First Value:  227


In [6]:
!pip install stanza
import stanza



## stanza

In [7]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-12-05 19:56:05 INFO: Downloading default packages for language: en (English) ...
2022-12-05 19:56:06 INFO: File exists: C:\Users\mchou\stanza_resources\en\default.zip
2022-12-05 19:56:10 INFO: Finished downloading models and saved to C:\Users\mchou\stanza_resources.


In [8]:
# initialize stanza pipeline.

nlp_stanza = stanza.Pipeline(lang='en', processors='tokenize,ner')

2022-12-05 19:56:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-12-05 19:56:11 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-12-05 19:56:11 INFO: Use device: cpu
2022-12-05 19:56:11 INFO: Loading: tokenize
2022-12-05 19:56:11 INFO: Loading: ner
2022-12-05 19:56:11 INFO: Done loading processors!


In [34]:
random_sample = sale_data.sample(n=1000, random_state = 121)
result = ""
new_sample = random_sample["Last Name"]
counts = dict()
#?counts
for each_entry in new_sample:
    doc_stanza = nlp_stanza(str(each_entry))
    
    for ent in doc_stanza.ents:
        #print(f'{ent.text}_{ent.type}')
        counts[ent.type] = counts.get(ent.type, 0) + 1
    dict(sorted(counts.items(), key=lambda item: item[1]))
    #print(counts)
    
counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
sortdict = dict(counts2)
print(sortdict)

print(counts)


# get first key value pair from the dictionary
first_pair = list(sortdict.items())[0]

print('First Key: ', first_pair[0])
print('First Value: ', first_pair[1])
if first_pair[1]>=(len(sortdict)*0.7):
    print(" contains PII " + first_pair[0] + ".")

{'PERSON': 856, 'GPE': 9, 'ORG': 8, 'DATE': 1}
{'PERSON': 856, 'ORG': 8, 'DATE': 1, 'GPE': 9}
First Key:  PERSON
First Value:  856
 contains PII PERSON.


In [35]:
columns = list(sale_data)
random_sample = sale_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    counts = dict()
    #?counts
    for each_entry in new_sample:
        #print(each_entry)
        doc_stanza = nlp_stanza(str(each_entry)) 

        for ent in doc_stanza.ents:
            print(f'{ent.text}_{ent.type}')
            counts[ent.type] = counts.get(ent.type, 0) + 1
        dict(sorted(counts.items(), key=lambda item: item[1]))
        #print(counts)

    if counts:
        counts2_stanza = sorted(counts.items(), key=lambda item: item[1],reverse=True)
        sortdict_stanza = dict(counts2_stanza)
        #print(sortdict_stanza)

        # get first key value pair from the dictionary
        first_pair = next(iter((sortdict.items())))

        #print('First Key: ', first_pair[0])
        #print('First Value: ', first_pair[1])
        if first_pair[1]>=(len(sortdict)*0.7):
            print(" contains PII " + first_pair[0] + ".")

Samsung_ORG
MATSAM5A81910170A08_PERSON
MEFFIS5A9FD9FABB2F0_PERSON
MEFQQ59BAA5D39B2AF_PERSON
MATKIN59ACA113B0ABA_PERSON
BKSZIA59AE4AE25A595_PERSON
KABSEN59C1056BB52B4_PERSON
ENTECO5A7FECFAE81FB_PERSON
ENTPAN5A0CACC6C5FEC_PERSON
BAGKEM5A534BA655003_PERSON
HALMAN5AFD5197EF8AE_PERSON
HASALA59BA8DE5AFEA2_PERSON
BAGJUN59C10C183E903_PERSON
Nimcos_Gathia_PERSON
MATSAM5A7DBD43A7652_PERSON
MATSAM5A7DBD43A7652_PERSON
Lenovo_ORG
MATSAM5A7DBD43A7652_PERSON
MATTEL59BAA59306FF0_PERSON
HASQAR5A5369144BA78_PERSON
RS_Soan Papri-250gm_PERSON
HALOME5A097388BA6AF_PERSON
WOFGUL5ACDFBAFE3BAB_PERSON
MATHUA59B7EAABAA0AE_PERSON
{'PERSON': 22, 'ORG': 2}
First Key:  PERSON
First Value:  856
 contains PII PERSON.
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
1_CARDINAL
3_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
21_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
3_CARDINAL
5_CARDINAL
6_CARDINAL
6_CARDINAL
2_CARDINAL
4_CARDINAL
3_CARDINAL
1_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDIN

3_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
3_CARDINAL
11_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
3_CARDINAL
1_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDINAL
3_CARDINAL
1_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
6_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
3_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
2_CARDINAL
1_CARDINAL
2_CARDINAL
2_CARDINAL
4_CARDINAL
3_CARDINAL
6_CARDINAL
1_CARDINAL
2_CARDINAL
4_CARDINAL
2_CARDINA

49.9_CARDINAL
34.2_CARDINAL
49.9_CARDINAL
794.5_CARDINAL
2968.8_CARDINAL
79.5_CARDINAL
79.5_CARDINAL
52.0_CARDINAL
199.9_CARDINAL
84.9_CARDINAL
794.5_CARDINAL
149.9_CARDINAL
86.8_CARDINAL
1400.0_CARDINAL
149.9_CARDINAL
178.2_CARDINAL
200.0_CARDINAL
50.0_CARDINAL
4.6_CARDINAL
11.5_CARDINAL
13.3_CARDINAL
842.6_CARDINAL
95.0_CARDINAL
1200.0_CARDINAL
59.9_CARDINAL
98.0_CARDINAL
670.0_CARDINAL
28.0_CARDINAL
479.1_CARDINAL
47.5_CARDINAL
59.9_CARDINAL
176.9_CARDINAL
2657.2_CARDINAL
272.6_CARDINAL
1376.3_CARDINAL
589.9_CARDINAL
79.5_CARDINAL
129.9_CARDINAL
954.0_CARDINAL
50.0_CARDINAL
195.6_CARDINAL
100.0_CARDINAL
47.0_CARDINAL
50.0_CARDINAL
79.0_CARDINAL
130.4_CARDINAL
90.8_CARDINAL
95.9_CARDINAL
75.0_CARDINAL
2845.7_CARDINAL
109.4_CARDINAL
69.9_CARDINAL
36.3_CARDINAL
50.0_CARDINAL
89.8_CARDINAL
49.9_CARDINAL
44.9_CARDINAL
1108.2_CARDINAL
999.9_CARDINAL
1999.8_CARDINAL
7.9_CARDINAL
50.0_CARDINAL
30.0_CARDINAL
6.5_CARDINAL
100.0_CARDINAL
48.45_CARDINAL
50.0_CARDINAL
1579.9_CARDINAL
89.0_CARDIN

360.0_CARDINAL
1147.4_CARDINAL
279.0_CARDINAL
69.9_CARDINAL
900.0_CARDINAL
59.9_CARDINAL
244.7_CARDINAL
39.9_CARDINAL
54.9_CARDINAL
27.9_CARDINAL
59.8_CARDINAL
93.4_CARDINAL
240.0_CARDINAL
99.9_PERCENT
25.0_CARDINAL
28.6_CARDINAL
15.0_CARDINAL
69.9_CARDINAL
59.9_CARDINAL
43.8_CARDINAL
114.8_CARDINAL
78.0_CARDINAL
1278.9_CARDINAL
150.0_CARDINAL
104.5_CARDINAL
1255.1_CARDINAL
49.0_CARDINAL
86.8_CARDINAL
140.0_CARDINAL
99.8_CARDINAL
84.9_CARDINAL
13.3_CARDINAL
69.9_CARDINAL
119.0_CARDINAL
75.9_CARDINAL
400.0_CARDINAL
599.8_CARDINAL
69.5_CARDINAL
1957.4_CARDINAL
38.7_CARDINAL
64.2_CARDINAL
810.6_CARDINAL
200.0_CARDINAL
397.5_CARDINAL
179.2_CARDINAL
64.4_CARDINAL
500.0_CARDINAL
886.5_CARDINAL
399.8_CARDINAL
57.0_CARDINAL
200.0_CARDINAL
2958.8_CARDINAL
49.9_CARDINAL
36.0_CARDINAL
21.2_CARDINAL
7.4_CARDINAL
179.9_CARDINAL
170.5_CARDINAL
34.9_CARDINAL
199.9_CARDINAL
23.7_CARDINAL
9.631_CARDINAL
300.0_CARDINAL
249.0_CARDINAL
180.0_CARDINAL
69.9_CARDINAL
49.9_CARDINAL
44.9_CARDINAL
600.0_CARDINA

24.9_CARDINAL
323.8_CARDINAL
240.0_CARDINAL
1958.0_CARDINAL
99.8_CARDINAL
259.8_CARDINAL
41.5_CARDINAL
23.5_CARDINAL
49.0_CARDINAL
35.0_CARDINAL
179.9_CARDINAL
72.0_CARDINAL
144.0_CARDINAL
734.0_CARDINAL
100.0_CARDINAL
12.5_CARDINAL
109.8_CARDINAL
100.0_CARDINAL
1200.0_CARDINAL
150.0_CARDINAL
838.9_CARDINAL
250.0_CARDINAL
1458.4_CARDINAL
69.9_CARDINAL
209.8_CARDINAL
234.8_CARDINAL
99.8_CARDINAL
16.8_CARDINAL
104.0_CARDINAL
2.7_CARDINAL
200.0_CARDINAL
{'CARDINAL': 704, 'PERCENT': 11}
First Key:  PERSON
First Value:  856
 contains PII PERSON.
96.35_CARDINAL
1050.0_CARDINAL
31.96_CARDINAL
300.0_CARDINAL
43.7_CARDINAL
79.35_CARDINAL
1400.0_CARDINAL
100.0_CARDINAL
572.8_CARDINAL
176.84_CARDINAL
470.0_CARDINAL
100.0_CARDINAL
32.94_CARDINAL
140.0_CARDINAL
1700.0_CARDINAL
150.0_CARDINAL
241.29_CARDINAL
200.0_CARDINAL
265.23_CARDINAL
1600.0_CARDINAL
251.46_CARDINAL
240.8_CARDINAL
579.96_CARDINAL
142.095_CARDINAL
839.52_CARDINAL
305.73_CARDINAL
234.58_CARDINAL
203.0_CARDINAL
87.64_CARDINAL
40.0_

130.4_CARDINAL
90.8_CARDINAL
95.9_CARDINAL
75.0_CARDINAL
109.4_CARDINAL
69.9_CARDINAL
32.67_CARDINAL
250.0_CARDINAL
89.8_CARDINAL
44.9_CARDINAL
1999.8_CARDINAL
250.0_CARDINAL
30.0_CARDINAL
325.0_CARDINAL
300.0_CARDINAL
48.45_CARDINAL
250.0_CARDINAL
89.0_CARDINAL
125.0_CARDINAL
46.22_CARDINAL
1395.0_CARDINAL
458.0_CARDINAL
357.44_CARDINAL
325.95_CARDINAL
179.8_CARDINAL
40.0_CARDINAL
374.65_CARDINAL
123.6_CARDINAL
200.0_CARDINAL
39.9_CARDINAL
308.0_CARDINAL
69.9_CARDINAL
29.9_CARDINAL
1955.6_CARDINAL
400.0_CARDINAL
44.01_CARDINAL
100.0_CARDINAL
1199.9_CARDINAL
10.0_CARDINAL
160.0_CARDINAL
1799.9_CARDINAL
300.0_CARDINAL
721.0_CARDINAL
1555.5_CARDINAL
400.0_CARDINAL
37.5_CARDINAL
34.9_CARDINAL
49.9_CARDINAL
260.0_CARDINAL
35.5_CARDINAL
239.84_CARDINAL
70.0_CARDINAL
1600.0_CARDINAL
125.0_CARDINAL
59.9_CARDINAL
599.9_CARDINAL
85.0_CARDINAL
63.0_CARDINAL
254.0_CARDINAL
100.0_CARDINAL
56.8165_CARDINAL
16.5_CARDINAL
16.8_CARDINAL
1197.0_CARDINAL
778.9_CARDINAL
249.9_CARDINAL
41.7_CARDINAL
694.5

Men's Fashion_ORG
Beauty & Grooming_ORG
Soghaat_PERSON
Beauty & Grooming_ORG
Beauty & Grooming_ORG
Men's Fashion_ORG
Home & Living_ORG
Health & Sports_ORG
Men's Fashion_ORG
Home & Living_ORG
Men's Fashion_ORG
School & Education_ORG
Men's Fashion_ORG
Home & Living_ORG
Kids & Baby_ORG
Beauty & Grooming_ORG
Men's Fashion_ORG
Health & Sports_ORG
Health & Sports_ORG
Men's Fashion_ORG
Home & Living_ORG
Beauty & Grooming_ORG
Beauty & Grooming_ORG
School & Education_ORG
Beauty & Grooming_ORG
Health & Sports_ORG
Beauty & Grooming_ORG
Health & Sports_ORG
Home & Living_ORG
Beauty & Grooming_ORG
Soghaat_PERSON
Health & Sports_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Home & Living_ORG
Soghaat_PERSON
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Beauty & Grooming_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Men's Fashion_ORG
Beauty & Grooming_ORG
Men's Fashion_ORG
Soghaat_PERSON
Men's Fashion_ORG
Beauty & Grooming_ORG
Men's Fa

Easypay_MA_ORG
Easypay_PERSON
Easypay_PERSON
Payaxis_PERSON
Payaxis_PERSON
Easypay_PERSON
Payaxis_PERSON
Easypay_MA_ORG
Easypay_PERSON
Payaxis_PERSON
Payaxis_PERSON
Easypay_MA_ORG
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Payaxis_PERSON
Payaxis_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_MA_ORG
Easypay_MA_ORG
Payaxis_PERSON
Easypay_MA_ORG
Payaxis_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_MA_ORG
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_MA_ORG
Easypay_PERSON
Payaxis_PERSON
Easypay_PERSON
Easypay_PERSON
Payaxis_PERSON
Easypay_PERSON
Easypay_PERSON
Payaxis_PERSON
Payaxis_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Easypay_PERSON
Payaxis_PERSON
Easypay_PERSON
Payaxis_PERSON
Easypay_PERSON
{'PERSON': 348, 'ORG': 43}
First Key:  PERSON
First Value:  856
 contains PII PERSON.
56_CARDINAL
800_CARDINAL
141_CARDINAL
56_CARDINAL
180_CARDINAL
1963_DATE
114_CARDINAL
196

Margorie_PERSON
Dan_PERSON
Eli_PERSON
Agueda_PERSON
Roman_PERSON
Lyle_PERSON
Venessa_PERSON
Delmar_PERSON
Davis_PERSON
Elois_PERSON
Britt_PERSON
Sulema_PERSON
Monroe_GPE
Liz_PERSON
Jackie_PERSON
Bret_PERSON
Renato_PERSON
Irving_PERSON
Trena_PERSON
Edmond_PERSON
Faye_PERSON
Arnulfo_PERSON
Deandre_PERSON
Arleen_PERSON
Lester_PERSON
Carolina_GPE
Lucina_PERSON
Vilma_PERSON
Wilson_PERSON
Eugenio_PERSON
Jenny_PERSON
Hortencia_PERSON
Jona_PERSON
Troy_PERSON
Roosevelt_PERSON
Myrl_PERSON
Gregory_PERSON
Nestor_PERSON
Laurinda_PERSON
Reyes_PERSON
Pearl_PERSON
Joel_PERSON
Fidela_PERSON
Terrence_PERSON
Bert_PERSON
Marie_PERSON
Eloisa_PERSON
Hal_PERSON
Alejandro_PERSON
Angelo_PERSON
Cyrstal_PERSON
Willette_PERSON
Dwight_PERSON
Karl_PERSON
Sherri_PERSON
Brooks_PERSON
Sherita_PERSON
Joel_PERSON
Darlene_PERSON
Genevieve_PERSON
Hailey_PERSON
Janey_PERSON
Josefa_PERSON
Shanell_PERSON
Brett_PERSON
Jose_PERSON
Donella_PERSON
Timothy_PERSON
Melva_PERSON
Cory_PERSON
Sharyl_PERSON
Gregory_PERSON
Christoper_PE

Anjelica_PERSON
Magan_PERSON
Roland_PERSON
Joel_PERSON
Julius_PERSON
Emmett_PERSON
Dalia_PERSON
Shaunna_PERSON
Alfred_PERSON
Ray_PERSON
Odessa_GPE
Nicolasa_PERSON
Jordan_PERSON
Nilsa_PERSON
Craig_PERSON
Gerard_PERSON
Vasiliki_PERSON
Peter_PERSON
Williams_PERSON
Malissa_PERSON
Sheridan_PERSON
Dallas_GPE
Max_PERSON
Deetta_PERSON
Marlin_PERSON
Ignacio_PERSON
Sheryl_PERSON
Hailey_PERSON
Jake_PERSON
Kelsie_PERSON
April_DATE
Melida_PERSON
Luke_PERSON
Troy_PERSON
Alona_PERSON
Mazie_PERSON
Barbara_PERSON
Norbert_PERSON
Salvatore_PERSON
Talisha_PERSON
Tawny_PERSON
Lizette_PERSON
Anjelica_PERSON
Marshall_PERSON
Fausto_PERSON
Casey_PERSON
Caleb_PERSON
Alfonso_PERSON
Ethel_PERSON
Terrence_PERSON
Sonny_PERSON
Paris_GPE
Chang_PERSON
Rueben_PERSON
Cody_PERSON
Luba_PERSON
Stevie_PERSON
Angeles_GPE
Kandra_PERSON
Jules_PERSON
Edelmira_PERSON
Malissa_PERSON
Tish_PERSON
Nova_PERSON
{'PERSON': 936, 'GPE': 24, 'DATE': 2, 'ORG': 2, 'NORP': 1}
First Key:  PERSON
First Value:  856
 contains PII PERSON.
Schiff_

Sheard_PERSON
Clyne_PERSON
Venne_PERSON
Caine_PERSON
Sapien_PERSON
Marceau_PERSON
Waggoner_PERSON
Batz_PERSON
Carl_PERSON
Greenberg_PERSON
Pickering_PERSON
Esser_PERSON
Shoup_PERSON
Moreland_PERSON
Coronado_GPE
Trafton_PERSON
Wethington_GPE
Leflore_PERSON
Schumacher_PERSON
Menendez_PERSON
Deckert_PERSON
Gendron_PERSON
Cheever_PERSON
Willilams_PERSON
Marceau_PERSON
Morissette_PERSON
Schrom_PERSON
Montemayor_PERSON
Loflin_PERSON
Whittemore_PERSON
Tibbits_PERSON
Demott_PERSON
Petties_PERSON
Mcquiston_PERSON
Kavanaugh_PERSON
Madson_PERSON
Roscoe_PERSON
Figaro_PERSON
Bester_PERSON
Weekley_PERSON
Herrod_PERSON
Abbate_PERSON
Carbaugh_PERSON
Hefley_PERSON
Kurt_PERSON
Albers_PERSON
Leedy_PERSON
Maury_PERSON
Bolanos_PERSON
Kocher_PERSON
Lichtenstein_PERSON
Gentner_PERSON
Kaiser_PERSON
Veitch_PERSON
Mcauley_PERSON
Brassell_PERSON
Westley_PERSON
Maciel_PERSON
Fondren_PERSON
Hardy_PERSON
Caine_PERSON
Maxie_PERSON
Arnold_PERSON
Stoddard_PERSON
Bisson_PERSON
Mccumber_PERSON
Ostby_PERSON
Stonge_PERSON

37_CARDINAL
32_CARDINAL
67_CARDINAL
24_CARDINAL
70_CARDINAL
41_CARDINAL
20_CARDINAL
68_CARDINAL
71_CARDINAL
23_CARDINAL
61_CARDINAL
49_CARDINAL
42_CARDINAL
38_CARDINAL
30_CARDINAL
46_CARDINAL
21_CARDINAL
64_CARDINAL
28_CARDINAL
72_CARDINAL
68_CARDINAL
61_CARDINAL
19_CARDINAL
42_CARDINAL
39_CARDINAL
42_CARDINAL
45_CARDINAL
74_CARDINAL
69_CARDINAL
24_CARDINAL
53_CARDINAL
44_CARDINAL
67_CARDINAL
24_CARDINAL
68_CARDINAL
47_CARDINAL
24_CARDINAL
40_CARDINAL
52_CARDINAL
71_CARDINAL
55_CARDINAL
37_CARDINAL
27_CARDINAL
32_CARDINAL
52_CARDINAL
45_CARDINAL
47_CARDINAL
32_CARDINAL
45_CARDINAL
35_CARDINAL
50_CARDINAL
45_CARDINAL
73_CARDINAL
72_CARDINAL
20_CARDINAL
58_CARDINAL
51_CARDINAL
48_CARDINAL
31_CARDINAL
51_CARDINAL
27_CARDINAL
42_CARDINAL
59_CARDINAL
65_CARDINAL
18_CARDINAL
39_CARDINAL
21_CARDINAL
67_CARDINAL
61_CARDINAL
56_CARDINAL
53_CARDINAL
51_CARDINAL
60_CARDINAL
21_CARDINAL
75_CARDINAL
32_CARDINAL
40_CARDINAL
40_CARDINAL
67_CARDINAL
47_CARDINAL
39_CARDINAL
42_CARDINAL
42_CARDINAL
63_C

61_CARDINAL
50_CARDINAL
69_CARDINAL
55_CARDINAL
63_CARDINAL
45_CARDINAL
32_CARDINAL
66_CARDINAL
33_CARDINAL
45_CARDINAL
24_CARDINAL
21_CARDINAL
29_CARDINAL
44_CARDINAL
42_CARDINAL
37_CARDINAL
50_CARDINAL
33_CARDINAL
64_CARDINAL
67_CARDINAL
37_CARDINAL
67_CARDINAL
33_CARDINAL
66_CARDINAL
44_CARDINAL
75_CARDINAL
46_CARDINAL
22_CARDINAL
24_CARDINAL
55_CARDINAL
73_CARDINAL
29_CARDINAL
68_CARDINAL
69_CARDINAL
60_CARDINAL
50_CARDINAL
43_CARDINAL
56_CARDINAL
39_CARDINAL
31_CARDINAL
65_CARDINAL
37_CARDINAL
72_CARDINAL
69_CARDINAL
72_CARDINAL
47_CARDINAL
39_CARDINAL
29_CARDINAL
39_CARDINAL
71_CARDINAL
22_CARDINAL
28_CARDINAL
19_CARDINAL
62_CARDINAL
41_CARDINAL
52_CARDINAL
70_CARDINAL
47_CARDINAL
31_CARDINAL
29_CARDINAL
35_CARDINAL
66_CARDINAL
21_CARDINAL
64_CARDINAL
35_CARDINAL
73_CARDINAL
71_CARDINAL
37_CARDINAL
52_CARDINAL
38_CARDINAL
65_CARDINAL
73_CARDINAL
24_CARDINAL
71_CARDINAL
25_CARDINAL
46_CARDINAL
19_CARDINAL
62_CARDINAL
68_CARDINAL
57_CARDINAL
38_CARDINAL
{'CARDINAL': 1000}
First Key

Tollison_PERSON
Lorelei_PERSON
Pickel_PERSON
Lionel_PERSON
Vannoy_PERSON
Lawanda_PERSON
Endsley_PERSON
Terence_PERSON
Flanigan_PERSON
Yong_PERSON
Troiano_PERSON
Tish_PERSON
Dusek_PERSON
Sophie_PERSON
Martine_PERSON
Theodore_PERSON
Les_PERSON
Flanders_GPE
Earnest_PERSON
Despres_PERSON
Emory_PERSON
Herrin_PERSON
Fidela_PERSON
Ekstrom_PERSON
Carina_PERSON
Barlowe_PERSON
Rudolf_PERSON
Hummel_PERSON
Starla_PERSON
Pierre_PERSON
Tomasa_PERSON
Amaya_PERSON
Laurence_PERSON
Mitzel_PERSON
Jarrett_PERSON
Jacobs_PERSON
Nanci_PERSON
Hajek_PERSON
Vernetta_PERSON
Dupras_PERSON
Hilaria_PERSON
Sharice_PERSON
Custis_PERSON
Marylou_PERSON
Roscoe_PERSON
Gabriel_PERSON
Baptiste_PERSON
Wesley_PERSON
Durgan_PERSON
Laurie_PERSON
Reay_PERSON
Garth_PERSON
Ezekiel_PERSON
Kaylene_PERSON
Dyal_PERSON
Carol_PERSON
Heintz_PERSON
Yang_PERSON
Beresford_PERSON
Tom_PERSON
Divito_PERSON
Sherlyn_PERSON
Elisha_PERSON
Colosimo_GPE
Eustolia_GPE
Welborn_PERSON
Corrie_PERSON
Kuebler_PERSON
Jimmie_PERSON
Li_PERSON
Bert_PERSON
Dob

Schumacher_PERSON
Celena_PERSON
Jenell_PERSON
Menendez_PERSON
Luciano_PERSON
Lakeesha_PERSON
Gendron_PERSON
Dorene_GPE
Rusty_PERSON
Willilams_PERSON
Terica_PERSON
Marceau_PERSON
Morissette_PERSON
Johnnie_PERSON
Schrom_PERSON
Coralee_PERSON
Montemayor_PERSON
Alan_PERSON
Loflin_PERSON
Mohammed_PERSON
Whittemore_PERSON
Bobbie_PERSON
Tibbits_PERSON
Cesar_PERSON
Demott_PERSON
Louise_PERSON
Petties_PERSON
Gilbert_PERSON
Dimmick_PERSON
Benita_PERSON
Mcquiston_GPE
Birgit_PERSON
Kavanaugh_PERSON
Oscar_PERSON
Sanora_PERSON
Madson_PERSON
Stephany_PERSON
Roscoe_PERSON
Sherrill_GPE
Figaro_PERSON
Joshua_PERSON
Bester_PERSON
Deborah_PERSON
Weekley_PERSON
Debbi_PERSON
Herrod_PERSON
Mayra_PERSON
Abbate_PERSON
Luke_PERSON
Carbaugh_PERSON
Dominic_PERSON
Hefley_PERSON
Randolph_PERSON
Kurt_PERSON
Deshawn_PERSON
Albers, Merrill_ORG
Leedy_PERSON
Savannah_GPE
Maury_PERSON
Hildegarde_PERSON
Bolanos_PERSON
Laurence_PERSON
Kocher_PERSON
Gaston_PERSON
Eulah_PERSON
Lichtenstein_GPE
Leola_PERSON
Gentner_PERSON
Sher

Fleischmann_PERSON
Darrell_PERSON
Mariscal_PERSON
Jon_PERSON
Lowenstein_PERSON
Georgette_PERSON
Sargent_PERSON
Lucien_PERSON
Lingle_PERSON
Olen_PERSON
Mccusker_PERSON
Doyle_PERSON
Severson_PERSON
Asa_PERSON
Nickens_PERSON
Sibyl_PERSON
Golliday_PERSON
Edith_PERSON
Yoho_PERSON
Jeanene_PERSON
Antoine_PERSON
Hitt_PERSON
Yasmine_PERSON
Malec_PERSON
Bailey_PERSON
Crigler_PERSON
Shela_PERSON
Teegarden_GPE
Michale_GPE
Eckenrode_GPE
Dwain_GPE
Gia_PERSON
Deshawn_PERSON
Casey_PERSON
Wehr_PERSON
Anjelica_PERSON
Epling_PERSON
Rene_PERSON
Granda_PERSON
Magan_GPE
Krout_PERSON
Roland_PERSON
Gonzalez_PERSON
Joel_PERSON
Dungan_GPE
Julius_PERSON
Talamantez_PERSON
Emmett_PERSON
Barrios_PERSON
Dalia_PERSON
Mcmichael_PERSON
Shaunna_PERSON
Byers_PERSON
Alfred_PERSON
Feeley_PERSON
Ray_PERSON
Grier_PERSON
Odessa_GPE
Nicolasa_PERSON
Jordan_PERSON
Nilsa_PERSON
Bray_PERSON
Craig_PERSON
Fuhr_PERSON
Gerard_PERSON
Mckelvey_PERSON
Vasiliki_PERSON
Buteau_PERSON
Peter_PERSON
Lafave_PERSON
Williams_PERSON
Rumfelt_PERSON

907-337-9004_TIME
217-276-2038_TIME
217-861-7640_TIME
316-248-9794_TIME
228-449-1342_TIME
219-750-3311_TIME
319-918-7998_TIME
303-461-9369_TIME
210-858-2862_TIME
212-976-1225_TIME
206-755-9496_TIME
202-324-7002_TIME
217-635-9180_TIME
480-471-1157_TIME
405-409-4529_TIME
217-861-7640_TIME
225-891-5249_TIME
479-729-9156_TIME
319-532-5081_TIME
205-667-4009_TIME
303-719-1639_TIME
339-297-4027_TIME
209-537-8015_TIME
231-544-4344_TIME
252-280-2062_TIME
423-420-5550_TIME
231-421-5434_TIME
210-965-7532_TIME
304-360-1480_TIME
229-642-7614_TIME
236-884-3043_TIME
229-417-1514_TIME
701-309-2532_TIME
307-313-5801_TIME
219-763-0128_TIME
605-873-0872_TIME
209-646-4807_TIME
319-664-4303_TIME
210-734-6405_TIME
205-976-4919_TIME
252-672-2257_TIME
209-365-1007_TIME
319-493-6052_TIME
209-776-8096_TIME
316-473-7115_TIME
304-566-7411_TIME
480-834-7845_TIME
209-823-8389_TIME
231-873-2561_TIME
219-918-9238_TIME
229-989-0667_TIME
479-310-8519_TIME
239-714-6494_TIME
219-546-2622_TIME
216-927-0141_TIME
304-341-39

304-838-5834_TIME
479-693-0400_TIME
236-532-1863_TIME
218-266-1067_TIME
907-280-5158_TIME
270-663-6086_TIME
212-283-0837_TIME
314-882-6188_TIME
209-526-7491_TIME
212-741-6430_TIME
210-727-3558_TIME
218-856-3568_TIME
319-685-3274_TIME
216-217-2170_TIME
239-349-1198_TIME
210-584-1412_TIME
209-442-4740_TIME
701-604-5553_TIME
339-476-1255_TIME
319-641-3780_TIME
236-633-9061_TIME
205-362-2381_TIME
702-433-7140_TIME
239-385-1119_TIME
218-274-9044_TIME
270-241-5847_TIME
203-963-9443_TIME
210-391-1749_TIME
236-884-4735_TIME
209-590-7080_TIME
217-482-2076_TIME
208-464-4867_TIME
215-701-2155_TIME
231-625-2004_TIME
307-748-2463_TIME
316-260-2805_TIME
217-861-7640_TIME
210-484-3660_TIME
228-733-7337_TIME
303-855-1758_TIME
236-803-8913_TIME
314-206-1814_TIME
808-766-7849_TIME
210-472-4652_TIME
209-572-3272_TIME
907-846-9147_TIME
405-772-1629_TIME
480-558-9764_TIME
236-292-2538_TIME
303-658-9304_TIME
802-803-2910_TIME
219-733-0818_TIME
401-425-8068_TIME
319-398-8275_TIME
239-910-7920_TIME
212-741-64

Milner_PERSON
Connellys Springs_PERSON
Mitchellsburg_GPE
Teton Village_LOC
Pasadena_GPE
Glenhaven_PERSON
Rochester_GPE
Saint Amant_PERSON
Camp Pendleton_FAC
Henderson_PERSON
Vaughan_PERSON
Miltonvale_PERSON
Miami_GPE
Cochise_PERSON
Clarksville_GPE
Cedar Grove_LOC
Frankfort_GPE
Sparta_GPE
Bouse_PERSON
Huntsville_GPE
Winter Haven_PERSON
Carlock_PERSON
Spearsville_GPE
Crawfordsville_GPE
Albany_GPE
Whitmire_PERSON
Lunenburg_GPE
Tahoe Vista_LOC
Sugar Grove_PERSON
Fulton_PERSON
Gaffney_PERSON
Berlin_GPE
Menomonie_PERSON
Landenberg_PERSON
Jesup_PERSON
San Antonio_GPE
Universal City_GPE
La Place_FAC
Tecumseh_PERSON
Merom_PERSON
Pueblo_GPE
Swanton_PERSON
Westerville_GPE
Pineville_GPE
Mims_PERSON
Turin_GPE
Magnolia_GPE
Ironton_PERSON
Le Grand_FAC
Chandler_PERSON
Brown City_GPE
Arlington_GPE
Amarillo_GPE
Cassville_GPE
Norfolk_GPE
Yankeetown_PERSON
Limon_GPE
Gastonia_GPE
San Jose_GPE
Silver Spring_PERSON
Elizabeth City_PERSON
Mcville_PERSON
Baldwin Park_PERSON
Dekalb_PERSON
Youngstown_GPE
Findlay_

McKinley_PERSON
Bertie_PERSON
El Paso_GPE
Barnwell_PERSON
Baxter_PERSON
Carlisle_PERSON
Turner_PERSON
Delaware_GPE
Henry_PERSON
Tolland_PERSON
San Patricio_LOC
Kane_PERSON
Rusk_PERSON
Allegan_PERSON
New York_GPE
Lehigh_ORG
Wilkes_PERSON
Monterey_GPE
Marion_PERSON
Columbia_GPE
Clark_PERSON
Victoria_PERSON
San Diego_GPE
Bell_PERSON
Los Angeles_GPE
Harris_PERSON
Murray_PERSON
Autauga_PERSON
Chenango_PERSON
Saluda_PERSON
Bedford_GPE
Sullivan_PERSON
Cook_PERSON
Sacramento_GPE
Calhoun_PERSON
North Slope_LOC
Fayette_GPE
Lawrence_PERSON
Wayne_PERSON
Sacramento_GPE
Fauquier_PERSON
Maricopa_GPE
Cook_PERSON
Broome_PERSON
Schuyler_PERSON
Lucas_PERSON
Mecklenburg_GPE
Baca_PERSON
Menard_PERSON
Saline_PERSON
Caledonia_GPE
Windham_PERSON
Clayton_PERSON
Ellis_PERSON
Cortland_GPE
Merrick_PERSON
Bradford_PERSON
Mohave_PERSON
Wheeler_PERSON
Pamlico_PERSON
Lafayette_GPE
Greene_PERSON
Camden_GPE
Fayette_GPE
Livingston_GPE
Lewis_PERSON
Butler_PERSON
Fayette_GPE
Kenton_PERSON
Tyler_PERSON
El Paso_GPE
Galvesto

Bailey_PERSON
Knox_PERSON
Fayette_GPE
Greene_PERSON
Jackson_PERSON
Middlesex_GPE
Randolph_PERSON
Indiana_GPE
Mecosta_PERSON
Harnett_PERSON
Fayette_GPE
Travis_PERSON
North Slope_LOC
Somerset_GPE
Perry_PERSON
Lafayette_GPE
Monterey_GPE
Herkimer_PERSON
Cerro Gordo_PERSON
Brevard_GPE
Anchorage Municipality_GPE
Hunterdon_PERSON
Charles_PERSON
Los Angeles_GPE
Santa Cruz_PERSON
Pontotoc_PERSON
Clark_PERSON
Crow Wing_PERSON
Mills_PERSON
Anoka_PERSON
Chester_PERSON
Bakersfield_GPE
Hennepin_PERSON
Fairfax_GPE
Palm Beach_GPE
Columbia_GPE
Barnstable_GPE
Santa Fe_PERSON
Chariton_PERSON
Polk_PERSON
Johnson_PERSON
Middlesex_GPE
Jim Wells_PERSON
Clark_PERSON
Douglas_PERSON
Middlesex_GPE
Rockingham_GPE
Okanogan_PERSON
Dallas_GPE
Vanderburgh_PERSON
Zapata_PERSON
Hamilton_PERSON
St. Clair_GPE
Davis_PERSON
Salem_GPE
Montgomery_GPE
Douglas_PERSON
Clark_PERSON
Geneva_GPE
Jefferson_PERSON
Harris_PERSON
Douglas_PERSON
Moffat_PERSON
Turner_PERSON
Montgomery_GPE
Los Angeles_GPE
Hamblen_PERSON
Adams_PERSON
Los A

Gillham_PERSON
Des Moines_GPE
Jack_PERSON
Denver_GPE
Ft Devens_PERSON
Concord_GPE
Trenton_GPE
Wichita_GPE
Big Bay_LOC
Montreat_PERSON
Atwood_PERSON
Lansing_GPE
San Antonio_GPE
Charleston_GPE
Jefferson_PERSON
Reston_GPE
Alamo_GPE
Sykeston_PERSON
Akron_GPE
Otto_PERSON
Goodland_PERSON
Crooks_PERSON
Cardiff By The Sea_PERSON
Larrabee_PERSON
Huntsville_GPE
Orrum_PERSON
Woodland Hills_GPE
Spirit Lake_LOC
Applegate_PERSON
Clyde_PERSON
Oatman_PERSON
Martinsville_GPE
Santa Clarita_PERSON
Douglas_PERSON
South Bend_GPE
Atlanta_GPE
Humphrey_PERSON
Saint Augustine_PERSON
Monterey_GPE
Lakeside Marblehead_LOC
Beards Fork_PERSON
Gleason_PERSON
Wilmington_GPE
Denver_GPE
Winchester_GPE
Matewan_PERSON
La Mesa_LOC
Thornton_PERSON
New Bloomington_GPE
Nye_PERSON
Hines_PERSON
Roderfield_PERSON
Kipnuk_PERSON
Edgerton_PERSON
Blackwell_PERSON
Atlanta_GPE
Finley_PERSON
Spearfish_PERSON
Simmesport_PERSON
West Palm Beach_GPE
Dekalb_PERSON
Noxapater_PERSON
New River_LOC
Valrico_PERSON
Accokeek_PERSON
Des Moines_GPE

15.0_CARDINAL
10.0_CARDINAL
10.0_CARDINAL
20.0_CARDINAL
15.0_CARDINAL
20.0_CARDINAL
20.0_CARDINAL
8.0_CARDINAL
20.0_CARDINAL
20.0_CARDINAL
10.0_CARDINAL
10.0_CARDINAL
9.494472107_TIME
10.0_CARDINAL
10.0_CARDINAL
15.0_CARDINAL
15.0_CARDINAL
7.5_CARDINAL
20.0_CARDINAL
5.0_CARDINAL
9.0_CARDINAL
10.0_CARDINAL
20.0_CARDINAL
20.0_CARDINAL
{'CARDINAL': 130, 'TIME': 2}
First Key:  PERSON
First Value:  856
 contains PII PERSON.


In [30]:
def nlp_model(new_sample):
    counts = dict()
    
    for each_entry in new_sample:
        
        doc_stanza = nlp_stanza(str(each_entry)) 

        for ent in doc_stanza.ents:
            #print(f'{ent.text}_{ent.type}')
            counts[ent.type] = counts.get(ent.type, 0) + 1
        dict(sorted(counts.items(), key=lambda item: item[1]))
        #print(counts)

    if counts:
        counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
        sortdict = dict(counts2)
        #print(sortdict)

        # get first key value pair from the dictionary
        first_pair = next(iter((sortdict.items())))

        #print('First Key: ', first_pair[0])
        #print('First Value: ', first_pair[1])
        if first_pair[1]>=(len(sortdict)*0.7):
            print(" contains PII " + first_pair[0] + ".")

In [31]:
a = PIIFilter()

In [32]:
random_sample = sale_data['SSN'].sample(n=1000, random_state = 101)
a.filterPii(random_sample)

' contains PII (SSN).'

In [33]:
columns = list(sale_data)
random_sample = sale_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = a.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
        nlp_model(new_sample)
    else:
        print("The column " + column + str(result))

The column order_id may not contain PII.
The column order_date may not contain PII.
The column status may not contain PII.
The column item_id may not contain PII.
The column sku may not contain PII.
 contains PII PERSON.
The column qty_ordered may not contain PII.
 contains PII CARDINAL.
The column price may not contain PII.
 contains PII CARDINAL.
The column value may not contain PII.
 contains PII CARDINAL.
The column discount_amount may not contain PII.
 contains PII CARDINAL.
The column total may not contain PII.
 contains PII CARDINAL.
The column category may not contain PII.
 contains PII ORG.
The column payment_method may not contain PII.
 contains PII PERSON.
The column bi_st may not contain PII.
The column cust_id contains PII (postal code).
The column year may not contain PII.
The column month may not contain PII.
 contains PII DATE.
The column ref_num may not contain PII.
The column Name Prefix may not contain PII.
The column First Name may not contain PII.
 contains PII PER

In [36]:
random_sample = dummy_data['Full_Name'].sample(n=1000, random_state = 101)

In [None]:
columns = list(dummy_data)
random_sample = dummy_data.sample(n=1000, random_state = 121)
column = "Full_Name"
result = ""
new_sample = random_sample["Full_Name"]
#?new_sample
result = nlp_model(new_sample)
if result == "": 
    print("The column " + column + " may not contain PII.")
    nlp_model(new_sample)
else:
    print("The column " + column + str(result))

In [9]:
# comparison between spacy and stanza

random_sample = sale_data.sample(n=1000, random_state = 121)
result = ""
new_sample = random_sample["Last Name"]
counts = dict()
#?counts

start_time = time.time()
for each_entry in new_sample:
    doc_stanza = nlp_stanza(str(each_entry))
    
    for ent in doc_stanza.ents:
        #print(f'{ent.text}_{ent.type}')
        counts[ent.type] = counts.get(ent.type, 0) + 1
    dict(sorted(counts.items(), key=lambda item: item[1]))
    #print(counts)
    
counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
sortdict = dict(counts2)
#print(sortdict)

#print(counts)


# get first key value pair from the dictionary
first_pair = list(sortdict.items())[0]

print('First Key: ', first_pair[0])
print('First Value: ', first_pair[1])
if first_pair[1]>=(len(sortdict)*0.7):
    print(" contains PII " + first_pair[0] + ".")
    print("Time to detection: %s seconds " % (time.time() - start_time))
    

First Key:  PERSON
First Value:  854
 contains PII PERSON.
Time to detection: 26.106595039367676 seconds 


In [14]:
result = ""
new_sample = random_sample["Last Name"]
counts = dict()


start_time = time.time()
for each_entry in new_sample:
    doc = nlp(each_entry)
    
    for ent in doc.ents:
        #print(ent.text, "|", ent.label_)
        counts[ent.label_] = counts.get(ent.label_, 0) + 1
    #dict(sorted(counts.items(), key=lambda item: item[1]))
    #print(counts)
    
counts2 = sorted(counts.items(), key=lambda item: item[1],reverse=True)
sortdict = dict(counts2)
#print(sortdict)

# get first key value pair from the dictionary
first_pair = next(iter((sortdict.items())))
if first_pair[1]>=(len(sortdict)*0.7):
    print('First Key: ', first_pair[0])
    print('First Value: ', first_pair[1])
    print("Time to detection: %s seconds " % (time.time() - start_time))

First Key:  ORG
First Value:  243
Time to detection: 3.456655979156494 seconds 
