In [71]:
import pandas as pd
import numpy as np
import re
import phonenumbers

In [250]:
class PIIFilter:
    def identify_email(self, text):
        regex = "^\S+@\S+\.\S+$"
        email_check=[]
        for each_entry in text: 
            #print(each_entry)
            email = re.findall(regex,str(each_entry))
            if email:
                email_check.append(each_entry)
        #email_check
        if len(email_check)>=(len(text)*0.7):
            return " contains PII (email)."
        #return "This may not contain PII."
        return ""
    
    def identify_ssn(self,text):
        regex = "^(?!666|000|9\\d{2})\\d{3}-(?!00)\\d{2}-(?!0{4})\\d{4}$"
        ssn_check = []
        for each_entry in text:
            ssn = re.findall(regex,str(each_entry))
            if ssn:
                ssn_check.append(ssn)
        #print(ssn_check)
        # if 70% data belongs to this category, put a label
        #print(len(ssn_check))

        if len(ssn_check)>=(len(text)*0.7):
            return " contains PII (SSN)."
        return ""
    
    
    def identify_phone_numbers(self,text):
        phone_list = []
        regex = "^\s*(?:\+?(\d{1,3}))?[-. (]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})(?: *x(\d+))?\s*$"
        for each_entry in text:
            phone = re.findall(regex,str(each_entry))
            if phone:
                phone_list.append(each_entry)
            if len(phone_list)>=(len(text)*0.7):
                return " contains PII (phone number)."
        #return "This may not contain PII."
        return ""
    
    def identify_postal_codes(self,text):
        postal_list = []
        regex = "^[0-9]{5}(?:-[0-9]{4})?$"
        for each_entry in text:
            postal = re.findall(regex,str(each_entry))
            if postal:
                postal_list.append(each_entry)
            if len(postal_list)>=(len(text)*0.7):
                return " contains PII (postal code)."
        return ""
    
    def identify_credit_card_number(self,text):
        cc_list = []
        regex = "(^4[0-9]{12}(?:[0-9]{3})?$)|(^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}$)|(3[47][0-9]{13})|(^3(?:0[0-5]|[68][0-9])[0-9]{11}$)|(^6(?:011|5[0-9]{2})[0-9]{12}$)|(^(?:2131|1800|35\d{3})\d{11}$)"
        for each_entry in text:
            each_entry = str(each_entry)
            each_entry = each_entry.replace("-","")
            cc = re.findall(regex,str(each_entry))
            if cc:
                cc_list.append(each_entry)
            if len(cc_list)>=(len(text)*0.7):
                return " contains PII (credit card number)."
        return ""
    
    def identify_drivers_license(self,text):
        drivers_list = []
        regex = "^[A-Z](?:\d[- ]*){14}$"
        for each_entry in text:
            driver = re.findall(regex,str(each_entry))
            if driver:
                drivers_list.append(each_entry)
            if len(drivers_list)>=(len(text)*0.7):
                return " contains PII (Driving License number)."
        return ""
    
    def filterPii(self, inputtext):
        analysis_text = ""
        analysis_text += self.identify_email(inputtext)
        analysis_text += self.identify_ssn(inputtext)
        analysis_text += self.identify_phone_numbers(inputtext)
        analysis_text += self.identify_postal_codes(inputtext)
        analysis_text += self.identify_credit_card_number(inputtext)
        analysis_text += self.identify_drivers_license(inputtext)
        
        #if analysis_text == "":
            #print("This may not contain PII.")
            #return
        return analysis_text
        


## Load Datasets

In [2]:
patients_data = pd.read_csv ('Patients_clean.csv')
provider_data = pd.read_csv ('provider_clean.csv')
sale_data = pd.read_csv ('sale_clean.csv')
dummy_data = pd.read_csv ('dummydata_clean.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### We would be using sales data  

In [212]:
sale_data["cust_id"].head

<bound method NDFrame.head of 0          60124
1          60124
2          60124
3          60124
4          60124
           ...  
286387    115323
286388    115324
286389    115325
286390    115325
286391    115326
Name: cust_id, Length: 286392, dtype: int64>

In [135]:
random_sample = sale_data['SSN'].sample(n=1000, random_state = 101)
a.filterPii(random_sample)

In [136]:
?random_sample

In [209]:
a = PIIFilter()

In [210]:
columns = list(sale_data)
random_sample = sale_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = a.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
    else:
        print("The column " + column + str(result))

The column order_id may not contain PII.
The column order_date may not contain PII.
The column status may not contain PII.
The column item_id may not contain PII.
The column sku may not contain PII.
The column qty_ordered may not contain PII.
The column price may not contain PII.
The column value may not contain PII.
The column discount_amount may not contain PII.
The column total may not contain PII.
The column category may not contain PII.
The column payment_method may not contain PII.
The column bi_st may not contain PII.
The column cust_id contains PII (postal code).
The column year may not contain PII.
The column month may not contain PII.
The column ref_num may not contain PII.
The column Name Prefix may not contain PII.
The column First Name may not contain PII.
The column Middle Initial may not contain PII.
The column Last Name may not contain PII.
The column Gender may not contain PII.
The column age may not contain PII.
The column full_name may not contain PII.
The column E M

## Dummy dataset 

In [227]:
b = PIIFilter()

In [228]:
columns = list(dummy_data)
random_sample = dummy_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = b.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
    else:
        print("The column " + column + str(result))

The column Full_Name may not contain PII.
The column SSN contains PII (SSN).
The column Credit_Card_Number contains PII (credit card number).


## patients dataset

In [229]:
patients_data.head

<bound method NDFrame.head of                                           Id   BIRTHDATE          SSN  \
0       1ff7f10f-a204-4bb1-aa72-dd763fa99482   8/24/2017  999-68-6630   
1       9bcf6ed5-d808-44af-98a0-7d78a29ede72    8/1/2016  999-15-5895   
2       5163c501-353c-4a82-b863-a3f1df2d6cf1    1/9/2004  999-73-2461   
3       cc3c806f-4a09-4a89-a990-4286450956be  11/15/1996  999-60-7372   
4       bd1c4ffc-7f1d-4590-adbb-1d6533fb623e   6/12/2019  999-81-4349   
...                                      ...         ...          ...   
124145  1ecfda69-7afc-4417-8a6f-c00be1be96dc  12/26/1934  999-42-9136   
124146  8cf835a7-f161-4fe9-a559-350c97a3450e   9/15/1944  999-62-9011   
124147  503d768f-481c-46e2-bcdb-a6116686351a  12/26/1934  999-19-6698   
124148  2599e9d9-ca59-44ec-a28c-9eae219f162d  12/26/1934  999-89-4620   
124149  f8d85cff-037c-4313-9448-14ac57d586a8  12/26/1934  999-45-7864   

          DRIVERS    PASSPORT      FIRST       LAST MARITAL   RACE  \
0             NaN      

In [231]:
pd.isna(patients_data).sum()

Id                         0
BIRTHDATE                  0
SSN                        0
DRIVERS                19686
PASSPORT               25386
FIRST                      0
LAST                       0
MARITAL                36121
RACE                       0
ETHNICITY                  0
GENDER                     0
BIRTHPLACE                 0
ADDRESS                    0
CITY                       0
STATE                      0
COUNTY                     0
ZIP                    58322
LAT                        0
LON                        0
HEALTHCARE_EXPENSES        0
HEALTHCARE_COVERAGE        0
full_name                  0
dtype: int64

### discuss with team. The cleaned data has blank values for drivers, passport, zip which we have considered PII.

In [251]:
c = PIIFilter()

In [252]:
columns = list(patients_data)
random_sample = patients_data.sample(n=1000, random_state = 121)

for column in columns:
    result = ""
    new_sample = random_sample[column]
    #?new_sample
    result = c.filterPii(new_sample)
    if result == "": 
        print("The column " + column + " may not contain PII.")
    else:
        print("The column " + column + str(result))

The column Id may not contain PII.
The column BIRTHDATE may not contain PII.
The column SSN may not contain PII.
The column DRIVERS may not contain PII.
The column PASSPORT may not contain PII.
The column FIRST may not contain PII.
The column LAST may not contain PII.
The column MARITAL may not contain PII.
The column RACE may not contain PII.
The column ETHNICITY may not contain PII.
The column GENDER may not contain PII.
The column BIRTHPLACE may not contain PII.
The column ADDRESS may not contain PII.
The column CITY may not contain PII.
The column STATE may not contain PII.
The column COUNTY may not contain PII.
The column ZIP may not contain PII.
The column LAT may not contain PII.
The column LON may not contain PII.
The column HEALTHCARE_EXPENSES may not contain PII.
The column HEALTHCARE_COVERAGE may not contain PII.
The column full_name may not contain PII.


In [243]:
patients_data.head

<bound method NDFrame.head of                                           Id   BIRTHDATE          SSN  \
0       1ff7f10f-a204-4bb1-aa72-dd763fa99482   8/24/2017  999-68-6630   
1       9bcf6ed5-d808-44af-98a0-7d78a29ede72    8/1/2016  999-15-5895   
2       5163c501-353c-4a82-b863-a3f1df2d6cf1    1/9/2004  999-73-2461   
3       cc3c806f-4a09-4a89-a990-4286450956be  11/15/1996  999-60-7372   
4       bd1c4ffc-7f1d-4590-adbb-1d6533fb623e   6/12/2019  999-81-4349   
...                                      ...         ...          ...   
124145  1ecfda69-7afc-4417-8a6f-c00be1be96dc  12/26/1934  999-42-9136   
124146  8cf835a7-f161-4fe9-a559-350c97a3450e   9/15/1944  999-62-9011   
124147  503d768f-481c-46e2-bcdb-a6116686351a  12/26/1934  999-19-6698   
124148  2599e9d9-ca59-44ec-a28c-9eae219f162d  12/26/1934  999-89-4620   
124149  f8d85cff-037c-4313-9448-14ac57d586a8  12/26/1934  999-45-7864   

          DRIVERS    PASSPORT      FIRST       LAST MARITAL   RACE  \
0             NaN      