In [1]:
import pandas as pd
import os

In [2]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'data-loss-prevention-test-74b082472d34.json'
print('Credentials from environ: {}'.format(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')))

Credentials from environ: data-loss-prevention-test-74b082472d34.json


In [3]:
#Name, E-mail, Street Address, Location, Phone Number
info_types = ['PERSON_NAME', 'EMAIL_ADDRESS','STREET_ADDRESS', 
                   'LOCATION', 'PHONE_NUMBER']

In [4]:
def deidentify_with_mask(
    project, input_str, info_types, masking_character=None, number_to_mask=0
):
    """Uses the Data Loss Prevention API to deidentify sensitive data in a
    string by masking it with a character.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_str: The string to deidentify (will be treated as text).
        masking_character: The character to mask matching sensitive data with.
        number_to_mask: The maximum number of sensitive characters to mask in
            a match. If omitted or set to zero, the API will default to no
            maximum.
    Returns:
        None; the response from the API is printed to the terminal.
    """

    # Import the client library
    import google.cloud.dlp

    # Instantiate a client
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Construct inspect configuration dictionary
    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types]
    }

    # Construct deidentify configuration dictionary
    deidentify_config = {
        "info_type_transformations": {
            "transformations": [
                {
                    "primitive_transformation": {
                        "character_mask_config": {
                            "masking_character": masking_character,
                            "number_to_mask": number_to_mask,
                        }
                    }
                }
            ]
        }
    }

    # Construct item
    item = {"value": input_str}

    # Call the API
    response = dlp.deidentify_content(
        parent,
        inspect_config=inspect_config,
        deidentify_config=deidentify_config,
        item=item,
    )
    
    

    # Print out the results.
    print(response.item.value)
    
    return response.item.value


In [5]:
deidentify_with_mask(project='data-loss-prevention-test',
                     input_str="Hello, my name is Stefan, Phone number is 111-223-4333, address is 405 Lexington Ave",
                     info_types=info_types)



Hello, my name is ******, Phone number is ************, address is *****************


'Hello, my name is ******, Phone number is ************, address is *****************'

In [14]:
df = pd.read_csv('fake_pii_data.csv')

In [15]:
df.head(5)

Unnamed: 0,names,address,email,phone,Unmasked_Text
0,Maisie,4650 CUSHING PARKWAY,pede@lacusCras.org,(671) 348-5817,My name is Maisie and my email is pede@lacusCr...
1,Rylee,44201 NOBEL DRIVE,risus@facilisis.net,(970) 853-8816,My name is Rylee and my email is risus@facilis...
2,Felix,5130 HACIENDA DRIVE,in@Maurisut.co.uk,(311) 434-8134,My name is Felix and my email is in@Maurisut.c...
3,Hiroko,345 COURT STREET,ullamcorper@sapien.ca,(713) 744-6780,My name is Hiroko and my email is ullamcorper@...
4,Dominique,600 GRANT STREET,nibh.dolor@penatibusetmagnis.co.uk,(470) 400-5847,My name is Dominique and my email is nibh.dolo...


In [16]:
df['Unmasked_Text'][0]

'My name is Maisie and my email is pede@lacusCras.org. My phone # is (671) 348-5817. My address is 4650 CUSHING PARKWAY'

In [17]:
df.tail(5)

Unnamed: 0,names,address,email,phone,Unmasked_Text
56,Chadwick,2702 LOVE FIELD DRIVE,sollicitudin@Aliquamvulputateullamcorper.net,(551) 961-8934,"yes, Chadwick is correct. Email is sollicitudi..."
57,Colette,12500 TI BOULEVARD,commodo.tincidunt@atlibero.net,(683) 413-0096,"yes, Colette is correct. Email is commodo.tinc..."
58,Kirby,100 MISSION RIDGE,amet.ante.Vivamus@fringillacursuspurus.org,(302) 878-0911,"yes, Kirby is correct. Email is amet.ante.Viva..."
59,Melvin,259 NORTH RADNOR-CHESTER ROAD,magna.Lorem.ipsum@dignissimlacusAliquam.ca,(153) 111-0581,"yes, Melvin is correct. Email is magna.Lorem.i..."
60,Barbara,1300 MORRIS DRIVE,sed@auctorullamcorpernisl.co.uk,(262) 173-7314,"yes, Barbara is correct. Email is sed@auctorul..."


In [18]:
df['Unmasked_Text'][85]

KeyError: 85

In [11]:
df['Unmasked_Text'][0]

'My name is Maisie and my email is pede@lacusCras.org. My phone # is (671) 348-5817. My address is 4650 CUSHING PARKWAY'

In [12]:
# Run deidentify_with_mask function on dataset

In [13]:
df['Content_masked'] = df.apply(lambda row: deidentify_with_mask(project='data-loss-prevention-test',
                                                                 input_str=row['Unmasked_Text'],
                                                                info_types=info_types) ,axis=1)

My name is ****** and my email is ******************. My phone # is **************. My address is ***************************
My name is ***** and my email is *******************. My phone # is **************. My address is *****************
My name is ***** and my email is *****************. My phone # is **************. My address is *******************
My name is ****** and my email is *********************. My phone # is **************. My address is ****************
My name is ********* and my email is **********************************. My phone # is **************. My address is ****************
My name is ****** and my email is *************************. My phone # is **************. My address is *****************************
My name is ****** and my email is ************************. My phone # is **************. My address is **************
My name is ****** and my email is ******************************. My phone # is **************. My address is 1 *********
My name is ***

hello, how are you? E-mail and phone numbers are: *********************************** and **************. my name is ****. Address is: **********************
yes, ******** is correct. Email is ******************************************** and phone number is: **************. Physical address is: *********************
yes, ******* is correct. Email is ****************************** and phone number is: **************. Physical address is: ******************
yes, ***** is correct. Email is ****************************************** and phone number is: **************. Physical address is: *****************
yes, ****** is correct. Email is ****************************************** and phone number is: **************. Physical address is: ************************************
yes, ******* is correct. Email is ********************************* and phone number is: **************. Physical address is: ***********************


In [None]:
df.to_csv('fake_pii_data_masked.csv')