# Presidio PoC

## Installation

In [24]:
# One-time setup: uncomment the lines below to install the Presidio packages
# and download the spaCy model named on the last line.
#!pip install presidio_analyzer
#!pip install presidio_anonymizer
#!python -m spacy download en_core_web_lg

In [25]:
import pandas as pd
import random

from typing import List, Optional, Dict, Union, Iterator, Iterable
import collections
from dataclasses import dataclass
import pprint

from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import EngineResult

In [None]:
# Load data
# Read sheet 'Sheet1' of the test workbook; header=0 uses the sheet's first row as column names.
test_data = pd.read_excel('path/1-MB-Test.xlsx', sheet_name='Sheet1', header=0)
# Remove the row whose index label is 0 (the first row after the header).
# NOTE(review): presumably that row is not a real record in the sheet — confirm against the workbook.
test_data = test_data.drop([0])
test_data.head()

Unnamed: 0,First and Last Name,SSN,Credit Card Number
1,Robert Aragon,489-36-8350,4929-3813-3266-4295
2,Ashley Borden,514-14-8905,5370-4638-8881-3020
3,Thomas Conley,690-05-5315,4916-4811-5814-8111
4,Susan Davis,421-37-1396,4916-4034-9269-8783
5,Christopher Diaz,458-02-6124,5299-1561-5689-1938


In [27]:

# Preprocess data:
def preprocess_data(data: pd.DataFrame, seed: Optional[int] = None) -> pd.DataFrame:
    """Clean the input frame and append synthetic purchase columns.

    Rows containing any missing value and exact duplicate rows are removed,
    the index is reset to 0..n-1, and the following columns are added:

    * ``ID``     - sequential integers starting at 1
    * ``Store``  - a store name drawn at random from a fixed list
    * ``Date``   - consecutive daily dates starting 2024-01-01 (not random)
    * ``Amount`` - random integer in [-2, 20]
    * ``Price``  - random integer in [10, 200]
    * ``Total``  - ``Amount * Price``

    Args:
        data: Source DataFrame. It is not modified.
        seed: Optional seed for the random columns. When given, a private
            ``random.Random(seed)`` is used so repeated calls return identical
            output. When ``None`` (default) the module-level ``random`` state
            is used, exactly as before this parameter existed.

    Returns:
        A new DataFrame with the cleaned rows and the extra columns.
    """
    # The `random` module exposes the same choices()/randint() API as a
    # Random instance, so either can serve as the generator here.
    rng = random if seed is None else random.Random(seed)

    # Work on an explicit copy so the column assignments below can never write
    # through to the caller's frame or trigger SettingWithCopyWarning.
    data = data.dropna().drop_duplicates().reset_index(drop=True).copy()
    n_rows = len(data)

    # Add ID column
    data['ID'] = range(1, n_rows + 1)

    # Add store column randomly from a list of stores
    stores = ['Eaton Toronto', 'YUL Airport', 'Nordstrom NY', 'Yorkdale', 'E-comm']
    data['Store'] = rng.choices(stores, k=n_rows)

    # Add one date per row: consecutive days starting 2024-01-01
    data['Date'] = pd.date_range(start='1/1/2024', periods=n_rows)

    # Add random amount of shoes bought.
    # NOTE(review): the range includes zero and negative values — presumably to
    # simulate returns; confirm this is intended.
    data['Amount'] = [rng.randint(-2, 20) for _ in range(n_rows)]

    # Add random price
    data['Price'] = [rng.randint(10, 200) for _ in range(n_rows)]

    # Add total
    data['Total'] = data['Amount'] * data['Price']

    return data

# Run the preprocessing and keep only the first rows (head() defaults to 5) as the working sample.
# Note: this rebinds `test_data`, so the raw frame loaded earlier is no longer available under that name.
test_data = preprocess_data(test_data).head()
# Display the sample; it already holds at most five rows, so head() shows all of it.
test_data.head()

Unnamed: 0,First and Last Name,SSN,Credit Card Number,ID,Store,Date,Amount,Price,Total
0,Robert Aragon,489-36-8350,4929-3813-3266-4295,1,YUL Airport,2024-01-01,2,54,108
1,Ashley Borden,514-14-8905,5370-4638-8881-3020,2,YUL Airport,2024-01-02,5,185,925
2,Thomas Conley,690-05-5315,4916-4811-5814-8111,3,E-comm,2024-01-03,10,105,1050
3,Susan Davis,421-37-1396,4916-4034-9269-8783,4,Eaton Toronto,2024-01-04,6,129,774
4,Christopher Diaz,458-02-6124,5299-1561-5689-1938,5,E-comm,2024-01-05,19,126,2394


### Expected Outcomes
* First and last names are anonymized
* SSNs are anonymized
* Credit card numbers are anonymized

In [28]:
# DataFrame to dict
# orient="list" produces {column name: [values...]}, i.e. one list per column.
# In the displayed output the name values contain a non-breaking space (\xa0) between first and last name.
df_dict = test_data.to_dict(orient="list")
df_dict

{'First and Last Name': ['Robert\xa0Aragon',
  'Ashley\xa0Borden',
  'Thomas\xa0Conley',
  'Susan\xa0Davis',
  'Christopher\xa0Diaz'],
 'SSN': ['489-36-8350',
  '514-14-8905',
  '690-05-5315',
  '421-37-1396',
  '458-02-6124'],
 'Credit Card Number': ['4929-3813-3266-4295',
  '5370-4638-8881-3020',
  '4916-4811-5814-8111',
  '4916-4034-9269-8783',
  '5299-1561-5689-1938'],
 'ID': [1, 2, 3, 4, 5],
 'Store': ['YUL Airport', 'YUL Airport', 'E-comm', 'Eaton Toronto', 'E-comm'],
 'Date': [Timestamp('2024-01-01 00:00:00'),
  Timestamp('2024-01-02 00:00:00'),
  Timestamp('2024-01-03 00:00:00'),
  Timestamp('2024-01-04 00:00:00'),
  Timestamp('2024-01-05 00:00:00')],
 'Amount': [2, 5, 10, 6, 19],
 'Price': [54, 185, 105, 129, 126],
 'Total': [108, 925, 1050, 774, 2394]}

In [29]:
# Build the Presidio engines: a single AnalyzerEngine wrapped by the batch
# analyzer, plus a batch anonymizer for the anonymization step.
analyzer = AnalyzerEngine()
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
batch_anonymizer = BatchAnonymizerEngine()

# Columns that should not be scanned for PII.
# "Amount" and "Price" must be separate entries: the previous single string
# "Amount Price" matched no column, so neither column was actually skipped.
# NOTE(review): "ID" is not listed and is therefore still analyzed — add it
# here if it should never be scanned.
keys_to_skip = ["Store", "Date", "Amount", "Price", "Total"]

# Analyze every column of df_dict except the skipped ones
analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en", keys_to_skip=keys_to_skip)
# Materialize the results into a list so they can be displayed here and
# passed to the anonymizer afterwards.
analyzer_results = list(analyzer_results)
analyzer_results

[DictAnalyzerResult(key='First and Last Name', value=['Robert\xa0Aragon', 'Ashley\xa0Borden', 'Thomas\xa0Conley', 'Susan\xa0Davis', 'Christopher\xa0Diaz'], recognizer_results=[[type: PERSON, start: 0, end: 13, score: 0.85], [type: PERSON, start: 0, end: 13, score: 0.85], [type: PERSON, start: 0, end: 13, score: 0.85], [type: PERSON, start: 0, end: 11, score: 0.85], [type: PERSON, start: 0, end: 16, score: 0.85]]),
 DictAnalyzerResult(key='SSN', value=['489-36-8350', '514-14-8905', '690-05-5315', '421-37-1396', '458-02-6124'], recognizer_results=[[type: US_SSN, start: 0, end: 11, score: 0.85], [type: US_SSN, start: 0, end: 11, score: 0.85], [type: US_SSN, start: 0, end: 11, score: 0.85], [type: US_SSN, start: 0, end: 11, score: 0.85], [type: US_SSN, start: 0, end: 11, score: 0.85]]),
 DictAnalyzerResult(key='Credit Card Number', value=['4929-3813-3266-4295', '5370-4638-8881-3020', '4916-4811-5814-8111', '4916-4034-9269-8783', '5299-1561-5689-1938'], recognizer_results=[[type: CREDIT_CAR

In [30]:
# Anonymize using the per-column analyzer results; in the output shown for this cell,
# detected values appear as <ENTITY_TYPE> placeholders (e.g. <PERSON>, <US_SSN>, <CREDIT_CARD>).
anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)
# Rebuild a DataFrame from the anonymized result for display.
scrubbed_df = pd.DataFrame(anonymizer_results)
scrubbed_df.head()

Unnamed: 0,First and Last Name,SSN,Credit Card Number,ID,Store,Date,Amount,Price,Total
0,<PERSON>,<US_SSN>,<CREDIT_CARD>,1,YUL Airport,2024-01-01,2,54,108
1,<PERSON>,<US_SSN>,<CREDIT_CARD>,2,YUL Airport,2024-01-02,5,185,925
2,<PERSON>,<US_SSN>,<CREDIT_CARD>,3,E-comm,2024-01-03,10,105,1050
3,<PERSON>,<US_SSN>,<CREDIT_CARD>,4,Eaton Toronto,2024-01-04,6,129,774
4,<PERSON>,<US_SSN>,<CREDIT_CARD>,5,E-comm,2024-01-05,19,126,2394
