In [1]:
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
import json
from pprint import pprint
from presidio_helpers import (
    get_supported_entities,
    analyze,
    anonymize,
    analyzer_engine,
)

In [72]:
# --- Analysis settings -------------------------------------------------------
# Short sample sentence; not referenced by the visible cells below.
text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"
# NLP package / model pair handed to the presidio helper functions.
st_model_package, st_model = "spaCy", "en_core_web_lg"
# Passed as `score_threshold` when the text is analyzed.
st_threshold = 0.3

# --- Load the input Word document --------------------------------------------
from docx import Document

with open("worddoctest.docx", "rb") as docx_file:
    # One string per paragraph of the document, in document order.
    demo_text = [paragraph.text for paragraph in Document(docx_file).paragraphs]


In [73]:
# Stitch the paragraphs back together, one per line, into the single string
# that the analysis and anonymization cells operate on.
paragraph_separator = "\n"
final_text = paragraph_separator.join(demo_text)
print(final_text)

Here are a few example sentences we currently support:

Hello, my name is Lilly Grella and I live in North carolina.
My credit card number is 2840-1285-1243-1345 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to tlmgrella13@gmail.com,  from the IP 192.168.0.1.

My passport: 123445678 and my phone number: (757) 532-1139.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

KC’s social security number is 112-33-4455.  Her driver license? it is 1234567A.


In [29]:
# (package, model) pair, unpacked into helper calls that need both values.
analyzer_params = (st_model_package, st_model)

# Entity names reported by the helper for this package/model combination.
st_entities = [*get_supported_entities(*analyzer_params)]
print(st_entities)  # Print the supported entities to check if they are correct

# Entity types passed as `entities=` when the text is analyzed.
default_entities = [
    "IN_PASSPORT",
    "FACILITY",
    "MEDICAL_LICENSE",
    "US_SSN",
    "UK_NINO",
    "EMAIL_ADDRESS",
    "PERSON",
    "IN_AADHAAR",
    "IP_ADDRESS",
    "CRYPTO",
    "NRP",
    "IN_VOTER",
    "DATE_TIME",
    "AU_ACN",
    "US_PASSPORT",
    "UK_NHS",
    "PHONE_NUMBER",
    "IBAN_CODE",
    "URL",
    "IN_VEHICLE_REGISTRATION",
    "AU_TFN",
    "US_BANK_NUMBER",
    "SG_NRIC_FIN",
    "IN_PAN",
    "AU_MEDICARE",
    "US_DRIVER_LICENSE",
    "CREDIT_CARD",
    "US_ITIN",
    "AU_ABN",
    "LOCATION",
]

['IN_PASSPORT', 'FACILITY', 'MEDICAL_LICENSE', 'US_SSN', 'UK_NINO', 'EMAIL_ADDRESS', 'PERSON', 'IN_AADHAAR', 'NRP', 'CRYPTO', 'IP_ADDRESS', 'IN_VOTER', 'DATE_TIME', 'AU_ACN', 'US_PASSPORT', 'UK_NHS', 'PHONE_NUMBER', 'IBAN_CODE', 'URL', 'IN_VEHICLE_REGISTRATION', 'AU_TFN', 'US_BANK_NUMBER', 'SG_NRIC_FIN', 'IN_PAN', 'AU_MEDICARE', 'US_DRIVER_LICENSE', 'CREDIT_CARD', 'US_ITIN', 'AU_ABN', 'LOCATION']


In [74]:
# Obtain the analyzer for the selected package/model. The variable itself is
# not used again in the visible cells.
analyzer = analyzer_engine(st_model_package, st_model)

# Supported entity names as a plain list. A trailing comma after this call
# previously turned the assignment into a one-element tuple wrapping the list,
# so `st_entities` was `([...],)` instead of `[...]`.
st_entities = list(get_supported_entities(*analyzer_params))

# Analyze the full document text for the entity types in `default_entities`,
# keeping only findings at or above `st_threshold`. The decision process is
# requested because the results table built later reads
# `analysis_explanation` from every result.
st_analyze_results = analyze(
    st_model_package,
    st_model,
    text=final_text,
    entities=default_entities,
    language="en",
    score_threshold=st_threshold,
    return_decision_process=True,
    # allow_list=st_allow_list,
    # deny_list=st_deny_list,
)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [69]:
# Show the raw analysis results: each entry prints as entity type, start/end
# character offsets into `final_text`, and a confidence score.
print(st_analyze_results)

[type: CRYPTO, start: 189, end: 223, score: 1.0, type: EMAIL_ADDRESS, start: 287, end: 308, score: 1.0, type: IBAN_CODE, start: 449, end: 472, score: 1.0, type: IP_ADDRESS, start: 323, end: 334, score: 0.95, type: LOCATION, start: 101, end: 115, score: 0.85, type: DATE_TIME, start: 229, end: 241, score: 0.85, type: DATE_TIME, start: 316, end: 334, score: 0.85, type: DATE_TIME, start: 350, end: 359, score: 0.85, type: LOCATION, start: 538, end: 540, score: 0.85, type: US_SSN, start: 569, end: 580, score: 0.85, type: PHONE_NUMBER, start: 381, end: 395, score: 0.75, type: US_DRIVER_LICENSE, start: 609, end: 617, score: 0.6499999999999999, type: URL, start: 252, end: 265, score: 0.5, type: URL, start: 299, end: 308, score: 0.5, type: US_PASSPORT, start: 350, end: 359, score: 0.4, type: US_BANK_NUMBER, start: 523, end: 535, score: 0.4]


In [70]:
# Arguments for the anonymization helper, collected in one place.
# NOTE(review): `mask_char`, `number_of_chars` and `encrypt_key` are presumably
# only consulted for the "mask" / "encrypt" operators - confirm against
# presidio_helpers.anonymize. If the key is ever actually used, it should not
# be a hard-coded literal in the notebook.
anonymize_options = {
    "text": final_text,
    "operator": "replace",
    "mask_char": "-",
    "number_of_chars": 20,
    "encrypt_key": 1234567,
    "analyze_results": st_analyze_results,
}
st_anonymize_results = anonymize(**anonymize_options)
# The anonymized string itself is read from `st_anonymize_results.text`.

In [71]:
# Print only the anonymized string carried on the result's `.text` attribute.
print(st_anonymize_results.text)

Here are a few example sentences we currently support:

Hello, my name is Lilly Grella and I live in <LOCATION>.
My credit card number is 2840-1285-1243-1345 and my crypto wallet id is <CRYPTO>.

On <DATE_TIME> I visited <URL> and sent an email to <EMAIL_ADDRESS>,  from <DATE_TIME>.

My passport: <DATE_TIME> and my phone number: <PHONE_NUMBER>.

This is a valid International Bank Account Number: <IBAN_CODE> . Can you please check the status on bank account <US_BANK_NUMBER>?

<LOCATION>’s social security number is <US_SSN>.  Her driver license? it is <US_DRIVER_LICENSE>.


In [33]:
import pandas as pd

# Whether to append the per-result analysis explanation columns. The results
# must have been produced with `return_decision_process=True` for
# `analysis_explanation` to be populated.
return_decision_process = True

# Source column -> display label, in the order the columns are shown.
column_labels = {
    "entity_type": "Entity type",
    "text": "Text",
    "start": "Start",
    "end": "End",
    "score": "Confidence",
}

if st_analyze_results:
    # One row per detected entity.
    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
    # The matched substring, sliced out of the analyzed text by character span.
    # It is computed once here; the renamed "Text" column carries it forward,
    # so it does not need to be rebuilt after the rename.
    df["text"] = [final_text[res.start : res.end] for res in st_analyze_results]

    df_subset = df[list(column_labels)].rename(columns=column_labels)

    if return_decision_process:
        # Explanation fields are joined side by side; both frames share the
        # same default 0..n-1 index, so rows line up by position.
        analysis_explanation_df = pd.DataFrame.from_records(
            [r.analysis_explanation.to_dict() for r in st_analyze_results]
        )
        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
    print(df_subset)
else:
    # No findings: still define `df_subset` (empty, with the display columns)
    # so later cells that export it do not fail with a NameError.
    df_subset = pd.DataFrame(columns=list(column_labels.values()))

          Entity type                                Text  Start  End  \
0              CRYPTO  16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ    189  223   
1       EMAIL_ADDRESS               tlmgrella13@gmail.com    287  308   
2           IBAN_CODE             IL150120690000003111111    449  472   
3          IP_ADDRESS                         192.168.0.1    323  334   
4            LOCATION                      North carolina    101  115   
5           DATE_TIME                        September 18    229  241   
6           DATE_TIME                  the IP 192.168.0.1    316  334   
7           DATE_TIME                           123445678    350  359   
8            LOCATION                                  KC    538  540   
9              US_SSN                         112-33-4455    569  580   
10       PHONE_NUMBER                      (757) 532-1139    381  395   
11  US_DRIVER_LICENSE                            1234567A    609  617   
12                URL                       microso

In [66]:
from docx import Document


def write_to_docx(anonymized_text, output_file_path):
    """Write anonymized text to a new .docx file.

    Args:
        anonymized_text: The text to write. An object exposing a ``.text``
            attribute (such as the anonymizer result used in this notebook)
            is also accepted and unwrapped, so earlier call sites that passed
            the whole result object keep working.
        output_file_path: Destination path of the .docx file to create.
    """
    # Use the argument rather than the notebook-global `st_anonymize_results`,
    # which the function previously read regardless of what was passed in.
    text = getattr(anonymized_text, "text", anonymized_text)
    doc = Document()
    doc.add_paragraph(text)  # Add anonymized text as a paragraph
    doc.save(output_file_path)  # Save the document


# `st_anonymize_results` is a result object; the anonymized string lives on
# its `.text` attribute.
anonymized_text = st_anonymize_results.text

# Now write this anonymized text to a new .docx file
write_to_docx(anonymized_text, "anonymized_output.docx")

print("Anonymized text has been written to anonymized_output.docx")

# Output the results table to CSV.
# NOTE(review): the "Text" column of `df_subset` holds the original detected
# substrings (not anonymized), so results.csv contains the raw sensitive values.
df_subset.to_csv("results.csv")


Anonymized text has been written to anonymized_output.docx
