EXTRACTING SENSITIVE INFORMATION FROM TEXT

Importing the packages we need and loading spaCy's small English model:

In [4]:
import re
import spacy

# Load the English NLP model once at module level; detect_sensitive_info()
# below calls this shared `nlp` object for named-entity recognition.
# NOTE(review): spacy.load presumably needs the "en_core_web_sm" package to be
# installed in the kernel's environment beforehand - confirm in setup notes.
nlp = spacy.load("en_core_web_sm")  # Use a small model for basic NER tasks


In [47]:
def find_matches(pattern, string, flags=0):
    """Collect the full text of every non-overlapping regex match.

    Args:
        pattern: Regular-expression source handed to ``re.compile``.
        string: Text to scan.
        flags: Optional ``re`` flags forwarded to ``re.compile`` (default 0).

    Returns:
        A list holding ``match.group()`` for each match, in order of
        appearance, or ``None`` when the pattern does not match at all.
    """
    regex = re.compile(pattern, flags=flags)
    # group() is the whole match, regardless of any capture groups in pattern.
    found = [hit.group() for hit in regex.finditer(string)]
    # Callers receive None (not an empty list) when nothing matched.
    return found if found else None

Detecting sensitive info by combining spaCy named-entity recognition with regular-expression patterns:

In [54]:
# Define regex patterns
# All groups are non-capturing so each pattern yields the whole match both
# with re.findall (which returns group contents when a capturing group is
# present) and with match.group().

# Singapore phone number: optional "+65" country code, then 8 digits that may
# be split 4-4 by a space or hyphen. The (?<!\S) lookbehind requires the number
# to start the text or follow whitespace WITHOUT consuming that whitespace, so
# matches carry no leading blank and a number at position 0 is still found.
# The trailing \b stops the pattern from matching the first 8 digits of a
# longer digit run (e.g. a bank account number).
phone_pattern = r"(?<!\S)(?:\+65[\s-]?)?\d{4}[\s-]?\d{4}\b"
nric_pattern = r'\b[SFTG]\d{7}[A-Z]\b'  # Pattern for Singapore NRIC/FIN
# Placeholder pattern for bank account numbers; adjust as necessary
bank_account_pattern = r'\b\d{10,12}\b'
# Email address: local part may contain word characters, ".", "+" and "-";
# the domain is one label followed by one or more ".label" parts, so
# multi-label domains such as "mail.example.co.uk" are matched in full.
email_pattern = r"\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b"

def detect_sensitive_info(text):
    """Scan ``text`` for named entities and pattern-based sensitive data.

    The module-level spaCy pipeline ``nlp`` supplies named entities; the
    module-level regex patterns supply phone numbers, NRIC/FIN numbers, bank
    account numbers and email addresses.

    Args:
        text: The string to analyse.

    Returns:
        A dict with the keys ``named_entities`` (list of ``(text, label)``
        tuples), ``possible_addresses``, ``phone_numbers``, ``nric_numbers``,
        ``bank_account_numbers`` and ``email_addresses`` (lists of strings).
        Every value is a list; a category with no hits is an empty list.
    """
    # Use spaCy for NER
    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    # Extract locations and organizations as potential address components
    possible_addresses = [ent.text for ent in doc.ents if ent.label_ in ["LOC", "GPE", "ORG", "FAC"]]

    # Use regex for custom pattern matching.
    # find_matches returns the whole-match text but yields None when nothing
    # matches; normalise that to [] so every category has the same type, and
    # strip surrounding whitespace the pattern may have swallowed.
    raw_phone_matches = find_matches(phone_pattern, text, flags=re.IGNORECASE) or []
    phone_numbers = [match.strip() for match in raw_phone_matches]
    nric_numbers = re.findall(nric_pattern, text)
    bank_account_numbers = re.findall(bank_account_pattern, text)
    email_addresses = re.findall(email_pattern, text)

    # Compile results
    results = {
        "named_entities": named_entities,
        "possible_addresses": possible_addresses,
        "phone_numbers": phone_numbers,
        "nric_numbers": nric_numbers,
        "bank_account_numbers": bank_account_numbers,
        "email_addresses": email_addresses
    }

    return results

Printing the detected information grouped by category:

In [57]:
def print_formatted_info(info):
    """Print a detection-result mapping as a sectioned, bulleted report.

    Args:
        info: Mapping of category name to a collection of findings. Tuple
            findings are shown as ``first (second)``; anything else is shown
            as-is. A falsy value (empty list or ``None``) is reported as
            "None found.".
    """
    print("Detected Sensitive Information:\n")

    for category, items in info.items():
        # "phone_numbers" -> "Phone Numbers"
        heading = category.replace('_', ' ').title()
        print(f"{heading}:")

        if items:
            for entry in items:
                if isinstance(entry, tuple):
                    # Named entities arrive as (text, label) pairs.
                    line = f"  - {entry[0]} ({entry[1]})"
                else:
                    # Regex hits are plain strings.
                    line = f"  - {entry}"
                print(line)
            # Blank line between sections.
            print()
        else:
            print("  None found.\n")

In [58]:
# Sample input containing an email address, phone number, NRIC, bank account
# number, organisation and street address, fed through the detector and printer.
text = """John Doe's email is john.doe@example.com, and his phone number is +65 1234 5678. His NRIC is S1234567A. 
His bank account number is 123456789012. He works at Acme Corp, located at 123 Orchard Road, Singapore."""
print_formatted_info(detect_sensitive_info(text))

Detected Sensitive Information:

Named Entities:
  - John Doe's (PERSON)
  - john.doe@example.com (PERSON)
  - 1234 5678 (DATE)
  - NRIC (ORG)
  - 123456789012 (DATE)
  - Acme Corp (ORG)
  - 123 Orchard Road (FAC)
  - Singapore (GPE)

Possible Addresses:
  - NRIC
  - Acme Corp
  - 123 Orchard Road
  - Singapore

Phone Numbers:
  -  +65 1234 5678

Nric Numbers:
  - S1234567A

Bank Account Numbers:
  - 123456789012

Email Addresses:
  - john.doe@example.com

