In [1]:
import spacy
import re

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Define redaction levels and sensitive entity types
ENTITY_LEVELS = {
    1: ["PERSON", "EMAIL"],
    2: ["PERSON", "EMAIL", "DATE", "AGE", "ORG"],
    3: ["PERSON", "EMAIL", "DATE", "AGE", "ORG", "CARDINAL", "GPE"]
}

def redact_text(text, level):
    # Process the text with spaCy NLP pipeline
    doc = nlp(text)
    
    # List of redacted words
    redacted_text = text

    # Redact based on entities
    for ent in doc.ents:
        if ent.label_ in ENTITY_LEVELS[level]:
            redacted_text = re.sub(ent.text, "xxx", redacted_text)

    return redacted_text

# Sample usage
text = "John Doe is 25 years old and lives in New York. His email is john.doe@example.com."
redaction_level = 2
output = redact_text(text, redaction_level)
print(output)  # Output should redact name, email, etc. based on the level


xxx is xxx and lives in New York. His email is xxx.


In [4]:
edacted_text = re.sub(PAN_PATTERN, lambda x: f"XXXXX{x.group()[5:9]}X", redacted_text)


Original Text:
John Doe's Aadhaar number is 1234 5678 9012 and PAN number is ABCDE1234F. He was born on 01/01/1990.

Redacted Text:
xxx Aadhaar number is xxx-xxx-xxxx and xxx number is XXXXX1234X. He was born on xxx.


In [7]:
import re
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define patterns for Indian-specific entities
AADHAAR_PATTERN = r'\b\d{4}\s\d{4}\s\d{4}\b'
PAN_PATTERN = r'\b[A-Z]{5}\d{4}[A-Z]\b'

# Entity levels for redaction
ENTITY_LEVELS = {
    1: ['PERSON', 'ORG', 'DATE'],  # Example level 1 redaction
    2: ['PERSON', 'ORG', 'DATE', 'AADHAAR_NUMBER', 'PAN_NUMBER'],  # Example level 2
}

def redact_indian_entities(text, level):
    # Process text with spaCy
    doc = nlp(text)

    # Redact standard entities
    redacted_text = text
    for ent in doc.ents:
        if ent.label_ in ENTITY_LEVELS[level]:
            # Use word boundaries to avoid affecting surrounding text
            redacted_text = re.sub(rf'\b{re.escape(ent.text)}\b', "xxx", redacted_text)

    # Redact Aadhaar and PAN numbers for level 2 and above
    if level >= 2:
        # Replace Aadhaar numbers with the pattern "xxx-xxx-xxx"
        redacted_text = re.sub(AADHAAR_PATTERN, "xxx-xxx-xxx", redacted_text)

        # Replace only the PAN number itself, keeping "PAN number" intact
        redacted_text = re.sub(PAN_PATTERN, lambda x: f"XXXXX{x.group()[5:9]}X", redacted_text)

    return redacted_text

# Sample usage
text = "John Doe's Aadhaar number is 1234 5678 9012 and PAN number is ABCDE1234F."
redaction_level = 2
output = redact_indian_entities(text, redaction_level)
print(output)


xxx Aadhaar number is xxx and xxx number is XXXXX1234X.


In [8]:
import re
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define patterns for Indian-specific entities
AADHAAR_PATTERN = r'\b\d{4}\s\d{4}\s\d{4}\b'
PAN_PATTERN = r'\b[A-Z]{5}\d{4}[A-Z]\b'

# Entity levels for redaction
ENTITY_LEVELS = {
    1: ['PERSON', 'ORG', 'DATE'],  # Example level 1 redaction
    2: ['PERSON', 'ORG', 'DATE', 'AADHAAR_NUMBER', 'PAN_NUMBER'],  # Example level 2
}

def redact_indian_entities(text, level):
    # Process text with spaCy
    doc = nlp(text)

    # Redact standard entities
    redacted_text = text
    for ent in doc.ents:
        if ent.label_ in ENTITY_LEVELS[level]:
            # Use word boundaries to avoid affecting surrounding text
            redacted_text = re.sub(rf'\b{re.escape(ent.text)}\b', "xxx", redacted_text)

    # Redact Aadhaar and PAN numbers for level 2 and above
    if level >= 2:
        # Replace Aadhaar numbers with the pattern "xxx-xxx-xxx"
        redacted_text = re.sub(AADHAAR_PATTERN, "xxx-xxx-xxx", redacted_text)

        # Replace only the PAN number itself, keeping "PAN number" intact
        redacted_text = re.sub(PAN_PATTERN, lambda x: f"XXXXX{x.group()[5:9]}X", redacted_text)

    return redacted_text

# Sample usage
text = "John Doe's Aadhaar number is 1234 5678 9012 and PAN number is ABCDE1234F."
redaction_level = 2
output = redact_indian_entities(text, redaction_level)
print(output)


xxx Aadhaar number is xxx and xxx number is XXXXX1234X.


In [9]:
import re
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define patterns for Indian-specific entities
AADHAAR_PATTERN = r'\b\d{4}\s\d{4}\s\d{4}\b'
PAN_PATTERN = r'\b[A-Z]{5}\d{4}[A-Z]\b'

# Entity levels for redaction
ENTITY_LEVELS = {
    1: ['PERSON', 'ORG'],  # Level 1: Redact only names and organizations
    2: ['PERSON', 'ORG', 'GPE', 'DATE'],  # Level 2: Redact names, organizations, locations, dates
    3: ['PERSON', 'ORG', 'GPE', 'DATE', 'AADHAAR_NUMBER', 'PAN_NUMBER'],  # Level 3: Redact everything including Aadhaar and PAN
}

def redact_indian_entities(text, level):
    # Process text with spaCy
    doc = nlp(text)

    # Redact entities based on level
    redacted_text = text
    for ent in doc.ents:
        if ent.label_ in ENTITY_LEVELS[level]:
            if ent.label_ == 'PERSON':
                redacted_text = re.sub(re.escape(ent.text), '[REDACTED NAME]', redacted_text)
            elif ent.label_ == 'ORG':
                redacted_text = re.sub(re.escape(ent.text), '[REDACTED ORG]', redacted_text)
            elif ent.label_ == 'GPE':
                redacted_text = re.sub(re.escape(ent.text), '[REDACTED LOCATION]', redacted_text)
            elif ent.label_ == 'DATE':
                redacted_text = re.sub(re.escape(ent.text), '[REDACTED DATE]', redacted_text)
    
    # Redact Aadhaar and PAN numbers for level 3
    if level >= 3:
        # Aadhaar numbers
        redacted_text = re.sub(AADHAAR_PATTERN, 'xxx-xxx-xxx', redacted_text)
        # PAN numbers (keep only part of the PAN number visible)
        redacted_text = re.sub(PAN_PATTERN, lambda x: f"XXXXX{x.group()[5:9]}X", redacted_text)

    return redacted_text

# Sample usage
text = "John Doe's Aadhaar number is 1234 5678 9012 and PAN number is ABCDE1234F."
redaction_level = 3
output = redact_indian_entities(text, redaction_level)
print(output)


[REDACTED NAME] Aadhaar number is [REDACTED DATE] and [REDACTED ORG] number is XXXXX1234X.
