# In [None]:
import logging
import re
from typing import Optional

import spacy
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Load spaCy's pre-trained "en_core_web_sm" pipeline once, at import time.
# The resulting module-level `nlp` object is reused by redact_indian_entities
# for every call instead of being reloaded per request.
nlp = spacy.load("en_core_web_sm")

# Indian-specific entity patterns.
# All patterns are anchored with \b on both sides and only match uppercase
# letters; `\s?` allows at most one whitespace character between groups.
# Aadhaar: 12 digits written as three groups of four.
# NOTE: this also matches the first 12 digits of a space-separated 16-digit
# card number ("1234 5678 9012 3456"), so when both patterns are applied the
# card pattern has to be tried first.
AADHAAR_PATTERN = r'\b\d{4}\s?\d{4}\s?\d{4}\b'
# PAN: five letters, four digits, one letter.
PAN_PATTERN = r'\b[A-Z]{5}\d{4}[A-Z]\b'
# Payment card: 16 digits written as four groups of four.
ATM_CARD_PATTERN = r'\b\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\b'
# Driving licence: two letters, two digits, then two groups of four digits.
DRIVING_LICENSE_PATTERN = r'\b[A-Z]{2}\d{2}\s?\d{4}\s?\d{4}\b'
# Passport: one letter followed by seven digits.
PASSPORT_PATTERN = r'\b[A-Z]{1}\d{7}\b'
# Voter ID: three letters followed by seven digits.
VOTER_ID_PATTERN = r'\b[A-Z]{3}\d{7}\b'

# Entity levels for redaction.
# Maps a redaction level to the entity labels introduced at that level.
# redact_indian_entities treats levels as cumulative: level N redacts the
# labels of every level from 1 up to N.
# NOTE: 'TITLE' is listed under both level 1 and level 3.
# NOTE(review): several labels here (e.g. EMAIL, PHONE, AADHAAR_NUMBER) are
# presumably never emitted by the spaCy model's NER; confirm which of them
# can actually appear as `ent.label_`.
ENTITY_LEVELS = {
    1: ['PERSON', 'DATE', 'GPE', 'EMAIL', 'PHONE', 'LOC', 'ADDRESS', 'AGE', 'GENDER', 'TITLE'],
    2: ['CARDINAL', 'MONEY', 'PERCENT', 'FAC', 'ORG', 'PRODUCT', 'TIME', 'QUANTITY'],
    3: ['IDENTIFIER', 'AADHAAR_NUMBER', 'PAN_NUMBER', 'ATM_CARD_NUMBER', 'DRIVING_LICENSE_NUMBER', 'PASSPORT_NUMBER', 'VOTER_ID', 'TITLE', 'OCCUPATION'],
}

# Default mapping from entity label to its redaction replacement.
# A value is either a plain string or a callable that receives the regex match
# object and returns the replacement text (both forms are accepted by re.sub).
# Labels without an entry here (e.g. 'CARDINAL', 'MONEY') fall back to the
# generic '[REDACTED]' placeholder inside redact_entities.
ENTITY_MAP = {
    'PERSON': '[REDACTED NAME]',
    'ORG': '[REDACTED ORG]',
    'GPE': '[REDACTED LOCATION]',
    'DATE': '[REDACTED DATE]',
    'EMAIL': '[REDACTED EMAIL]',
    'PHONE': '[REDACTED PHONE]',
    'LOC': '[REDACTED LOC]',
    'ADDRESS': '[REDACTED ADDRESS]',
    'AGE': '[REDACTED AGE]',
    'GENDER': '[REDACTED GENDER]',
    'TITLE': '[REDACTED TITLE]',
    'IDENTIFIER': '[REDACTED IDENTIFIER]',
    'AADHAAR_NUMBER': 'xxx-xxx-xxx',
    # Partial mask: the letters are replaced by X while characters 5..8 of the
    # match (the four digits of a PAN_PATTERN match) are kept.
    'PAN_NUMBER': lambda x: f"XXXXX{x.group()[5:9]}X",
    'ATM_CARD_NUMBER': 'xxxx-xxxx-xxxx-xxxx',
    'DRIVING_LICENSE_NUMBER': 'DL-XXXX XXXX XXXX',
    'PASSPORT_NUMBER': 'PXXXXXXX',
    'VOTER_ID': 'VID-XXXXXXX',
}

# FastAPI application instance; the POST /redact route is registered on it.
app = FastAPI()

class RedactionRequest(BaseModel):
    """Request body accepted by the POST /redact endpoint."""

    # Raw text to redact.
    text: str
    # Redaction level; checked against ENTITY_LEVELS by redact_indian_entities.
    level: int
    # Optional label -> placeholder overrides. Declared Optional explicitly so
    # that both an omitted field and an explicit JSON null are accepted.
    custom_placeholders: Optional[dict] = None

# Configure logging: INFO level and a "timestamp - level - message" line format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def redact_entities(text: str, entities: list, placeholders: dict = None) -> str:
    """
    Replace each detected entity span in ``text`` with its placeholder.

    Entities are replaced by character offset rather than by searching for
    their text, so only the detected occurrence is touched: an entity "May"
    no longer rewrites the "May" inside "Maybe", and a short entity can no
    longer break a longer one that contains the same text.

    Args:
        text: The text the entities were detected in. Each entity's
            ``start_char`` must be an offset into this exact string.
        entities: Objects exposing ``text``, ``label_`` and ``start_char``
            (e.g. spaCy spans). Duplicates are ignored; when two entities
            overlap, the one starting first (longest on a tie) wins.
        placeholders: Optional mapping of label -> replacement. When omitted
            or empty, ENTITY_MAP is used. A replacement may be a string, which
            is inserted literally, or a callable that receives a match object
            for the entity text and returns the replacement string.

    Returns:
        The text with every selected entity replaced. Labels missing from the
        mapping are replaced with '[REDACTED]'.
    """
    placeholders = placeholders or ENTITY_MAP  # Use custom placeholders if provided

    # Collapse duplicates (the same span can be collected more than once by
    # callers) into unique (start, end, label) triples.
    spans = {
        (entity.start_char, entity.start_char + len(entity.text), entity.label_)
        for entity in entities
    }

    pieces = []
    cursor = 0  # End offset of the last span already emitted.
    for start, end, label in sorted(spans, key=lambda s: (s[0], -s[1], s[2])):
        if start < cursor:
            # Overlaps a span that was already redacted; skip it.
            continue
        replacement = placeholders.get(label, '[REDACTED]')
        if callable(replacement):
            # Keep supporting match-based callables such as
            # ENTITY_MAP['PAN_NUMBER'] by handing them a real match object.
            original = text[start:end]
            replacement = replacement(re.fullmatch(re.escape(original), original))
        pieces.append(text[cursor:start])
        pieces.append(replacement)
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)

def redact_indian_entities(text: str, level: int, custom_placeholders: dict = None) -> str:
    """
    Redact spaCy-detected entities and, at level 3, Indian identifier numbers.

    Levels are cumulative: level N redacts every label listed in
    ENTITY_LEVELS[1] through ENTITY_LEVELS[N]. At level 3 the identifier
    regexes (card, Aadhaar, PAN, driving licence, passport, voter ID) are
    additionally applied to the already-redacted text.

    Args:
        text: The text to redact.
        level: Redaction level; must be a key of ENTITY_LEVELS.
        custom_placeholders: Optional mapping of label -> replacement. It is
            passed to redact_entities as-is, and for the identifier regexes it
            overrides the matching ENTITY_MAP entry. String replacements are
            inserted literally; callables receive the regex match object.

    Returns:
        The redacted text.

    Raises:
        HTTPException: With status 400 when ``level`` is not in ENTITY_LEVELS.
    """
    if level not in ENTITY_LEVELS:
        logging.error(f"Invalid redaction level: {level}")
        raise HTTPException(status_code=400, detail="Invalid redaction level")

    logging.info(f"Redacting text at level {level}")
    doc = nlp(text)

    # Collect the labels of all levels up to `level` into a set first, so an
    # entity whose label appears in several levels is selected only once.
    labels_to_redact = set()
    for lvl in range(1, level + 1):
        labels_to_redact.update(ENTITY_LEVELS[lvl])
    entities_to_redact = [ent for ent in doc.ents if ent.label_ in labels_to_redact]

    # Redact using spaCy-detected entities
    redacted_text = redact_entities(text, entities_to_redact, placeholders=custom_placeholders)

    # Apply regex patterns for specific Indian identifiers if level is 3
    if level == 3:
        # custom_placeholders defaults to None; fall back to an empty mapping
        # so the per-label lookups below do not fail.
        overrides = custom_placeholders or {}
        # Order matters: the 16-digit card pattern must run before the 12-digit
        # Aadhaar pattern, which would otherwise consume the first three groups
        # of a space-separated card number and leave the last four digits.
        identifier_patterns = [
            (ATM_CARD_PATTERN, 'ATM_CARD_NUMBER'),
            (AADHAAR_PATTERN, 'AADHAAR_NUMBER'),
            (PAN_PATTERN, 'PAN_NUMBER'),
            (DRIVING_LICENSE_PATTERN, 'DRIVING_LICENSE_NUMBER'),
            (PASSPORT_PATTERN, 'PASSPORT_NUMBER'),
            (VOTER_ID_PATTERN, 'VOTER_ID'),
        ]
        for pattern, label in identifier_patterns:
            replacement = overrides.get(label, ENTITY_MAP[label])
            if not callable(replacement):
                # Wrap strings in a function so re.sub inserts them literally
                # instead of interpreting backslashes as template escapes.
                replacement = (lambda _match, literal=replacement: literal)
            redacted_text = re.sub(pattern, replacement, redacted_text)

    return redacted_text

@app.post("/redact")
def redact(request: RedactionRequest):
    """
    Handle POST /redact.

    Runs redact_indian_entities on the request's text with the requested level
    and optional custom placeholders, and returns the result under the
    "redacted_text" key. An HTTPException raised while redacting is logged and
    then propagated unchanged.
    """
    try:
        logging.info(f"Received redaction request with level {request.level}")
        result = redact_indian_entities(
            request.text,
            request.level,
            custom_placeholders=request.custom_placeholders,
        )
    except HTTPException as exc:
        logging.error(f"Error occurred: {exc.detail}")
        raise
    else:
        return {"redacted_text": result}
