In [1]:


import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re

In [2]:
model_name = "iiiorg/piiranha-v1-detect-personal-information"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [3]:
custom_patterns = {
    # Business-specific patterns
    'SHIPMENT_NUMBER': r'(SHIP|SHP)-\d{2}-\d{5}',           # SHIP-23-12345
    'ORDER_NUMBER': r'ORD-\d{2}-[A-Z]{2}-\d{6}',           # ORD-23-IN-123456
    'INVOICE_NUMBER': r'INV-\d{4}-[A-Z]{2}\d{6}',          # INV-2024-IN123456
    'TRACKING_NUMBER': r'TRK-[A-Z]{2}\d{10}',              # TRK-IN1234567890
    'CONTAINER_NUMBER': r'CONT-[A-Z]{4}\d{7}',             # CONT-ABCD1234567
    'PRODUCT_CODE': r'PRD-[A-Z]{2}-\d{6}',                 # PRD-EL-123456
    
    # Employee information
    'EMPLOYEE_ID': r'EMP-[A-Z]{2}-\d{6}',                  # EMP-IT-123456
    'BADGE_NUMBER': r'BDG-\d{6}',                          # BDG-123456
    
    # Financial information
    'ACCOUNT_NUMBER': r'ACC-\d{12}',                       # ACC-123456789012
    'TRANSACTION_ID': r'TXN-[A-Z0-9]{12}',                # TXN-AB12CD34EF56
    'CREDIT_CARD': r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}',  # 1234-5678-9012-3456
    
    # Contact information
    'PHONE_INDIA': r'(\+91|0)?[6789]\d{9}',               # +919876543210
    'EMAIL': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
    'PASSPORT': r'[A-Z]{1}[0-9]{7}',                      # A1234567
    
    # Location information
    'GPS_COORDS': r'-?\d{1,2}\.\d{6},\s*-?\d{1,3}\.\d{6}',  # 12.123456, 77.123456
    'PINCODE': r'\b\d{6}\b',                              # 123456
    
    # Vehicle information
    'LICENSE_PLATE': r'[A-Z]{2}\d{2}[A-Z]{2}\d{4}',      # TN01AB1234
    'CHASSIS_NUMBER': r'[A-Z0-9]{17}',                    # ABCD12345EFGH6789
    
    # Custom business metrics
    'BATCH_NUMBER': r'BTH-\d{2}-[A-Z]{2}-\d{4}',         # BTH-23-IN-1234
    'LOT_NUMBER': r'LOT-[A-Z]{2}\d{6}',                  # LOT-AB123456
    'WAREHOUSE_ID': r'WH-[A-Z]{2}-\d{3}'                 # WH-IN-001
}

In [4]:
text = """
Employee Dhanushkumar (EMP-IT-123456) processed shipment SHIP-23-12345 for order ORD-23-IN-123456.
Customer can be reached at +919876543210 or danushidk507@gmail.com.
Delivery address: 123 Main St, Chennai 600001
Vehicle TN01AB1234 will deliver container CONT-ABCD1234567.
Transaction TXN-AB12CD34EF56 was processed for INV-2024-IN123456.
GPS Location: 13.123456, 80.123456
Product PRD-EL-123456 from batch BTH-23-IN-1234 is ready for pickup at WH-IN-001.
Credit card: 4321-8765-1234-5678
Passport number: A1234567
"""

In [5]:
masked_text = text
for pattern_type, pattern in custom_patterns.items():
    matches = list(re.finditer(pattern, masked_text))
    for match in reversed(matches):
        start, end = match.span()
        replacement = f'[{pattern_type}]'  
        masked_text = masked_text[:start] + replacement + masked_text[end:]

In [6]:
masked_text

'\nEmployee Dhanushkumar ([EMPLOYEE_ID]) processed shipment [SHIPMENT_NUMBER] for order [ORDER_NUMBER].\nCustomer can be reached at [PHONE_INDIA] or [EMAIL].\nDelivery address: 123 Main St, Chennai [PINCODE]\nVehicle [LICENSE_PLATE] will deliver container [CONTAINER_NUMBER].\nTransaction [TRANSACTION_ID] was processed for [INVOICE_NUMBER].\nGPS Location: [GPS_COORDS]\nProduct [PRODUCT_CODE] from batch [BATCH_NUMBER] is ready for pickup at [WAREHOUSE_ID].\nCredit card: [CREDIT_CARD]\nPassport number: [PASSPORT]\n'

In [8]:
inputs = tokenizer(masked_text, return_tensors="pt", truncation=True, padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
inputs

{'input_ids': tensor([[     1,    260, 116655, 121761,  73175,  60253,    260,  29495, 152995,
          204971,  19327,    291,   3025,  51178,    260,  86735,  71711,   1144,
             492,  80563,  29919,    291,    539, 160835,    440,    333,   5412,
             492,  90155,    291,    539, 160835,  10071,  28750,    739,    391,
             260,  11126,    346,    345,    492, 107409,    291, 123848,    440,
             632,    492,  42391,  10071,  37928,  14131,    268,  19177,   4438,
            1820,    262,  58777,    492,  87158,  58358,    440,  70148,    492,
           83198, 104986,    291,    560, 120895,    440,    899,  15849,    260,
            5895,    492, 187671, 118440,    291,    539, 160835,  10071,    260,
          117693,    492, 178788, 170003,    291,   3025,    440,    640,    260,
           86735,    333,    492,   9390,    562, 210480,    291,    539, 160835,
           10071,  16551,  19614,    268,    492,  82634,    291, 110710, 126909,
   

In [10]:
with torch.no_grad():
    outputs = model(**inputs)

In [12]:
predictions = torch.argmax(outputs.logits, dim=-1)
encoded_inputs = tokenizer.encode_plus(masked_text, return_offsets_mapping=True, add_special_tokens=True)
offset_mapping = encoded_inputs['offset_mapping']

In [11]:
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[-1.3687, -0.3770, -1.8071,  ..., -0.1210, -1.5397,  5.1386],
         [-1.1841, -2.3140, -1.4726,  ..., -0.9898, -2.9343, 13.4018],
         [-1.1459, -2.4127, -1.5615,  ..., -1.2652, -2.9635, 13.4350],
         ...,
         [-1.2477, -1.6528, -1.3613,  ..., -1.1085, -2.6407, 13.4708],
         [-1.0217, -1.2990, -1.7829,  ..., -1.1369, -2.8491, 13.5439],
         [-1.1137, -0.0215, -2.0703,  ..., -0.0444, -1.6419,  8.2437]]]), hidden_states=None, attentions=None)

In [15]:
final_text = list(masked_text)
is_redacting = False
redaction_start = 0
current_pii_type = ''


In [16]:
for i, (start, end) in enumerate(offset_mapping):
    if start == end:  
        continue
    
    label = predictions[0][i].item()
    if label != model.config.label2id['O']:  # If it's a PII token
        pii_type = model.config.id2label[label]
        if not is_redacting:
            is_redacting = True
            redaction_start = start
            current_pii_type = pii_type
        elif pii_type != current_pii_type:
            for j in range(redaction_start, start):
                final_text[j] = ''
            final_text[redaction_start] = f'[{current_pii_type}]'  
            redaction_start = start
            current_pii_type = pii_type
    else:
        if is_redacting:
     
            for j in range(redaction_start, end):
                final_text[j] = ''
            final_text[redaction_start] = f'[{current_pii_type}]'  
            is_redacting = False

In [17]:
if is_redacting:
    for j in range(redaction_start, len(final_text)):
        final_text[j] = ''
    final_text[redaction_start] = f'[{current_pii_type}]'

In [18]:
final_masked_text = ''.join(final_text)

In [19]:
print("Original text:")
print(text)


Original text:

Employee Dhanushkumar (EMP-IT-123456) processed shipment SHIP-23-12345 for order ORD-23-IN-123456.
Customer can be reached at +919876543210 or danushidk507@gmail.com.
Delivery address: 123 Main St, Chennai 600001
Vehicle TN01AB1234 will deliver container CONT-ABCD1234567.
Transaction TXN-AB12CD34EF56 was processed for INV-2024-IN123456.
GPS Location: 13.123456, 80.123456
Product PRD-EL-123456 from batch BTH-23-IN-1234 is ready for pickup at WH-IN-001.
Credit card: 4321-8765-1234-5678
Passport number: A1234567



In [20]:
print("\nMasked text:")
print(final_masked_text)


Masked text:

Employee[I-GIVENNAME]([EMPLOYEE_ID]) processed shipment [SHIPMENT_NUMBER] for order [ORDER_NUMBER].
Customer can be reached at [PHONE_INDIA] or [EMAIL].
Delivery address:[I-BUILDINGNUM] St,[I-CITY]PINCODE]
Vehicle [LICENSE_PLATE] will deliver container [CONTAINER_NUMBER].
Transaction [TRANSACTION_ID] was processed for [INVOICE_NUMBER].
GPS Location: [GPS_COORDS]
Product [PRODUCT_CODE] from batch [BATCH_NUMBER] is ready for pickup at [WAREHOUSE_ID].
Credit card: [CREDIT_CARD]
Passport number: [PASSPORT]

