<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/docs/examples/node_postprocessor/PII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PII Masking

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main.zip -d /content/drive/MyDrive/KUBIG/NLP/

Archive:  /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main.zip
9d8a7cbb87729be502479a549fc0656b4e510425
   creating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/
   creating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/BERT_train/
  inflating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/BERT_train/bert_train_pipeline.ipynb  
  inflating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/BERT_train/compute_f1score.py  
  inflating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/README.md  
   creating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/data/
  inflating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/data/Define.txt  
  inflating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/format-construction.ipynb  
   creating: /content/drive/MyDrive/KUBIG/NLP/Korean-health-Prompt-NER-main/openai-access/
   creating: /content/drive/MyDrive

In [7]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Load the tokenizer and the pre-trained model
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
model = BertForTokenClassification.from_pretrained('monologg/kobert', num_labels=15)

# Define the labels based on the 7 categories
label_map = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-ORG',
    4: 'I-ORG',
    5: 'B-EDU',
    6: 'I-EDU',
    7: 'B-AFF',
    8: 'I-AFF',
    9: 'B-POS',
    10: 'I-POS',
    11: 'B-LOC',
    12: 'I-LOC',
    13: 'B-DUR',
    14: 'I-DUR'
}

# Function to predict and tag BIO labels automatically
def predict_ner(text):
    # Tokenize the input sentence
    tokens = tokenizer(text, return_tensors="pt", is_split_into_words=True, truncation=True, padding=True)
    # Get predictions from the model
    output = model(**tokens)
    # Find the highest probability predictions
    predictions = torch.argmax(output.logits, dim=2)
    # Map predictions to labels
    predicted_tags = [label_map[p.item()] for p in predictions[0]]
    return list(zip(text, predicted_tags))

# Example sentences for prediction
new_sentence = ["코리아텍에서", "보안팀", "주임으로", "일하던", "중", "다른", "부서", "차장님께서", "다급히", "연락이", "왔던", "적이", "있었습니다."]
new_sentence2 = ["따라서", "디자인팀에서", "먼저", "웹사이트", "메인", "배너", "디자인", "교체", "필요성을", "주장했습니다."]

# Predict and print the labels
print(predict_ner(new_sentence))
print(predict_ner(new_sentence2))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[('코리아텍에서', 'I-LOC'), ('보안팀', 'I-PER'), ('주임으로', 'B-AFF'), ('일하던', 'B-AFF'), ('중', 'B-AFF'), ('다른', 'B-AFF'), ('부서', 'B-AFF'), ('차장님께서', 'B-AFF'), ('다급히', 'O'), ('연락이', 'B-AFF'), ('왔던', 'B-AFF'), ('적이', 'I-LOC'), ('있었습니다.', 'I-LOC')]
[('따라서', 'I-LOC'), ('디자인팀에서', 'I-LOC'), ('먼저', 'O'), ('웹사이트', 'O'), ('메인', 'B-AFF'), ('배너', 'B-AFF'), ('디자인', 'B-AFF'), ('교체', 'B-AFF'), ('필요성을', 'O'), ('주장했습니다.', 'I-LOC')]


In [9]:
from transformers import BertTokenizer, BertForTokenClassification
import torch
from torch.nn import functional as F

# Load the tokenizer and model for KPF-bert-ner
tokenizer = BertTokenizer.from_pretrained('KPF/KPF-bert-ner')
model = BertForTokenClassification.from_pretrained('KPF/KPF-bert-ner')

# Input sentences
sentences = [
    "코리아텍에서 보안팀 주임으로 일하던 중 다른 부서 차장님께서 다급히 연락이 왔던 적이 있었습니다.",
    "안녕하십니까. 한양대학교 김석사입니다. 저는 경기도 안산시 상록구에 거주하고 있으며 2003년부터 동원F&B 영업부 대리로 근무했던 경력이 있습니다."
]

# Tokenize the sentences
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, is_split_into_words=False)

# Perform model inference
with torch.no_grad():
    outputs = model(**inputs).logits

# Get predicted token labels
predictions = torch.argmax(F.softmax(outputs, dim=2), dim=2)

# Check the label map from the KPF/KPF-bert-ner model (this might be included in the model's config or documentation)
# Assuming these are the labels defined by the KPF/KPF-bert-ner model
label_map = model.config.id2label

# Convert tokens and predictions to BIO tags
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
bio_tags = [label_map[pred.item()] for pred in predictions[0]]

# Display the results
for token, tag in zip(tokens, bio_tags):
    if token not in ["[CLS]", "[SEP]", "[PAD]"]:
        print(f"{token}: {tag}")

# Example output for visualization
for sentence, token_list, tag_list in zip(sentences, inputs['input_ids'], predictions):
    print("\nOriginal Sentence: ", sentence)
    print("Tokens and Tags:")
    for token, pred in zip(token_list, tag_list):
        token_text = tokenizer.decode([token])
        tag = label_map[pred.item()]
        if token_text not in ["[CLS]", "[SEP]", "[PAD]"]:
            print(f"{token_text} [{tag}]")


tokenizer_config.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/850k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/14.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/455M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


코리아: LABEL_82
##텍: LABEL_232
##에서: LABEL_299
보안: LABEL_299
##팀: LABEL_299
주임: LABEL_35
##으로: LABEL_299
일: LABEL_299
##하: LABEL_299
##던: LABEL_299
중: LABEL_299
다른: LABEL_299
부서: LABEL_299
차장: LABEL_35
##님: LABEL_299
##께: LABEL_299
##서: LABEL_299
다급: LABEL_299
##히: LABEL_299
연락: LABEL_299
##이: LABEL_299
왔: LABEL_299
##던: LABEL_299
적: LABEL_299
##이: LABEL_299
있: LABEL_299
##었: LABEL_299
##습: LABEL_299
##니다: LABEL_299
.: LABEL_299

Original Sentence:  코리아텍에서 보안팀 주임으로 일하던 중 다른 부서 차장님께서 다급히 연락이 왔던 적이 있었습니다.
Tokens and Tags:
코리아 [LABEL_82]
##텍 [LABEL_232]
##에서 [LABEL_299]
보안 [LABEL_299]
##팀 [LABEL_299]
주임 [LABEL_35]
##으로 [LABEL_299]
일 [LABEL_299]
##하 [LABEL_299]
##던 [LABEL_299]
중 [LABEL_299]
다른 [LABEL_299]
부서 [LABEL_299]
차장 [LABEL_35]
##님 [LABEL_299]
##께 [LABEL_299]
##서 [LABEL_299]
다급 [LABEL_299]
##히 [LABEL_299]
연락 [LABEL_299]
##이 [LABEL_299]
왔 [LABEL_299]
##던 [LABEL_299]
적 [LABEL_299]
##이 [LABEL_299]
있 [LABEL_299]
##었 [LABEL_299]
##습 [LABEL_299]
##니다 [LABEL_299]
. [LABEL_299]

Original Sente

In [12]:
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Load the pre-trained model
model_name = "KPF/KPF-bert-ner"  # Replace this with the correct model name
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Create a pipeline for NER
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Input sentence
sentence = "여야 원내대표가 16일 오후 김진표 국회의장 주재로 다시 얼굴을 맞대고 내년도 예산안 협상을 이어갔지만 기존 입장만 되풀이하며 진전을 보지 못했다.이날 회동은 전날 김 의장이 내놓은 중재안을 국민의힘이 받아들이지 않으면서, 예산안 협상이 또 불발된 이후 첫 만남이었다.양당 원내대표는 이날도 서로에게 양보를 요구하며 지루한 대치 국면을 이어갔다.국민의힘 주호영 원내대표는 예산안 처리 법정 기한과 정기국회 기간이 도과한 지 꽤 됐는데도 불구하고 내년도 예산안을 합의 처리 못 해 국민께 죄송하다며 입을 열었다. 이어 헌법이나 법률에도 예산 편성과 운영에는 정부에 주도권을 주고 있다며 정부가 위기의 순간에 빠르게, 계획대로 재정 운용을 집행할 수 있게 협조해 달라고 민주당에 간곡히 부탁드린다고 말했다."

# Get the NER results
ner_results = ner_pipeline(sentence)

# Print the results
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: 여야, Label: LABEL_299
Entity: 원내대표, Label: LABEL_35
Entity: ##가, Label: LABEL_299
Entity: 16, Label: LABEL_43
Entity: ##일, Label: LABEL_193
Entity: 오후, Label: LABEL_123
Entity: 김진표, Label: LABEL_96
Entity: 국회, Label: LABEL_35
Entity: ##의, Label: LABEL_185
Entity: ##장, Label: LABEL_185
Entity: 주재, Label: LABEL_299
Entity: ##로, Label: LABEL_299
Entity: 다시, Label: LABEL_299
Entity: 얼굴, Label: LABEL_299
Entity: ##을, Label: LABEL_299
Entity: 맞대, Label: LABEL_299
Entity: ##고, Label: LABEL_299
Entity: 내년도, Label: LABEL_299
Entity: 예산안, Label: LABEL_299
Entity: 협상, Label: LABEL_299
Entity: ##을, Label: LABEL_299
Entity: 이어, Label: LABEL_299
Entity: ##갔, Label: LABEL_299
Entity: ##지만, Label: LABEL_299
Entity: 기존, Label: LABEL_299
Entity: 입장, Label: LABEL_299
Entity: ##만, Label: LABEL_299
Entity: 되풀이, Label: LABEL_299
Entity: ##하, Label: LABEL_299
Entity: ##며, Label: LABEL_299
Entity: 진전, Label: LABEL_299
Entity: ##을, Label: LABEL_299
Entity: 보, Label: LABEL_299
Entity: ##지, Label: LABEL_2

If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.

In [None]:
%pip install llama-index-llms-openai
%pip install llama-index-llms-huggingface

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.1.29-py3-none-any.whl.metadata (650 bytes)
Collecting llama-index-core<0.11.0,>=0.10.57 (from llama-index-llms-openai)
  Downloading llama_index_core-0.10.64-py3-none-any.whl.metadata (2.5 kB)
Collecting openai<2.0.0,>=1.40.0 (from llama-index-llms-openai)
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.57->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.57->llama-index-llms-openai)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11.0,>=0.10.57->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting httpx (from llama-index-core<0.11.0,>=0.10.57->llama-index-llms-openai)
  Downloading httpx-0.27.

In [None]:
!pip install llama-index

In [None]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core.postprocessor import (
    PIINodePostprocessor,
    NERPIINodePostprocessor,
)
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.schema import TextNode

In [None]:
# load documents
text = """
Hello Paulo Santos. The latest statement for your credit card account \
1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109.
"""
node = TextNode(text=text)

### Option 1: Use NER Model for PII Masking

Use a Hugging Face NER model for PII Masking

In [19]:
processor = NERPIINodePostprocessor()

In [20]:
from llama_index.core.schema import NodeWithScore

new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [21]:
# view redacted text
new_nodes[0].node.get_text()

'Hello [ORG_7]. The latest statement for your credit card account 1111-0000-1111-0000 was mailed to 123 [ORG_109] [LOC_113], [LOC_121], [LOC_130] 98109.'

In [22]:
# get mapping in metadata
# NOTE: this is not sent to the LLM!
new_nodes[0].node.metadata["__pii_node_info__"]

{'[ORG_7]': 'Paulo Santos',
 '[ORG_109]': 'Any',
 '[LOC_113]': 'Street',
 '[LOC_121]': 'Seattle',
 '[LOC_130]': 'WA'}

### Option 2: Use LLM for PII Masking

NOTE: You should be using a *local* LLM model for PII masking. The example shown is using OpenAI, but normally you'd use an LLM running locally, possibly from huggingface. Examples for local LLMs are [here](https://gpt-index.readthedocs.io/en/latest/how_to/customization/custom_llms.html#example-using-a-huggingface-llm).

In [52]:
from llama_index.llms.huggingface import HuggingFaceLLM

# Zephyr 모델을 사용하여 HuggingFaceLLM 인스턴스 생성
locally_run = HuggingFaceLLM(model_name="HuggingFaceH4/zephyr-7b-alpha")

# 로컬 LLM을 사용하여 PIINodePostprocessor 생성
processor = PIINodePostprocessor(llm=locally_run)

config.json:   0%|          | 0.00/628 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



Some parameters are on the meta device device because they were offloaded to the disk and cpu.
Some parameters are on the meta device device because they were offloaded to the disk and cpu.
Some parameters are on the meta device device because they were offloaded to the disk and cpu.


tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The model `HuggingFaceH4/zephyr-7b-alpha` and tokenizer `StabilityAI/stablelm-tuned-alpha-3b` are different, please ensure that they are compatible.
The model `HuggingFaceH4/zephyr-7b-alpha` and tokenizer `StabilityAI/stablelm-tuned-alpha-3b` are different, please ensure that they are compatible.
The model `HuggingFaceH4/zephyr-7b-alpha` and tokenizer `StabilityAI/stablelm-tuned-alpha-3b` are different, please ensure that they are compatible.


AttributeError: 'str' object has no attribute 'node'

In [53]:
from llama_index.core.schema import NodeWithScore
new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# view redacted text
new_nodes[0].node.get_text()

'Hello [NAME]. The latest statement for your credit card account [CREDIT_CARD_NUMBER] was mailed to [ADDRESS].'

In [None]:
# get mapping in metadata
# NOTE: this is not sent to the LLM!
new_nodes[0].node.metadata["__pii_node_info__"]

{'NAME': 'Paulo Santos',
 'CREDIT_CARD_NUMBER': '1111-0000-1111-0000',
 'ADDRESS': '123 Any Street, Seattle, WA 98109'}

### Option 3: Use Presidio for PII Masking

Use presidio to identify and anonymize PII

In [None]:
# load documents
text = """
Hello Paulo Santos. The latest statement for your credit card account \
4095-2609-9393-4932 was mailed to Seattle, WA 98109. \
IBAN GB90YNTU67299444055881 and social security number is 474-49-7577 were verified on the system. \
Further communications will be sent to paulo@presidio.site
"""
presidio_node = TextNode(text=text)

In [None]:
from llama_index.postprocessor.presidio import PresidioPIINodePostprocessor

processor = PresidioPIINodePostprocessor()

In [None]:
from llama_index.core.schema import NodeWithScore

presidio_new_nodes = processor.postprocess_nodes(
    [NodeWithScore(node=presidio_node)]
)

In [None]:
# view redacted text
presidio_new_nodes[0].node.get_text()

In [None]:
# get mapping in metadata
# NOTE: this is not sent to the LLM!
presidio_new_nodes[0].node.metadata["__pii_node_info__"]

### Feed Nodes to Index

In [None]:
# feed into index
index = VectorStoreIndex([n.node for n in new_nodes])

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 30 tokens
> [build_index_from_nodes] Total embedding token usage: 30 tokens


In [None]:
response = index.as_query_engine().query(
    "What address was the statement mailed to?"
)
print(str(response))

INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens
> [retrieve] Total embedding token usage: 8 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 71 tokens
> [get_response] Total LLM token usage: 71 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens

[ADDRESS]
