# Named Entity Recognition (NER) with Transformers (Encoder)

#### Install Required Libraries

In [None]:
%pip install --quiet faker
%pip install --quiet scikit-learn
%pip install --quiet transformers[torch]
%pip install --quiet seqeval
%pip install --quiet tensorflow
%pip install --quiet tf-keras
%pip install --quiet torch --quiet
%pip install --quiet datasets --quiet
%pip install --quiet evaluate --quiet

In [None]:
# Ask the gcloud CLI for the active project; the `!` capture returns the
# command output as a list of lines.
PROJECT_ID = !gcloud config list --format 'value(core.project)'
# Keep only the first output line (this raises IndexError if gcloud printed nothing).
PROJECT_ID = PROJECT_ID[0]
REGION = "us-central1"
# Export the detected project id as the GOOGLE_CLOUD_PROJECT environment variable.
%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}
# NOTE(review): BUCKET_NAME is not referenced anywhere else in the visible
# notebook (uploads use `gcs_bucket`) - confirm it is still needed.
BUCKET_NAME=f'dataflow_demo_{PROJECT_ID}'

In [None]:
# Use the exact model you plan to train (bert-base, roberta, etc.).
# The tokenizer used to generate the training file is loaded from this name.
MODEL_NAME = "google-bert/bert-base-multilingual-cased"
# Local directory the fine-tuned model and tokenizer are saved to.
FINETUNED_MODEL_PATH = "./ner_finetuned_v2"
# Generated training file: one "token label" pair per line, blank line between sentences.
DATASET = "./train_bert_ner_1k.txt"
# GCS destination the saved model directory is uploaded to.
gcs_bucket = "gs://bert-finetuning-ner-demo"

In [None]:
import os
from google.cloud import aiplatform

# Initialise the Vertex AI SDK.
# The PROJECT_ID / LOCATION environment variables are only exported by a later
# cell, so on a fresh "Restart & Run All" they are still unset here. Fall back
# to the project and region detected at the top of the notebook instead of
# silently passing None.
aiplatform.init(
    project=os.getenv("PROJECT_ID", PROJECT_ID),
    location=os.getenv("LOCATION", REGION),
)

#### Utility methods

In [None]:
from google.cloud import storage
import os

def upload_to_gcs(model_dir, bucket_name):
    """
    Uploads a fine-tuned model directory to Google Cloud Storage (GCS).

    Every file below ``model_dir`` is uploaded; its object name is the file's
    path relative to ``model_dir`` (always "/"-separated), placed under the
    optional prefix taken from ``bucket_name``.

    Args:
        model_dir: Local directory to upload (walked recursively).
        bucket_name: Destination, given as "my-bucket", "gs://my-bucket" or
            "gs://my-bucket/some/prefix".

    Raises:
        FileNotFoundError: If ``model_dir`` is not an existing directory
            (otherwise the walk would silently upload nothing).
    """
    if not os.path.isdir(model_dir):
        raise FileNotFoundError(f"Model directory not found: {model_dir}")

    # Drop the gs:// scheme, then split "bucket/optional/prefix" so a
    # destination with a folder is not mistaken for a bucket name.
    location = bucket_name.replace("gs://", "").strip("/")
    bucket_name, _, prefix = location.partition("/")
    prefix = prefix.strip("/")

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for root, _dirs, files in os.walk(model_dir):
        for file in files:
            local_path = os.path.join(root, file)
            # Object names use "/" regardless of the local OS separator.
            gcs_path = os.path.relpath(local_path, model_dir).replace(os.sep, "/")
            if prefix:
                gcs_path = f"{prefix}/{gcs_path}"
            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_path)
            print(f"Uploaded {local_path} to gs://{bucket_name}/{gcs_path}")

In [None]:
import random
import re
import json
from faker import Faker
from transformers import AutoTokenizer

# 1. Initialize Faker and the tokenizer of the model that will be fine-tuned
fake = Faker()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Maps every placeholder name that may appear in a template (e.g. {NAME}) to a
# zero-argument callable returning a fresh fake value for that entity type.
ENTITY_TYPES = {
    "NAME": fake.name,
    # Faker addresses contain a line break; flatten them to a single line.
    "ADDRESS": lambda: fake.address().replace('\n', ', '),
    "EMAIL": fake.email,
    "PHONE_NUMBER": fake.phone_number,
}

# Load the pre-generated sentence templates. Each template contains
# placeholders such as {NAME} marking where sensitive data is inserted;
# one template is picked at random per generated sentence.
# NOTE(review): the file is assumed to hold a JSON list of strings - confirm.
templates_file = "./generated_templates_v2.json"
# Read explicitly as UTF-8 so loading does not depend on the platform's
# default encoding (matches the other file reads/writes in this notebook).
with open(templates_file, 'r', encoding="utf-8") as infile:
    TEMPLATES = json.load(infile)

def generate_sentence_with_spans():
    """
    Fill a randomly chosen template with fake entity values.

    Returns:
        A ``(text, entities)`` tuple where ``text`` is the finished sentence
        and ``entities`` lists, in order of appearance, one dict per filled
        placeholder with the keys ``type``, ``start``, ``end`` (character
        offsets into ``text``, end exclusive) and ``value``.
    """
    chosen = random.choice(TEMPLATES)

    pieces = []     # literal and generated fragments, joined at the end
    entities = []
    written = 0     # number of characters emitted so far
    cursor = 0      # position in the template just after the previous placeholder

    for hit in re.finditer(r'\{(.*?)\}', chosen):
        # Literal template text between the previous placeholder and this one.
        literal = chosen[cursor:hit.start()]
        pieces.append(literal)
        written += len(literal)

        # Produce the value for this placeholder and note where it lands.
        kind = hit.group(1)
        value = ENTITY_TYPES[kind]()
        entities.append({
            "type": kind,
            "start": written,
            "end": written + len(value),
            "value": value
        })
        pieces.append(value)
        written += len(value)

        cursor = hit.end()

    # Whatever follows the last placeholder (or the whole template if none).
    pieces.append(chosen[cursor:])

    return "".join(pieces), entities

def generate_bert_train_data(num_sentences, output_file):
    """
    Write ``num_sentences`` synthetic sentences to ``output_file`` in a
    CoNLL-like layout: one "token label" pair per line and an empty line
    after every sentence.

    Each sentence is tokenized with the module-level ``tokenizer``; the
    character offsets of the tokens are compared with the generated entity
    spans to assign B-/I-/O labels. The first and last token of every
    encoding ([CLS] and [SEP]) are left out of the file.
    """

    def _label_for(tok_start, tok_end, spans):
        # A token gets an entity label only when it lies completely inside
        # the entity's character span; the first matching span wins.
        for span in spans:
            if span["start"] <= tok_start and tok_end <= span["end"]:
                # B- when the token opens the entity, I- otherwise.
                if tok_start == span["start"]:
                    return f"B-{span['type']}"
                return f"I-{span['type']}"
        return "O"

    with open(output_file, "w", encoding="utf-8") as out:
        for _ in range(num_sentences):
            # Raw text plus the character spans of its entities.
            text, entity_spans = generate_sentence_with_spans()

            # return_offsets_mapping=True yields (start_char, end_char) per token.
            encoding = tokenizer(text, return_offsets_mapping=True, truncation=True)
            tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
            offsets = encoding["offset_mapping"]

            rows = []
            for position in range(1, len(tokens) - 1):  # drop [CLS] and [SEP]
                start_char, end_char = offsets[position]
                tag = _label_for(start_char, end_char, entity_spans)
                rows.append(f"{tokens[position]} {tag}")

            # Sentence lines followed by the blank separator line.
            out.write("".join(row + "\n" for row in rows))
            out.write("\n")

# --- Execution ---
# Generate 1000 synthetic sentences and write them to DATASET; the file is
# opened in write mode, so any previous content is replaced.
print("Generating Training Data...\n")
generate_bert_train_data(num_sentences=1000, output_file=DATASET)
print("--- Generation Complete ---")

### Fine-tune BERT for NER

In [None]:
import json
#import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import evaluate
from sklearn.model_selection import train_test_split

# Load the generated training file as a list of raw lines
# (one "token label" pair per line, blank line between sentences).
with open(DATASET, "r", encoding="utf-8") as f:
    raw_data = f.readlines()

# Parse dataset into sentences and labels
def process_conll_data(raw_lines):
    """
    Group "token label" lines into sentences.

    Args:
        raw_lines: Iterable of text lines. A line that is empty or contains
            only whitespace ends the current sentence; every other line must
            contain at least two whitespace-separated fields, of which the
            first is the token and the second the label.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence,
        the list of tokens and the list of labels.
    """
    sentences, labels = [], []
    tokens, tags = [], []

    for raw in raw_lines:
        fields = raw.split()  # [] for blank / whitespace-only lines
        if fields:
            tokens.append(fields[0])
            tags.append(fields[1])
            continue

        # Separator line: close the sentence, unless nothing was collected
        # (e.g. several blank lines in a row).
        if tokens:
            sentences.append(tokens)
            labels.append(tags)
            tokens, tags = [], []

    # The input may end without a trailing blank line; keep that last sentence.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return sentences, labels

sentences, labels = process_conll_data(raw_data)

# Fixed label inventory: "O" plus a B-/I- pair for every generated entity type.
# The list position of a label is its integer class id.
unique_labels = ['O',
                 'B-ADDRESS',
                 'I-ADDRESS',
                 'B-PHONE_NUMBER',
                 'I-PHONE_NUMBER',
                 'B-NAME',
                 'I-NAME',
                 'B-EMAIL',
                 'I-EMAIL']

label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Base BERT model. Reuse MODEL_NAME so the tokenizer used for training is
# always the same one that produced the training file, even if the model is
# changed in the configuration cell.
model_name = MODEL_NAME
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Tokenize dataset
def tokenize_and_align_labels(sentences, labels):
    """
    Tokenize pre-split sentences and attach per-token label ids.

    Args:
        sentences: List of sentences, each a list of words.
        labels: List of label-name lists, parallel to ``sentences``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        token position the entry holds the label id of the word when the
        token is the first piece of that word, and -100 for positions with no
        word (special tokens, padding) and for the remaining pieces of a word.
    """
    encoded = tokenizer(sentences, truncation=True, padding=True, is_split_into_words=True)

    def _align(word_ids, word_labels):
        row = []
        # Pair each position with the word index of the position before it
        # (None in front of the first position).
        for prev_idx, word_idx in zip([None, *word_ids], word_ids):
            opens_word = word_idx is not None and word_idx != prev_idx
            row.append(label2id[word_labels[word_idx]] if opens_word else -100)
        return row

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row_no), word_labels)
        for row_no, word_labels in enumerate(labels)
    ]
    return encoded

# Split sentences and their labels into train (80%) and test (20%) sets;
# random_state is fixed so the split is the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenize each split and wrap it in a Hugging Face Dataset
train_data = Dataset.from_dict(tokenize_and_align_labels(train_texts, train_labels))
test_data = Dataset.from_dict(tokenize_and_align_labels(test_texts, test_labels))

dataset = DatasetDict({"train": train_data, "test": test_data})

# The number of classes comes from the fixed `unique_labels` list,
# not from the labels actually present in the data file.
print(f"labels:[{unique_labels}]")
num_labels = len(unique_labels)

# Load the base model with a token-classification head sized for our labels
# and with the id <-> label-name mappings attached.
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Define training arguments: evaluate and save once per epoch, 2 epochs,
# batch size 8 for both training and evaluation.
training_args = TrainingArguments(
    output_dir="./ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable all logging integrations
)

# Compute metrics
# Load the seqeval metric through the evaluate library; it is used by
# compute_metrics below.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """
    Score predictions with the module-level ``metric``.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the classes on axis 2; ``labels`` holds the gold
            label ids, with -100 at positions that must be ignored.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    def _decode(id_rows):
        # Translate ids to label names, keeping only positions whose gold id is not -100.
        return [
            [id2label[value] for value, gold in zip(id_row, gold_row) if gold != -100]
            for id_row, gold_row in zip(id_rows, gold_ids)
        ]

    true_labels = _decode(gold_ids)
    pred_labels = _decode(predicted_ids)

    # zero_division=0: report 0 instead of warning when a score is undefined.
    results = metric.compute(predictions=pred_labels, references=true_labels, zero_division=0)

    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# Trainer: wires together the model, the training arguments, the train/test
# datasets, the tokenizer and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train model
trainer.train()

# Evaluate model on the test split.
# NOTE: the returned metrics are neither stored nor printed here.
trainer.evaluate()

# Save the fine-tuned model and its tokenizer into the same local directory
model_directory = FINETUNED_MODEL_PATH
trainer.save_model(model_directory)
tokenizer.save_pretrained(model_directory)

# Upload the saved model directory to Google Cloud Storage
upload_to_gcs(model_directory, gcs_bucket)

In [None]:
# Quick test for predictions
#import torch
from transformers import BertTokenizerFast, BertForTokenClassification, pipeline

# Load the fine-tuned model and tokenizer from the directory the training
# cell saved them to (FINETUNED_MODEL_PATH), so this quick test always
# exercises the model that was just trained.
model_path = FINETUNED_MODEL_PATH
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)

# Create a Named Entity Recognition (NER, technically "Token Classification") pipeline.
# aggregation_strategy="first" makes the pipeline return grouped entities
# (read below through the 'entity_group' key).
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first")

# Function to predict entities in text
def predict_entities(text):
    """Run the module-level NER pipeline on ``text`` and return its output unchanged."""
    return ner_pipeline(text)

# Example input text containing an address, a person name and an e-mail
text = "Send the invoice to 123 Main Street, New York. Contact John Connnor at johnconnor@example.com"

# Generate predictions
predictions = predict_entities(text)
# Dump the raw pipeline output first ...
print(predictions)
# ... then print one formatted record per predicted entity
print("Raw predictions:")
for entity in predictions:
    print("-"*80)
    print(f"""Entity: {entity['word']}, 
          Type: {entity['entity_group']}, 
          Confidence: {entity['score']:.4f}, 
          Position: ({entity['start']}, {entity['end']})""")
    print("-"*80)

def mask_sensetive_data(text, predictions):
    """
    Replace every predicted entity span in ``text`` with ``[ENTITY_GROUP]``.

    Masking is done by character offsets, so only the predicted span is
    replaced - other occurrences of the same substring elsewhere in the text
    are left untouched, and placeholders already inserted are never re-matched.

    Args:
        text: Original text.
        predictions: Iterable of dicts with ``start`` / ``end`` character
            offsets into ``text`` and an ``entity_group`` name.

    Returns:
        ``{"text": <original text>, "redacted_text": <masked text>}``.
    """
    spans = sorted(
        ((int(p['start']), int(p['end']), p['entity_group']) for p in predictions),
        key=lambda span: (span[0], span[1]),
    )

    pieces = []
    cursor = 0  # first character of `text` not yet copied or masked
    for start, end, group in spans:
        if end <= start:
            continue  # empty span: nothing to mask
        if start < cursor:
            # Overlaps a span that is already masked: emit no second
            # placeholder, but make sure the overlapping tail stays hidden.
            cursor = max(cursor, end)
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{group}]")
        cursor = end
    pieces.append(text[cursor:])

    return {"text": text, "redacted_text": "".join(pieces)}

# Hand-written test sentences covering phone number, name and address entities
input_texts = [
    "My phone number is 001-863-838-7300x0830.",
    "My name is Jessica Williams.",
    "My name is Gabriel Ryan.",
    "My address is 150 Cortez Station Apt. 561, South Amberburgh, OH 44484.",
    "The meeting with Maria Garcia from Google will be held at their office located at 48 Pirrama Road, Pyrmont NSW 2009."
]

# Run inference on every sentence and mask the predicted entities
results = [mask_sensetive_data(text, predict_entities(text)) for text in input_texts]

print("\n\n output for test records:")
print("-------------------")

# Each result is a dict holding the original and the redacted text
for result in results:
    print(result)

In [None]:
####
# Inference pipeline V1 based on Vertex AI online Endpoint
####
#Setting Google Cloud env variables
%env PROJECT_ID=oleksandr-demo
%env LOCATION=us-central1
%env CONTAINER_URI=us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-pytorch-inference-cu121.2-2.transformers.4-44.ubuntu2204.py311

In [None]:
# Point the gcloud CLI at the project used by this notebook.
# NOTE(review): `$PROJECT_ID` may be filled in by IPython from the Python
# variable of the same name rather than by the shell from the environment
# variable - confirm which value is intended.
!gcloud config set project $PROJECT_ID

In [None]:
# Register the fine-tuned model in the Vertex AI Model Registry.
# artifact_uri points at `gcs_bucket`, the same location the training cell
# uploaded the model files to, so the two can never drift apart.
model = aiplatform.Model.upload(
    display_name="bert-ner-demo-v2",
    serving_container_image_uri=os.getenv("CONTAINER_URI"),
    artifact_uri=gcs_bucket,
    serving_container_environment_variables={
        "HF_MODEL_DIR": "/tmp/models",
        "HF_TASK": "token-classification",
    },
)
# Block until the upload operation has finished.
model.wait()

In [None]:
# Create the Vertex AI Endpoint the model will be deployed to.
# NOTE(review): re-running this cell presumably creates another endpoint with
# the same display name - confirm before re-running.
endpoint = aiplatform.Endpoint.create(display_name="bert-ner-demo-endpoint-v3")

In [None]:
# Deploy the uploaded model to the endpoint created above,
# served from an n2-standard-4 machine.
deployed_model = model.deploy(
    endpoint=endpoint,
    machine_type="n2-standard-4",
)

In [None]:
# Show the display name and the `name` attribute of the created endpoint
print(endpoint.display_name)
print(endpoint.name)

In [None]:
display_name="bert-ner-demo-endpoint-v3"
# Look the endpoint up again by display name, so the prediction cells also
# work in a fresh kernel where the deploy cells were not re-run.
endpoints = aiplatform.Endpoint.list(
    filter=f'display_name="{display_name}"',
    order_by="create_time desc" # Most recently created first, in case duplicates exist
)
# Fail with a clear message instead of a bare IndexError when nothing matches.
if not endpoints:
    raise RuntimeError(
        f'No Vertex AI endpoint found with display_name="{display_name}"; '
        "create and deploy it first."
    )
# NOTE: despite its name, `deployed_model` is the first entry returned by
# Endpoint.list(); the prediction cell calls .predict() on it.
deployed_model = endpoints[0]

In [None]:
# Send two example sentences to the deployed endpoint in a single request
output = deployed_model.predict(instances=[
    "Send the invoice to 123 Main Street, New York. Contact John Connnor at johnconnor@example.com.",
    "The meeting with Maria Garcia will be held at their office located at 48 Pirrama Road, Pyrmont NSW 2009."
    ])

# Print the response object exactly as returned, before any post-processing
print("RAW Predictions")
print("-"*80)
print(output)
print("-"*80)

import re

def _get_core_type(pred):
    """
    Return the entity type of a prediction without its B-/I- prefix.

    The label is read from the first non-empty key among 'entity_group',
    'entity' and 'label', so 'B-PHONE', 'I-PHONE' and 'PHONE' all yield
    'PHONE'. Returns "UNKNOWN" when none of the keys holds a value.
    """
    label = pred.get('entity_group') or pred.get('entity') or pred.get('label')
    if not label:
        return "UNKNOWN"

    # Strip a leading B- or I- marker.
    if label.startswith(("B-", "I-")):
        return label[2:]
    return label


def _new_group(pred):
    """Start a merged-entity record from a single prediction."""
    return {
        "entity_group": _get_core_type(pred),
        "score": pred['score'],
        "word": pred['word'],
        "start": pred['start'],
        "end": pred['end']
    }


def postprocess_predictions(predictions, merge_distance=1):
    """
    Merges adjacent entities of the same type.

    Args:
        predictions: List of dicts returned by the pipeline. Each needs
            'start', 'end', 'score', 'word' and a label under
            'entity_group', 'entity' or 'label'.
        merge_distance: Max characters allowed between entities to merge them.
                        0 = must be touching (e.g., "123" + "-")
                        1 = allows spaces (e.g., "John" + " " + "Doe")

    Returns:
        A new list of dicts with the keys 'entity_group' (prefix stripped),
        'score' (maximum over the merged parts), 'word', 'start' and 'end',
        ordered by start offset. The input dicts are not modified.

    Note:
        The merged 'word' is rebuilt from the token texts. Gaps are filled
        with spaces, because the original text is not available here, so it
        can differ from the source text when the gap was another character.
    """
    if not predictions:
        return []

    # Process in reading order.
    sorted_preds = sorted(predictions, key=lambda x: x['start'])

    merged = []
    current_group = _new_group(sorted_preds[0])

    for next_pred in sorted_preds[1:]:
        next_type = _get_core_type(next_pred)

        # Characters between the end of the current group and the next
        # prediction (negative when they overlap).
        gap = int(next_pred['start'] - current_group['end'])

        # MERGE CONDITION: same entity type and close enough.
        if next_type == current_group['entity_group'] and gap <= merge_distance:
            # Only extend when the next prediction reaches past the current
            # group. A prediction lying inside the group is already covered:
            # appending it would duplicate its text and shrink 'end'.
            if next_pred['end'] > current_group['end']:
                sep = " " * gap  # "" when touching or overlapping
                clean_word = next_pred['word'].replace("##", "")  # drop subword marker
                current_group['word'] += sep + clean_word
                current_group['end'] = next_pred['end']

            # Keep the highest confidence seen in the group.
            current_group['score'] = max(current_group['score'], next_pred['score'])
        else:
            # NO MERGE: close the current group and start a new one.
            merged.append(current_group)
            current_group = _new_group(next_pred)

    # Append the final group
    merged.append(current_group)

    # Final cleanup of words (in case the first word of a group had ##)
    for m in merged:
        m['word'] = m['word'].replace("##", "")

    return merged
# Merge and print the predictions of every instance sent to the endpoint.
# NOTE: the loop variable rebinds the notebook-level name `predictions`.
print("Postprocessed Predictions")
print("-"*80)
for predictions in output.predictions:
    print(postprocess_predictions(predictions))
print("-"*80)

In [None]:
# deployed_model.undeploy_all()
# deployed_model.delete()
# model.delete()

Copyright 2025 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.