# Dataflow NER Pipeline with BERT

#### Install Required Libraries

In [None]:
%pip install --quiet scikit-learn
%pip install --quiet transformers[torch]
%pip install --quiet seqeval
%pip install --quiet tensorflow
%pip install --quiet tf-keras
%pip install --quiet torch --quiet
%pip install --quiet datasets --quiet
%pip install --quiet evaluate --quiet

In [None]:
# Capture the active project ID from the local gcloud configuration; the
# captured shell output is indexable and its first entry is the project ID.
PROJECT_ID = !gcloud config list --format 'value(core.project)'
PROJECT_ID = PROJECT_ID[0]
# Region used when creating the buckets below.
REGION = "us-central1"
# Export the project ID as the GOOGLE_CLOUD_PROJECT environment variable.
%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}
#BUCKET_NAME=f'dataflow_demo_{PROJECT_ID}'

In [None]:
# Checkpoint identifier: use the exact model you plan to train
# (bert-base, roberta, etc.).
MODEL_NAME = "google-bert/bert-base-multilingual-cased"

# Path of the dataset file (presumably the NER training data; not read in
# the cells shown here).
DATASET = "./train_bert_ner_1k.txt"

# Project-specific bucket, as a bare bucket name and as a gs:// URI.
OUTPUT_BUCKET_NAME = "bert-finetuning-ner-{}".format(PROJECT_ID)
gcs_bucket = "gs://{}".format(OUTPUT_BUCKET_NAME)

In [None]:
# Create the OUTPUT_BUCKET_NAME bucket in REGION (gsutil reports an error if
# the bucket already exists).
!gsutil mb -l {REGION} gs://{OUTPUT_BUCKET_NAME}

In [None]:
import os
from google.cloud import aiplatform

# Initialise the Vertex AI SDK. The PROJECT_ID / LOCATION environment
# variables are honoured when present, but this notebook only exports
# GOOGLE_CLOUD_PROJECT, so fall back to the PROJECT_ID and REGION values
# defined above instead of silently passing None.
aiplatform.init(
    project=os.getenv("PROJECT_ID", PROJECT_ID),
    location=os.getenv("LOCATION", REGION),
)

#### Utility methods

#### Inference pipeline based on Apache Beam
This pipeline can be orchestrated with Cloud Composer or Vertex AI Pipelines.

In [None]:
# Project-specific bucket used by the Beam/Dataflow pipeline.
DATAFLOW_BUCKET = "bert-ner-demo-io-storage-{}".format(PROJECT_ID)

# gs:// prefixes inside that bucket for output, temporary and staging files.
OUTPUT_GCS_BUCKET = "gs://{}/output/".format(DATAFLOW_BUCKET)
TEMP_LOCATION = "gs://{}/temp/".format(DATAFLOW_BUCKET)
STAGING_LOCATION = "gs://{}/staging/".format(DATAFLOW_BUCKET)

In [None]:
# Create the DATAFLOW_BUCKET bucket in REGION; the output, temp and staging
# locations defined above all live under it.
!gsutil mb -l {REGION} gs://{DATAFLOW_BUCKET}

In [None]:
import json
import argparse
import re
import os
from apache_beam import Create
from typing import Tuple, Iterable
from google.cloud import storage
from apache_beam.ml.inference import RunInference
from apache_beam.ml.inference.base import PredictionResult, KeyedModelHandler, ModelHandler
from apache_beam.ml.inference.huggingface_inference import HuggingFacePipelineModelHandler, PipelineTask
from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam

from transformers import BertTokenizerFast, BertForTokenClassification, pipeline

class BertNerPipelineHandler(ModelHandler):
    """Beam ``ModelHandler`` that runs a BERT token-classification pipeline.

    The model and tokenizer files are downloaded from a GCS bucket into a
    local folder the first time ``load_model`` is called, then wrapped in a
    Hugging Face ``"ner"`` pipeline with ``aggregation_strategy="first"``.
    """

    def __init__(self, model_path):
        """Store the model location; nothing is downloaded yet.

        Args:
            model_path: Name of the GCS bucket holding the model files. It is
                passed to ``storage.Client().bucket(...)``, so it must be a
                bare bucket name rather than a ``gs://`` URI.
        """
        super().__init__()
        self.model_path = model_path
        self.pipeline = None
        self.local_folder = "./tmp_ner_finetuned"

    def _download_model_files(self):
        """Copy every object of the model bucket into ``self.local_folder``."""
        os.makedirs(self.local_folder, exist_ok=True)
        bucket = storage.Client().bucket(self.model_path)
        for blob in bucket.list_blobs():
            file_name = os.path.basename(blob.name)
            if not file_name:
                # Object names ending in "/" are folder placeholders; their
                # basename is empty, so the download target would be the
                # local folder itself and the download would fail.
                continue
            # NOTE(review): objects are flattened into one folder, so two
            # objects sharing a basename overwrite each other - confirm the
            # bucket layout.
            blob.download_to_filename(
                os.path.join(self.local_folder, file_name))

    def load_model(self):
        """Loads the Transformer pipeline model.

        Returns:
            The Hugging Face NER pipeline; it is built on the first call and
            the cached instance is returned afterwards.
        """
        if self.pipeline is None:
            self._download_model_files()
            # Load the fine-tuned model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained(self.local_folder)
            model = BertForTokenClassification.from_pretrained(
                self.local_folder)
            self.pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="first",
            )
        return self.pipeline

    def run_inference(self, batch, model, inference_args=None):
        """Runs inference on a batch of inputs.

        Args:
            batch: Inputs handed to the pipeline in a single call.
            model: The pipeline returned by ``load_model``.
            inference_args: Accepted for interface compatibility; not used.

        Returns:
            A list with one ``PredictionResult(example, inference)`` per
            input, pairing each input with its prediction.
        """
        predictions = model(batch)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]

    def get_num_bytes(self, batch):
        """Returns the estimated size of the batch.

        The estimate is the total UTF-8 encoded length of ``str(x)`` for
        every element of ``batch``.
        """
        return sum(len(str(x).encode('utf-8')) for x in batch)

# The handler receives the bare bucket name (not a gs:// URI): load_model()
# passes it to storage.Client().bucket() to download the model files.
model_handler = BertNerPipelineHandler(OUTPUT_BUCKET_NAME)

def mask_sensetive_data(predictions):
    """Replace every detected entity span with a ``[ENTITY_GROUP]`` tag.

    Redaction is driven by the character offsets of each prediction, so only
    the detected spans are replaced. Replacing by substring instead would
    also rewrite unrelated occurrences of the same characters (an entity
    "Al" would turn "Alice" into "[PER]ice").

    Args:
        predictions: Object with ``example`` (the input text) and
            ``inference`` (an iterable of dicts carrying ``start``, ``end``
            and ``entity_group``).

    Returns:
        ``{"text": <original text>, "redacted_text": <masked text>}``.
    """
    text = predictions.example

    # Clamp offsets to the text, like slicing does, and order spans left to
    # right so the output can be assembled in a single pass.
    spans = sorted(
        (
            (
                max(0, int(mask['start'])),
                min(len(text), int(mask['end'])),
                mask['entity_group'],
            )
            for mask in predictions.inference
        ),
        key=lambda span: (span[0], span[1]),
    )

    pieces = []
    cursor = 0
    for start, end, label in spans:
        # Skip empty spans and spans overlapping an already-masked region.
        if end <= start or start < cursor:
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        cursor = end
    pieces.append(text[cursor:])

    return {"text": text, "redacted_text": "".join(pieces)}

def extract_entity(predictions):
    """Group the detected entity texts by entity group.

    Args:
        predictions: Object with ``example`` (the input text) and
            ``inference`` (an iterable of dicts carrying ``start``, ``end``
            and ``entity_group``).

    Returns:
        ``{"text": <original text>, "extracted": {group: [entity, ...]}}``.
        Entities keep the order of ``predictions.inference``; predictions
        whose span is empty are skipped.
    """
    text = predictions.example
    entity_dict = {}
    for mask in predictions.inference:
        entity_text = text[int(mask['start']):int(mask['end'])]
        if entity_text:
            entity_dict.setdefault(mask['entity_group'], []).append(entity_text)

    return {"text": text, "extracted": entity_dict}

# `ib` must be imported before ib.show() can be called, and ib.show() only
# works on pipelines created with the InteractiveRunner.
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner

p = beam.Pipeline(InteractiveRunner(), options=PipelineOptions())

# Sample sentences used to try the pipeline out in the notebook.
elements = [
    "Send the invoice to 123 Main Street, New York. Contact John Connnor at johnconnor@example.com.",
    "My name is Jessica Williams.",
]

pp = (
    p
    | "Create elements" >> Create(elements)
    | "RunInference" >> RunInference(model_handler)
    # Swap extract_entity for mask_sensetive_data to redact instead of extract.
    | "Mask Data" >> beam.Map(extract_entity)
    | "Format Output" >> beam.Map(json.dumps)
)

# Display the resulting JSON strings in the notebook.
ib.show(pp)

In [None]:
import json
import argparse
import re
import os
from apache_beam import Create
from typing import Tuple, Iterable
from google.cloud import storage
from apache_beam.ml.inference import RunInference
from apache_beam.ml.inference.base import PredictionResult, KeyedModelHandler, ModelHandler
from apache_beam.ml.inference.huggingface_inference import HuggingFacePipelineModelHandler, PipelineTask
from apache_beam.ml.inference.vertex_ai_inference import VertexAIModelHandlerJSON
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam

from transformers import BertTokenizerFast, BertForTokenClassification, pipeline

class BertNerPipelineHandler(ModelHandler):
    """Beam ``ModelHandler`` that runs a BERT token-classification pipeline.

    The model and tokenizer files are downloaded from a GCS bucket into a
    local folder the first time ``load_model`` is called, then wrapped in a
    Hugging Face ``"ner"`` pipeline with ``aggregation_strategy="first"``.
    """

    def __init__(self, model_path):
        """Store the model location; nothing is downloaded yet.

        Args:
            model_path: Name of the GCS bucket holding the model files. It is
                passed to ``storage.Client().bucket(...)``, so it must be a
                bare bucket name rather than a ``gs://`` URI.
        """
        super().__init__()
        self.model_path = model_path
        self.pipeline = None
        self.local_folder = "./tmp_ner_finetuned"

    def _download_model_files(self):
        """Copy every object of the model bucket into ``self.local_folder``."""
        os.makedirs(self.local_folder, exist_ok=True)
        bucket = storage.Client().bucket(self.model_path)
        for blob in bucket.list_blobs():
            file_name = os.path.basename(blob.name)
            if not file_name:
                # Object names ending in "/" are folder placeholders; their
                # basename is empty, so the download target would be the
                # local folder itself and the download would fail.
                continue
            # NOTE(review): objects are flattened into one folder, so two
            # objects sharing a basename overwrite each other - confirm the
            # bucket layout.
            blob.download_to_filename(
                os.path.join(self.local_folder, file_name))

    def load_model(self):
        """Loads the Transformer pipeline model.

        Returns:
            The Hugging Face NER pipeline; it is built on the first call and
            the cached instance is returned afterwards.
        """
        if self.pipeline is None:
            self._download_model_files()
            # Load the fine-tuned model and tokenizer
            tokenizer = BertTokenizerFast.from_pretrained(self.local_folder)
            model = BertForTokenClassification.from_pretrained(
                self.local_folder)
            self.pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="first",
            )
        return self.pipeline

    def run_inference(self, batch, model, inference_args=None):
        """Runs inference on a batch of inputs.

        Args:
            batch: Inputs handed to the pipeline in a single call.
            model: The pipeline returned by ``load_model``.
            inference_args: Accepted for interface compatibility; not used.

        Returns:
            A list with one ``PredictionResult(example, inference)`` per
            input, pairing each input with its prediction.
        """
        predictions = model(batch)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]

    def get_num_bytes(self, batch):
        """Returns the estimated size of the batch.

        The estimate is the total UTF-8 encoded length of ``str(x)`` for
        every element of ``batch``.
        """
        return sum(len(str(x).encode('utf-8')) for x in batch)

# The handler receives the bare bucket name (not a gs:// URI): load_model()
# passes it to storage.Client().bucket() to download the model files.
model_handler = BertNerPipelineHandler(OUTPUT_BUCKET_NAME)

def mask_sensetive_data(predictions):
    """Replace every detected entity span with a ``[ENTITY_GROUP]`` tag.

    Redaction is driven by the character offsets of each prediction, so only
    the detected spans are replaced. Replacing by substring instead would
    also rewrite unrelated occurrences of the same characters (an entity
    "Al" would turn "Alice" into "[PER]ice").

    Args:
        predictions: Object with ``example`` (the input text) and
            ``inference`` (an iterable of dicts carrying ``start``, ``end``
            and ``entity_group``).

    Returns:
        ``{"text": <original text>, "redacted_text": <masked text>}``.
    """
    text = predictions.example

    # Clamp offsets to the text, like slicing does, and order spans left to
    # right so the output can be assembled in a single pass.
    spans = sorted(
        (
            (
                max(0, int(mask['start'])),
                min(len(text), int(mask['end'])),
                mask['entity_group'],
            )
            for mask in predictions.inference
        ),
        key=lambda span: (span[0], span[1]),
    )

    pieces = []
    cursor = 0
    for start, end, label in spans:
        # Skip empty spans and spans overlapping an already-masked region.
        if end <= start or start < cursor:
            continue
        pieces.append(text[cursor:start])
        pieces.append(f"[{label}]")
        cursor = end
    pieces.append(text[cursor:])

    return {"text": text, "redacted_text": "".join(pieces)}


def extract_entity(predictions):
    """Walk the predicted entity spans and return the input text as a row.

    The entity texts are grouped per entity group while iterating, but the
    grouping is not part of the result: only the ``text`` key is returned.

    Args:
        predictions: Object with ``example`` (the input text) and
            ``inference`` (an iterable of dicts carrying ``start``, ``end``
            and ``entity_group``).

    Returns:
        ``{"text": <original text>}``.
    """
    source_text = predictions.example
    grouped = {}
    for entity in predictions.inference:
        begin, finish = int(entity['start']), int(entity['end'])
        span_text = source_text[begin:finish]
        if not span_text:
            continue
        grouped.setdefault(entity['entity_group'], []).append(span_text)

    return {"text": source_text}

# Optional explicit Beam PipelineOptions. Uses the REGION variable defined
# earlier in the notebook; switch runner to "DataflowRunner" to run the job
# on Dataflow.
# pipeline_options = PipelineOptions(
#     runner="DirectRunner",
#     project=PROJECT_ID,
#     region=REGION,
#     temp_location=TEMP_LOCATION,
#     staging_location=STAGING_LOCATION,
#     job_name="bert-demo-ner-inference",
#     save_main_session=True,
# )

p = beam.Pipeline(options=PipelineOptions())

# Destination table in DATASET.TABLE form.
# NOTE(review): CREATE_IF_NEEDED creates the table, not the dataset - confirm
# the `genai` dataset exists.
output_table = 'genai.ner_pipeline'

# Sample sentences used to try the pipeline out in the notebook.
elements = [
    "Send the invoice to 123 Main Street, New York. Contact John Connnor at johnconnor@example.com.",
    "My name is Jessica Williams.",
]

pp = (
    p
    # Sketch of a file-based input, kept for reference:
    # | 'Init' >> beam.Create([input_json_file])
    # | "Read File Content" >> beam.io.fileio.ReadMatches()
    # | "Extract Content" >> beam.Map(lambda file: file.read_utf8())
    # | "Parse JSON" >> beam.Map(json.loads)
    # | "To Records" >> beam.FlatMap()
    # | "Extract text" >> beam.Map(lambda x: x["text"])
    | "Create elements" >> Create(elements)
    | "RunInference" >> RunInference(model_handler)
    | "Postprocess" >> beam.Map(extract_entity)
    # WriteToBigQuery expects each row as a dict keyed by column name, so the
    # dicts from extract_entity are written as-is; serialising them with
    # json.dumps first would hand the sink plain strings.
    | "Write BigQuery" >> beam.io.WriteToBigQuery(
        table=output_table,
        schema='text:STRING',
        method=beam.io.WriteToBigQuery.Method.STREAMING_INSERTS,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    )
)

# Run the pipeline and block until it completes.
p.run().wait_until_finish()