# Stream generator Dataflow -> PubSub

#### Install Required Libraries

In [None]:
%pip install --quiet faker

In [None]:
import logging
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam as beam
from apache_beam import Create, FlatMap, Map, ParDo, Filter, Flatten, Partition, MapTuple, FlatMapTuple
from apache_beam import Keys, Values
from apache_beam.transforms.util import WithKeys
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.gemini_inference import GeminiModelHandler, generate_from_string
from collections.abc import Callable
from collections.abc import Iterable
from collections.abc import Sequence
from typing import Any
from typing import Optional
from google import genai
from google.genai import errors
from apache_beam.ml.inference import utils
from apache_beam.ml.inference.base import PredictionResult
from apache_beam.ml.inference.base import RemoteModelHandler
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.runners import DataflowRunner

In [None]:
# Resolve the active gcloud project. The `!` shell capture yields the command's
# output as a list of lines, so the first line is taken as the project ID.
# NOTE(review): nothing guards the `[0]` below — confirm what happens when
# gcloud has no project configured.
PROJECT_ID = !gcloud config list --format 'value(core.project)'
PROJECT_ID = PROJECT_ID[0]
REGION = "us-central1"
# Export the project ID to the environment as GOOGLE_CLOUD_PROJECT.
%env GOOGLE_CLOUD_PROJECT={PROJECT_ID}
# Project-specific bucket name; used by the `gsutil mb` cell that follows.
BUCKET_NAME=f'dataflow_demo_{PROJECT_ID}'

In [None]:
# Create the Cloud Storage bucket BUCKET_NAME in REGION.
# NOTE(review): the pipeline cells visible in this notebook stage to
# DATAFLOW_BUCKET instead — confirm this bucket is still needed.
!gsutil mb -l {REGION} gs://{BUCKET_NAME}

#### Utility methods

In [None]:
# Bucket that holds the Dataflow job's output, temp and staging prefixes.
DATAFLOW_BUCKET=f"bert-ner-demo-io-storage-{PROJECT_ID}" 
# NOTE(review): OUTPUT_GCS_BUCKET is not referenced by the cells visible here.
OUTPUT_GCS_BUCKET = f"gs://{DATAFLOW_BUCKET}/output/"
# Passed to the Dataflow pipeline options as temp_location / staging_location.
TEMP_LOCATION=f"gs://{DATAFLOW_BUCKET}/temp/"
STAGING_LOCATION=f"gs://{DATAFLOW_BUCKET}/staging/"
# Fully qualified Pub/Sub topic path that the generator pipeline writes to.
OUTPUT_TOPIC=f"projects/{PROJECT_ID}/topics/input_messages"

In [None]:
# Create the bucket that backs TEMP_LOCATION and STAGING_LOCATION.
!gsutil mb -l {REGION} gs://{DATAFLOW_BUCKET}

In [None]:
%%writefile ./requirements.txt
faker

In [None]:
import argparse
import json
import logging
import random
import uuid
import time
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.periodicsequence import PeriodicImpulse
from apache_beam.utils.timestamp import Timestamp
from apache_beam.io import WriteToPubSub
from apache_beam import Create

# Sentence templates for the synthetic messages. Each entry is a `str.format`
# template; the placeholders used are {NAME}, {ADDRESS}, {EMAIL} and
# {PHONE_NUMBER}, which GenerateRandomMessage.process fills with generated values.
# NOTE(review): the space before the closing punctuation looks deliberate
# (it keeps the substituted value as a separate token) — confirm before tidying.
TEMPLATES = [
    "The meeting will be hosted by {NAME} at {ADDRESS} .",
    "Please contact {NAME} at {EMAIL} for more details.",
    "My office is located at {ADDRESS} .",
    "You can reach us at {PHONE_NUMBER} or send a mail to {EMAIL} .",
    "{NAME} changed their phone number to {PHONE_NUMBER} yesterday.",
    "Ship the package to {ADDRESS} immediately.",
    "Is {EMAIL} the correct address for {NAME} ?",
    "Write down this address: {ADDRESS} .",
    "Call {PHONE_NUMBER} and ask for {NAME} .",
]

class GenerateRandomMessage(beam.DoFn):
    """Emit one synthetic, JSON-encoded message per input element.

    The input element only acts as a trigger; its value is ignored. For each
    element a template is picked at random, its placeholders are filled with
    Faker-generated values, and the result is yielded as UTF-8 encoded JSON
    bytes of the form ``{"id": "<uuid4>", "message": "<text>"}``.

    Args:
        templates: Optional iterable of ``str.format`` templates (a single
            string is treated as one template). Templates may reference any
            of ``{NAME}``, ``{ADDRESS}``, ``{EMAIL}`` and ``{PHONE_NUMBER}``.
            Defaults to the module-level ``TEMPLATES``.

    Raises:
        ValueError: If ``templates`` is given but contains no templates.
    """

    def __init__(self, templates=None):
        super().__init__()
        chosen = TEMPLATES if templates is None else templates
        if isinstance(chosen, str):
            # A bare string would otherwise be split into single characters.
            chosen = (chosen,)
        # Snapshot the templates on the instance so the DoFn carries its own
        # copy instead of reading a module-level global while processing.
        self._templates = tuple(chosen)
        if not self._templates:
            # Fail at construction time rather than on the first element.
            raise ValueError("templates must contain at least one template")
        # The Faker instance is created in setup(), not here.
        self.fake = None

    def setup(self):
        """Create the Faker instance used by ``process``."""
        # Import here to ensure pickling works on Dataflow workers
        from faker import Faker
        self.fake = Faker()

    def process(self, element):
        """Yield one random message as UTF-8 JSON bytes.

        Args:
            element: Ignored; every call produces exactly one output.

        Yields:
            bytes: ``json.dumps`` of a dict with keys ``"id"`` and
            ``"message"``, encoded as UTF-8.
        """
        # 1. Pick a random template
        template = random.choice(self._templates)

        # 2. Generate data for all potential placeholders.
        # Faker addresses are multi-line; join the lines for cleaner sentences.
        clean_address = self.fake.address().replace('\n', ', ')
        replacements = {
            "NAME": self.fake.name(),
            "ADDRESS": clean_address,
            "EMAIL": self.fake.email(),
            "PHONE_NUMBER": self.fake.phone_number(),
        }

        # 3. Fill the template; placeholders it does not use are ignored.
        message_text = template.format(**replacements)

        # 4. Construct the payload
        payload = {
            "id": str(uuid.uuid4()),
            "message": message_text,
        }

        # 5. Log for local debugging. Lazy %-args avoid building the string
        # on every element when INFO logging is disabled.
        logging.info("Generated: %s", payload)

        # 6. Return as bytes (Required for Pub/Sub)
        yield json.dumps(payload).encode("utf-8")

# Synthetic message stream: PeriodicImpulse ticks -> GenerateRandomMessage -> Pub/Sub.
# The pipeline object is created with the InteractiveRunner so it can be
# previewed in the notebook; the same object is submitted to Dataflow below.
p = beam.Pipeline(InteractiveRunner())

# Built separately from the Pub/Sub write so that `messages` refers to the
# generated payloads themselves rather than to the write transform's result.
messages = (
    p
    # Generate a continuous stream of ticks, one every 0.5 seconds.
    | "GenerateTicks" >> PeriodicImpulse(
        start_timestamp=Timestamp.now(),
        stop_timestamp=Timestamp.of(2147483647),  # Run practically forever
        fire_interval=0.5,
        apply_windowing=True,
    )
    # Map the ticks to our random message generator
    | "CreateMessage" >> beam.ParDo(GenerateRandomMessage())
)

# Write the result to Pub/Sub
messages | "WritePubSub" >> WriteToPubSub(topic=OUTPUT_TOPIC)

# Uncomment if you want to preview the generated messages with the Interactive Runner:
# ib.show(messages)

requirements_file = "./requirements.txt"

# Dataflow submission. Comment out everything below if you only want the
# interactive preview.
options = pipeline_options.PipelineOptions(
    flags=[],  # no command-line flags; everything is passed as keyword options
    # The source is unbounded and the sink is Pub/Sub, so request a streaming job.
    streaming=True,
    project=PROJECT_ID,
    region=REGION,
    staging_location=STAGING_LOCATION,
    temp_location=TEMP_LOCATION,
    machine_type='n1-standard-4',
    max_num_workers=1,
    # Installs the packages listed in requirements.txt (faker) on the workers.
    requirements_file=requirements_file,
    disk_size_gb=50,
)

pipeline_result = DataflowRunner().run_pipeline(p, options=options)
# NOTE: for this never-ending stream the call below is expected to block until
# the job is cancelled or drained.
pipeline_result.wait_until_finish()