# Project Setup

In [66]:
import kfp
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.v1 import bigquery as gcc_bq

from kfp.v2 import compiler
from kfp.v2.dsl import component
import google.cloud.aiplatform as aip

In [59]:
# Google Cloud project and dataset identifiers used by this notebook.
project_id = "curious-skyline-360213"
dataset_id = "CX"
# Cloud Storage path used both as the pipeline root and as the Vertex AI staging bucket.
pipeline_root_path = "gs://curious-skyline/Projects/CX/CXpipeline"

In [60]:
from google.cloud import bigquery
# BigQuery client for this project, created with us-west1 as its location.
client = bigquery.Client(location="us-west1", project=project_id)

In [61]:
import datetime

In [62]:
# Initialize the Vertex AI SDK for Python with the project and staging bucket.
# NOTE(review): no `location` is passed here, and the captured job output in this
# notebook shows runs created in us-central1 -- confirm that region is intended.
aip.init(project=project_id, staging_bucket=pipeline_root_path)

# Pipeline from components
Reference: [Google Cloud Pipeline Components list](https://cloud.google.com/vertex-ai/docs/pipelines/gcpc-list)

## Hourly staging tables

In [32]:
# Hour labels for 0-23 as two-character strings: single digits are padded with a
# leading space (' 0' ... ' 9'), double digits are unchanged ('10' ... '23').
hours = [f"{hour_number:>2}" for hour_number in range(24)]

In [106]:
def generate_query(query_hour,
                   project="curious-skyline-360213",
                   dataset="CX",
                   source_table="CX_Stream_hour_partition"):
    """Build the SQL and BigQuery job options for one hourly staging table.

    The query selects the survey columns from the streaming table and keeps only
    rows whose ``publish_time`` hour equals ``query_hour``. The job options send
    the result to ``CX_Data_<HH>`` (zero-padded hour) and overwrite its contents.

    Args:
        query_hour: Hour of day, 0-23. Accepts an int (``5``) or a string in
            space-padded (``' 5'``), zero-padded (``'05'``) or plain (``'5'``) form.
        project: Project that holds both the source and destination tables.
        dataset: Dataset that holds both the source and destination tables.
        source_table: Name of the streaming table to read from.

    Returns:
        tuple: ``(query, query_options_dict)`` where ``query`` is the SQL text and
        ``query_options_dict`` follows the BigQuery ``JobConfigurationQuery`` shape.

    Raises:
        ValueError: If ``query_hour`` is not an integer hour between 0 and 23.
    """
    # Normalise first so a malformed value can never reach the SQL text or table id.
    try:
        hour_number = int(str(query_hour).strip())
    except ValueError:
        raise ValueError(
            f"query_hour must be an hour from 0 to 23, got {query_hour!r}"
        ) from None
    if not 0 <= hour_number <= 23:
        raise ValueError(
            f"query_hour must be an hour from 0 to 23, got {query_hour!r}"
        )

    # FORMAT_TIMESTAMP('%k', ...) yields a space-padded hour (' 0'..'23'), so the
    # comparison literal must be space-padded; the table id uses zero padding.
    hour_label = f"{hour_number:>2}"
    table_suffix = f"{hour_number:02d}"

    query = f"""
    SELECT
    Market,
    Ministry,
    Department,
    Type_Of_Visit,
    Survey_Completion_Date,
    Survey_Type,
    Survey_Project,
    Survey_ID,
    Gender,
    Age_Group,
    LTR_Facility,
    LTR_Doctor,
    Anything_Outstanding,
    Improve_Stay,
    publish_time,
    FORMAT_TIMESTAMP('%k', publish_time) AS publish_hour_UTC

    FROM `{project}.{dataset}.{source_table}`

    WHERE FORMAT_TIMESTAMP('%k', publish_time) = '{hour_label}'
    """

    # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationQuery
    # Optional, currently disabled: add
    #   "timePartitioning": {"type": "HOUR", "field": "publish_time"}
    # to partition the destination table by publish_time.
    query_options_dict = {
        "destinationTable": {
            "projectId": project,
            "datasetId": dataset,
            "tableId": f"CX_Data_{table_suffix}",
        },
        "writeDisposition": "WRITE_TRUNCATE",  # overwrite the existing data
    }

    return query, query_options_dict

In [107]:
for hour in hours:
    # Zero-padded hour tag (e.g. ' 7' -> '07'), shared by the pipeline name and
    # the compiled package file name.
    hour_tag = hour.replace(' ', '0')
    query, query_options_dict = generate_query(hour)

    # Move streaming data to this hour's table with a single BigQuery query job.
    @kfp.dsl.pipeline(
        name=f"cx-pipeline-hour-{hour_tag}",
        description="A simple CX pipeline",
        pipeline_root=pipeline_root_path,
    )
    def pipeline(project_id: str):
        gcc_bq.BigqueryQueryJobOp(
            project=project_id,
            location="us-west1",
            query=query,
            job_configuration_query=query_options_dict,
        )

    # Compile inside the loop so each hour gets its own pipeline spec file.
    compiler.Compiler().compile(
        pipeline_func=pipeline,
        package_path=f'cx-pipeline-hour_{hour_tag}.json',
    )

### Run Pipeline

In [31]:
# Submit the compiled hour-20 pipeline spec as a Vertex AI pipeline job.
hourly_job_parameters = {'project_id': project_id}

job = aip.PipelineJob(
    display_name="cx-pipeline",
    template_path="cx-pipeline-hour_20.json",
    pipeline_root=pipeline_root_path,
    parameter_values=hourly_job_parameters,
)

job.submit()

Creating PipelineJob


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


PipelineJob created. Resource name: projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-20220826215043


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-20220826215043


To use this PipelineJob in another session:


INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:


pipeline_job = aiplatform.PipelineJob.get('projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-20220826215043')


INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-20220826215043')


View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/cx-pipeline-20220826215043?project=348611359036


INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/cx-pipeline-20220826215043?project=348611359036


## Data Generation

In [95]:
@component(output_component_file="fake_data_stream.yaml",
           base_image="python:3.7",
           packages_to_install=["google-cloud-pubsub", "essential_generators"])
def simulate_stream(n_minutes: int, n_burst: int, topic_name: str):
    """Publish bursts of synthetic CX survey messages to a Pub/Sub topic.

    Publishes ``n_burst`` JSON-encoded messages, sleeps a random 1-30 seconds,
    and repeats until ``n_minutes`` minutes have elapsed.

    Args:
        n_minutes: How long to keep publishing, in minutes.
        n_burst: Number of messages published in each burst.
        topic_name: Pub/Sub topic (within the project set below) to publish to.
    """
    # Imports stay inside the function body, as in the original component.
    import datetime
    import json
    import random
    import time

    from essential_generators import DocumentGenerator
    from google.cloud import pubsub_v1

    publisher = pubsub_v1.PublisherClient()
    gen = DocumentGenerator()

    project_id = "curious-skyline-360213"
    # Use the caller-supplied topic; it was previously overwritten by a
    # hard-coded name, which silently ignored the topic_name argument.
    topic_path = f"projects/{project_id}/topics/{topic_name}"

    def generate_message():
        """Return one randomly generated survey record as a dict."""
        return {
            # Enterprise hierarchy
            # (reporting hierarchy is to be joined from a separate source once
            # the data is processed)
            'Market': random.choice(['TN', 'TX', 'MI', 'FL']),
            'Ministry': random.choice(['TNNAS', 'TXAUS', 'MIDET', 'FLJAC']),
            'Department': random.randint(10000, 30000),
            # Touchpoint metadata
            'Type_Of_Visit': random.choice(['In Person', 'Virtual']),
            # Survey metadata
            'Survey_Completion_Date': datetime.date.today().strftime('%m-%d-%Y'),
            'Survey_Type': random.choice(['OAS', 'AMG', 'HCAHPS', 'ED']),
            'Survey_Project': random.randint(10000, 30000),
            'Survey_ID': random.randint(10000, 30000),
            # Patient demographics
            'Gender': random.choice(['Male', 'Female']),
            'Age_Group': random.choice(['18-24', '25-29', '30-34', '35-39']),
            # Survey questions
            'LTR_Facility': random.randint(1, 10),
            'LTR_Doctor': random.randint(1, 10),
            'Anything_Outstanding': gen.gen_sentence(),
            'Improve_Stay': gen.gen_sentence(),
        }

    t_end = time.time() + 60 * n_minutes
    total_messages = 0
    while time.time() < t_end:
        # Send a burst of n_burst messages.
        publish_futures = [
            publisher.publish(
                topic=topic_path,
                data=json.dumps(generate_message()).encode("utf-8"))
            for _ in range(n_burst)
        ]
        # Wait for each publish so a failure raises here instead of being dropped.
        for publish_future in publish_futures:
            publish_future.result()
        total_messages += len(publish_futures)

        # Wait for some time from 1 to 30 seconds before the next burst.
        time.sleep(random.randint(1, 30))

    print(f'Published {total_messages} messages.')

In [96]:
# Generate synthetic survey traffic by running the simulate_stream component.
@kfp.dsl.pipeline(
    name="cx-pipeline-fake-data",
    description="Fake data generator",
    pipeline_root=pipeline_root_path)
def pipeline(project_id: str,
             n_minutes: int,
             n_burst: int,
             topic_name: str = "CX_schema_json"):
    # project_id is accepted as a pipeline parameter but not forwarded:
    # simulate_stream takes no project argument.
    # Keyword arguments are used because newer KFP SDK releases reject
    # positional arguments when instantiating a component.
    simulate_stream(n_minutes=n_minutes,
                    n_burst=n_burst,
                    topic_name=topic_name)

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path='cx-pipeline_fake_data.json'
)
        



In [97]:
# Run the fake-data pipeline with n_minutes=1 and n_burst=10.
# topic_name is not set here, so the pipeline's default value applies.
fake_data_parameters = {
    'project_id': project_id,
    'n_minutes': 1,
    'n_burst': 10,
}

job = aip.PipelineJob(
    display_name="cx-pipeline-fake-data",
    template_path='cx-pipeline_fake_data.json',
    pipeline_root=pipeline_root_path,
    parameter_values=fake_data_parameters,
)

job.submit()

Creating PipelineJob


INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob


PipelineJob created. Resource name: projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-fake-data-20220829164351


INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-fake-data-20220829164351


To use this PipelineJob in another session:


INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:


pipeline_job = aiplatform.PipelineJob.get('projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-fake-data-20220829164351')


INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-fake-data-20220829164351')


View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/cx-pipeline-fake-data-20220829164351?project=348611359036


INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/cx-pipeline-fake-data-20220829164351?project=348611359036
