## Project Setup

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
# GCP project passed to the BigQuery and Cloud Storage clients in this notebook
project_id = "curious-skyline-360213"
# Dataset name (the queries further down spell out "CX" directly rather than reading this value)
dataset_id = "CX"
# GCS prefix used as pipeline_root when the pipeline is compiled and submitted
pipeline_root_path = "gs://curious-skyline/Projects/CX/CXpipeline"

In [3]:
from google.cloud import bigquery
# Notebook-level BigQuery client (default job location us-west1); load_data queries through it
bq_client = bigquery.Client(location="us-west1", project=project_id)

In [4]:
from google.cloud import language_v1

# Notebook-level Natural Language API client; get_doc_sentiment calls analyze_sentiment on it
client = language_v1.LanguageServiceClient()
# NOTE(review): encoding_type is not passed to any request in the cells shown here
encoding_type = language_v1.EncodingType.UTF8

In [5]:
import kfp
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from google_cloud_pipeline_components.v1 import bigquery as gcc_bq

from kfp.v2 import compiler
from kfp.v2.dsl import component
import google.cloud.aiplatform as aip

In [6]:
from typing import NamedTuple

In [7]:
from google_cloud_pipeline_components.v1.batch_predict_job import ModelBatchPredictOp
from google_cloud_pipeline_components.v1.model import ModelUploadOp as model_upload_op
from kfp.v2.components import importer_node

In [8]:
from datetime import datetime

## Import data for previous two hours

In [6]:
# Hour-of-day labels 0..23, right-aligned to width two (' 0', ' 1', ..., '23')
hours = [f'{hour:2d}' for hour in range(24)]

In [18]:
# Preview the consecutive (even hour, odd hour) label pairs
# NOTE(review): the loop variable `pair` stays bound at module level afterwards,
# holding the last tuple ('22', '23')
for pair in zip(hours[: :2], hours[1: :2]):
    print(pair)

(' 0', ' 1')
(' 2', ' 3')
(' 4', ' 5')
(' 6', ' 7')
(' 8', ' 9')
('10', '11')
('12', '13')
('14', '15')
('16', '17')
('18', '19')
('20', '21')
('22', '23')


In [61]:
def load_data(hour_pair):
    """
    Load the rows of two hourly BigQuery tables into one DataFrame.

    hour_pair = two hour labels such as (' 0', ' 1'); a space in a label is
                replaced by '0' to form the table suffix (CX_Data_00, CX_Data_01)

    Returns the rows of the first table followed by the rows of the second,
    re-indexed 0..n-1 so that every row has a unique index label.
    """
    frames = []
    # Read the hours from the argument (not from a module-level loop variable)
    for hour in (hour_pair[0], hour_pair[1]):
        query = f"""
        SELECT 
        *

        FROM `curious-skyline-360213.CX.CX_Data_{hour.replace(' ','0')}`

        """
        query_job = bq_client.query(query)
        frames.append(query_job.to_dataframe())

    # pd.concat replaces DataFrame.append; ignore_index avoids duplicate labels,
    # which would multiply rows in later index-based merges
    return pd.concat(frames, ignore_index=True)

In [62]:
# Request the hour pair (' 0', ' 1') from load_data and display the result
df_data = load_data((' 0', ' 1'))
df_data

Unnamed: 0,Market,Ministry,Department,Type_Of_Visit,Survey_Completion_Date,Survey_Type,Survey_Project,Survey_ID,Gender,Age_Group,LTR_Facility,LTR_Doctor,Anything_Outstanding,Improve_Stay,publish_hour_UTC
0,TX,MIDET,28308,In Person,08-26-2022,HCAHPS,18590,22148,Female,18-24,1,2,Awarded as recounting their personal opinions ...,"Brought high (Ellis, 2007, p. 4). Relational e...",0
1,TN,MIDET,25540,In Person,08-26-2022,AMG,17610,16793,Female,18-24,1,3,Races 1.7% prove useful,From −9 of residual heat from the Greek,0
2,MI,TNNAS,13285,In Person,08-26-2022,AMG,29275,17373,Female,18-24,1,9,To newly-built missions which the lithosphere ...,"Lower can about approaches, scientific method ...",0
3,FL,MIDET,12688,Virtual,08-26-2022,ED,24403,27074,Female,18-24,1,4,When taken highly specialized. Peer review doe...,Elsewhere across anticipated events,0
4,TX,TNNAS,14117,Virtual,08-26-2022,OAS,18980,20761,Female,18-24,1,2,Election with of robots' limbs. It would be in...,Amsterdam: Nijgh 2.0%. Hispanics,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,MI,FLJAC,23713,In Person,08-26-2022,ED,14736,29060,Female,35-39,10,2,"Faking the year. Lori Ann Wagner, a psychother...","Of communist networks represented, along with ...",0
236,TN,TXAUS,12121,In Person,08-26-2022,HCAHPS,15496,14659,Male,35-39,10,9,"Problems. Evolutionary Norfolk. Suffolk, which...","The fifth 1993, which provided for by Law and ...",0
237,FL,TNNAS,15268,In Person,08-26-2022,AMG,25419,12410,Female,35-39,10,6,Catalogued over discussed by a young age to he...,"Art, exhibited It started when Archduke Franz ...",0
238,TN,FLJAC,20505,Virtual,08-26-2022,OAS,26318,26797,Female,35-39,10,6,Placed secession terrorism in Argentina and Br...,"Revenue than Francisco, CA: Jossey-Bass. ISBN ...",0


In [63]:
# Column dtypes and non-null counts of the loaded data
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Market                  240 non-null    object
 1   Ministry                240 non-null    object
 2   Department              240 non-null    int64 
 3   Type_Of_Visit           240 non-null    object
 4   Survey_Completion_Date  240 non-null    object
 5   Survey_Type             240 non-null    object
 6   Survey_Project          240 non-null    int64 
 7   Survey_ID               240 non-null    int64 
 8   Gender                  240 non-null    object
 9   Age_Group               240 non-null    object
 10  LTR_Facility            240 non-null    int64 
 11  LTR_Doctor              240 non-null    int64 
 12  Anything_Outstanding    240 non-null    object
 13  Improve_Stay            240 non-null    object
 14  publish_hour_UTC        240 non-null    object
dtypes: int

In [39]:
def get_doc_sentiment(comment):
    """
    Run sentiment analysis on a single comment through the Natural Language client.

    Returns the document_sentiment of the API response, or the input unchanged
    when the comment is None.
    """
    # Nothing to analyse: hand the missing value straight back
    if comment is None:
        return comment

    document = language_v1.Document(
        content=comment,
        type_=language_v1.Document.Type.PLAIN_TEXT,
        language='en',
    )
    response = client.analyze_sentiment(request={'document': document})
    return response.document_sentiment

In [46]:
def combine_frames(split_frames):
    """
    Stack a sequence of DataFrames vertically, in order, keeping their index labels.

    split_frames = non-empty list of DataFrames (e.g. the chunks of one frame)

    Returns one DataFrame containing all rows.
    """
    # A single pd.concat replaces repeated DataFrame.append calls
    # (append was removed in pandas 2.0 and copied the data on every iteration)
    return pd.concat(list(split_frames))

In [53]:
def get_sentiment_score(response):
    """
    Return the .score of a sentiment response.

    Missing responses (None or any float NaN) are passed through unchanged, so
    rows without a comment keep their missing value.
    """
    # Test by value: an identity check against np.nan misses None and NaN
    # objects that are not the np.nan singleton
    if response is None or (isinstance(response, float) and np.isnan(response)):
        return response
    return response.score

def get_sentiment_magnitude(response):
    """
    Return the .magnitude of a sentiment response.

    Missing responses (None or any float NaN) are passed through unchanged, so
    rows without a comment keep their missing value.
    """
    # Test by value: an identity check against np.nan misses None and NaN
    # objects that are not the np.nan singleton
    if response is None or (isinstance(response, float) and np.isnan(response)):
        return response
    return response.magnitude


In [69]:
def get_overall_sentiment(df, target_column, chunk_size=599):
    """
    Score the comments of one column with the Natural Language API.

    df = data
    target_column = name of the column in df containing the text to analyze
    chunk_size = maximum number of requests sent before pausing for a minute;
                 the API quota is currently 600 requests/minute

    Returns a new DataFrame: df plus '<target_column>_sentiment_score' and
    '<target_column>_sentiment_magnitude'. Rows without a comment get a missing
    value in both columns.
    """
    sentiment_column = f'{target_column}_sentiment'
    sleep_time = 60

    #Consider only records containing comments
    comments = df.loc[df[target_column].notna(), target_column]

    #Fewest chunks that keep every chunk within chunk_size (always at least one)
    chunks_ = max(1, -(-len(comments) // chunk_size))
    chunk_positions = np.array_split(np.arange(len(comments)), chunks_)

    #Make API calls, one chunk at a time
    responses = []
    for count, positions in enumerate(chunk_positions, start=1):
        t1 = time.time()
        responses.append(comments.iloc[positions].apply(get_doc_sentiment))
        t2 = time.time()

        if count < chunks_:
            print(f'Completed {count} of {chunks_} in {(t2-t1):.1f}s.  Waiting for {int(sleep_time)} sec.')
            time.sleep(sleep_time)
        else:
            #No need to sleep when no more chunks to analyze
            print(f'Completed {count} of {chunks_} in {(t2-t1):.1f}s.')

    #Join the API responses onto the original data by index label
    sentiment = pd.concat(responses).rename(sentiment_column)
    df_result = df.merge(sentiment, left_index=True, right_index=True, how='left')

    df_result[f'{target_column}_sentiment_score'] = df_result[sentiment_column].apply(get_sentiment_score)
    df_result[f'{target_column}_sentiment_magnitude'] = df_result[sentiment_column].apply(get_sentiment_magnitude)

    #The raw API response objects are not part of the result
    return df_result.drop(sentiment_column, axis=1)

In [70]:
# Add sentiment score/magnitude columns for the 'Anything_Outstanding' comments
df_data_sentiment = get_overall_sentiment(df_data, 'Anything_Outstanding')

Completed 1 of 1 in 11.6s.


In [71]:
# Second pass on the already-scored frame: add the 'Improve_Stay' sentiment columns
# (re-binds df_data_sentiment to the frame that now carries both sets of columns)
df_data_sentiment = get_overall_sentiment(df_data_sentiment, 'Improve_Stay')

Completed 1 of 1 in 11.7s.


In [72]:
# Display the data together with the four sentiment columns
df_data_sentiment

Unnamed: 0,Market,Ministry,Department,Type_Of_Visit,Survey_Completion_Date,Survey_Type,Survey_Project,Survey_ID,Gender,Age_Group,LTR_Facility,LTR_Doctor,Anything_Outstanding,Improve_Stay,publish_hour_UTC,Anything_Outstanding_sentiment_score,Anything_Outstanding_sentiment_magnitude,Improve_Stay_sentiment_score,Improve_Stay_sentiment_magnitude
0,TX,MIDET,28308,In Person,08-26-2022,HCAHPS,18590,22148,Female,18-24,1,2,Awarded as recounting their personal opinions ...,"Brought high (Ellis, 2007, p. 4). Relational e...",0,-0.5,0.5,-0.1,0.2
1,TN,MIDET,25540,In Person,08-26-2022,AMG,17610,16793,Female,18-24,1,3,Races 1.7% prove useful,From −9 of residual heat from the Greek,0,0.6,0.6,0.0,0.0
2,MI,TNNAS,13285,In Person,08-26-2022,AMG,29275,17373,Female,18-24,1,9,To newly-built missions which the lithosphere ...,"Lower can about approaches, scientific method ...",0,0.2,0.4,0.0,0.0
3,FL,MIDET,12688,Virtual,08-26-2022,ED,24403,27074,Female,18-24,1,4,When taken highly specialized. Peer review doe...,Elsewhere across anticipated events,0,0.2,0.5,-0.3,0.3
4,TX,TNNAS,14117,Virtual,08-26-2022,OAS,18980,20761,Female,18-24,1,2,Election with of robots' limbs. It would be in...,Amsterdam: Nijgh 2.0%. Hispanics,0,0.0,0.2,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,MI,FLJAC,23713,In Person,08-26-2022,ED,14736,29060,Female,35-39,10,2,"Faking the year. Lori Ann Wagner, a psychother...","Of communist networks represented, along with ...",0,0.0,0.1,-0.6,0.6
236,TN,TXAUS,12121,In Person,08-26-2022,HCAHPS,15496,14659,Male,35-39,10,9,"Problems. Evolutionary Norfolk. Suffolk, which...","The fifth 1993, which provided for by Law and ...",0,-0.1,1.0,0.1,0.1
237,FL,TNNAS,15268,In Person,08-26-2022,AMG,25419,12410,Female,35-39,10,6,Catalogued over discussed by a young age to he...,"Art, exhibited It started when Archduke Franz ...",0,0.0,0.0,0.0,0.0
238,TN,FLJAC,20505,Virtual,08-26-2022,OAS,26318,26797,Female,35-39,10,6,Placed secession terrorism in Argentina and Br...,"Revenue than Francisco, CA: Jossey-Bass. ISBN ...",0,-0.5,0.5,0.1,0.4


## Turn into a pipeline

In [127]:
@component(#output_component_file="NLP_sentiment.yaml", 
           base_image="python:3.7",
           packages_to_install = ["google-cloud-language", 
                                  "pandas", 
                                  "db-dtypes", 
                                  "google-cloud-bigquery", 
                                  "google-cloud-aiplatform", 
                                  "datetime", 
                                  "fsspec", 
                                  "gcsfs"]
          )
def get_overall_sentiment(hour_pair:list, target_column:str, chunk_size:int=599):
    """
    Pipeline component: score survey comments, predict LTR ratings and store the result.

    Steps, in order:
      1. Load the two hourly tables named by hour_pair from BigQuery.
      2. Call the Natural Language API for 'Anything_Outstanding', then 'Improve_Stay'.
      3. Label every row with the model type matching the inputs it has.
      4. Request LTR_Facility / LTR_Doctor predictions from the Vertex AI endpoints.
      5. Append the rows to BigQuery table CX.CX_Stream_hour_partition_processed
         and write them as CSV to the pipeline bucket.

    hour_pair = two hour labels, e.g. [' 0', ' 1']; a space is replaced by '0'
    target_column = not read by the body; both comment columns are always processed
    chunk_size = maximum number of API requests sent before pausing for a minute

    NOTE(review): this definition re-uses the name of the notebook-level
    get_overall_sentiment(df, target_column, chunk_size) defined earlier.
    """

    project_id = "curious-skyline-360213"

    ### Imports ###
    import time
    from datetime import datetime
    # Imported explicitly so the List[Dict] annotation below does not depend on
    # names injected around the function source
    from typing import Dict, List

    import numpy as np
    import pandas as pd

    from google.cloud import aiplatform
    from google.cloud import bigquery
    from google.cloud import language_v1

    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

    nlp_client = language_v1.LanguageServiceClient()
    bq_client = bigquery.Client(location="us-west1", project=project_id)


    ### Helper functions ###
    def load_data_from_bq(hour_pair):
        """Return the rows of both hourly tables (UNION ALL) as one DataFrame."""
        query =f"""
        SELECT 
        *
        FROM `curious-skyline-360213.CX.CX_Data_{hour_pair[0].replace(' ','0')}`

        UNION ALL

        SELECT 
        *
        FROM `curious-skyline-360213.CX.CX_Data_{hour_pair[1].replace(' ','0')}`

        """

        query_job = bq_client.query(query)

        return query_job.to_dataframe()


    def get_doc_sentiment(comment):
        """
        Returns .Sentiment response from API call or passes empty comment
        """
        if comment is None:
            return comment

        document = language_v1.Document(content=comment,
                                        type_=language_v1.Document.Type.PLAIN_TEXT,
                                        language='en')
        return nlp_client.analyze_sentiment(request={'document': document}).document_sentiment

    def is_missing(response):
        """True for None or any float NaN (tested by value, not by identity with np.nan)."""
        return response is None or (isinstance(response, float) and np.isnan(response))

    def get_sentiment_score(response):
        """Return response.score; missing responses are passed through unchanged."""
        return response if is_missing(response) else response.score

    def get_sentiment_magnitude(response):
        """Return response.magnitude; missing responses are passed through unchanged."""
        return response if is_missing(response) else response.magnitude

    def process_target(df, target_column):
        """
        Add '<target_column>_sentiment' (raw API response), '_sentiment_score' and
        '_sentiment_magnitude' columns to a copy of df and return it.
        """
        sentiment_column = f'{target_column}_sentiment'
        sleep_time = 60

        #Consider only records containing comments
        comments = df.loc[df[target_column].notna(), target_column]

        #Fewest chunks that keep every chunk within chunk_size (always at least one)
        chunks_ = max(1, -(-len(comments) // chunk_size))
        chunk_positions = np.array_split(np.arange(len(comments)), chunks_)

        #Make API calls, one chunk at a time
        responses = []
        for count, positions in enumerate(chunk_positions, start=1):
            t1 = time.time()
            responses.append(comments.iloc[positions].apply(get_doc_sentiment))
            t2 = time.time()

            if count < chunks_:
                print(f'Completed {count} of {chunks_} in {(t2-t1):.1f}s.  Waiting for {int(sleep_time)} sec.')
                time.sleep(sleep_time)
            else:
                #No need to sleep when no more chunks to analyze
                print(f'Completed {count} of {chunks_} in {(t2-t1):.1f}s.')

        #Join the API responses onto the original data by index label
        sentiment = pd.concat(responses).rename(sentiment_column)
        df_result = df.merge(sentiment, left_index=True, right_index=True, how='left')

        df_result[f'{target_column}_sentiment_score'] = df_result[sentiment_column].apply(get_sentiment_score)
        df_result[f'{target_column}_sentiment_magnitude'] = df_result[sentiment_column].apply(get_sentiment_magnitude)

        return df_result

    def model_selection(df, row, ltr_target):
        """
        Assign model type based on available input.
        ltr_target either of ['LTR_Facility'] or ['LTR_Doctor']

        df is accepted for call compatibility but not read.

        NOTE(review): the "Comments_and_LTR" test is true whenever any of the
        inputs is present, so the "Improve_and_LTR" and "Outstanding_and_LTR"
        branches below it cannot be reached for such rows. Only
        "Comments_and_LTR" has an endpoint configured further down, so the
        ordering is left as it was - confirm the intended behaviour.
        """
        improve_only = ["Improve_Stay_sentiment_score", "Improve_Stay_sentiment_magnitude"]
        outstanding_only = ["Anything_Outstanding_sentiment_score", "Anything_Outstanding_sentiment_magnitude"]

        def model_selector(row):
            #Improve only
            if np.all([pd.isna(row[item]) for item in outstanding_only + ltr_target]) & ~np.all([pd.isna(row[item]) for item in improve_only]):
                return "Improve_only"

            #Outstanding only
            elif np.all([pd.isna(row[item]) for item in improve_only + ltr_target]) & ~np.all([pd.isna(row[item]) for item in outstanding_only]):
                return "Outstanding_only"

            #All comments
            elif np.all([pd.isna(row[item]) for item in ltr_target]) & ~np.all([pd.isna(row[item]) for item in improve_only+outstanding_only]):
                return "Comments_only"

            #All comments + rating
            elif ~np.all([pd.isna(row[item]) for item in improve_only+outstanding_only+ltr_target]):
                return "Comments_and_LTR"

            #Improve and rating
            elif np.all([pd.isna(row[item]) for item in outstanding_only]) & ~np.all([pd.isna(row[item]) for item in improve_only+ltr_target]):
                return "Improve_and_LTR"

            #Outstanding and rating
            elif np.all([pd.isna(row[item]) for item in improve_only]) & ~np.all([pd.isna(row[item]) for item in outstanding_only+ltr_target]):
                return "Outstanding_and_LTR"

            else:
                return "Model_selection_error"

        return model_selector(row)


    def predict_tabular_regression_sample(
        endpoint_name: str,
        instances: List[Dict],
        location: str = 'us-west1',
        project: str = "348611359036"
        ):
        """Send instances to the Vertex AI endpoint and return the 'value' of the first prediction."""
        aiplatform.init(project=project, location=location)
        endpoint = aiplatform.Endpoint(endpoint_name)
        result = endpoint.predict(instances=instances)

        return result.predictions[0]['value']


    #Need a dictionary for endpoints and model_selection to select the right model endpoint based on available data.
    #Only the "Comments_and_LTR" model type is configured; any other label raises KeyError in the prediction step.
    Facility_Model_Endpoint_Select = {"Comments_and_LTR": "143006880355057664"}
    Facility_Model_Columns_Select = {"Comments_and_LTR": ['Anything_Outstanding_sentiment_score', 
                                                                 'Anything_Outstanding_sentiment_magnitude', 
                                                                 'Improve_Stay_sentiment_score', 
                                                                 'Improve_Stay_sentiment_magnitude', 
                                                                 'LTR_Doctor']}


    Doctor_Model_Endpoint_Select = {"Comments_and_LTR" : "1189530843765276672"}
    Doctor_Model_Columns_Select = {"Comments_and_LTR": ['Anything_Outstanding_sentiment_score', 
                                                                 'Anything_Outstanding_sentiment_magnitude', 
                                                                 'Improve_Stay_sentiment_score', 
                                                                 'Improve_Stay_sentiment_magnitude', 
                                                                 'LTR_Facility']}


    def generate_instance(model_select, df_row, target):
        """Pair the model's input column names with the values of df_row, in order."""
        if target == 'Doctor':
            return dict(zip(Doctor_Model_Columns_Select[model_select], [item for item in df_row]) )
        else:
            return dict(zip(Facility_Model_Columns_Select[model_select], [item for item in df_row]) )


    def assign_status(value):
        """Map a rating to "Promoter" (>= 9), "Neutral" (>= 7) or "Detractor" (anything else)."""
        if value >= 9:
            return "Promoter"
        # Lower bound only: predictions are floats, and a value such as 8.5
        # must not fall through to "Detractor"
        elif value >= 7:
            return "Neutral"
        else:
            return "Detractor"



    ### MAIN ###    

    #Process first set of comments
    df_result_AO = process_target(load_data_from_bq(hour_pair), "Anything_Outstanding")

    #Need to sleep, because process_target does not sleep on last run
    time.sleep(60)

    #Process second set of comments
    df_result = process_target(df_result_AO, "Improve_Stay")

    #Drop API response
    df_result.drop(["Anything_Outstanding_sentiment", "Improve_Stay_sentiment"], axis=1, inplace=True)

    #Model selection
    df_result['Facility_Model'] = df_result.apply(lambda z: model_selection(df_result, z, ["LTR_Facility"]), axis=1)
    df_result['Doctor_Model'] = df_result.apply(lambda z: model_selection(df_result, z, ["LTR_Doctor"]), axis=1)

    ### Assign P/N/D Status on survey data ###
    df_result['LTR_Doctor_Status'] = df_result['LTR_Doctor'].apply(assign_status)
    df_result['LTR_Facility_Status'] = df_result['LTR_Facility'].apply(assign_status)

    #For some reason model treats this as a string, rather than integer
    df_result['LTR_Facility'] = df_result['LTR_Facility'].apply(str)
    df_result['LTR_Doctor'] = df_result['LTR_Doctor'].apply(str)

    ### Prediction ###
    ### Serve prediction
    df_result['Facility_Instance'] = df_result.apply(lambda z: generate_instance(z['Facility_Model'], z[Facility_Model_Columns_Select[z['Facility_Model']]], 'Facility'), axis=1)
    df_result['Predict_LTR_Facility'] = df_result.apply(lambda z: predict_tabular_regression_sample(endpoint_name=Facility_Model_Endpoint_Select[z['Facility_Model']], instances=[z['Facility_Instance']] ), axis=1)

    df_result['Doctor_Instance'] = df_result.apply(lambda z: generate_instance(z['Doctor_Model'], z[Doctor_Model_Columns_Select[z['Doctor_Model']]], 'Doctor'), axis=1)
    df_result['Predict_LTR_Doctor'] = df_result.apply(lambda z: predict_tabular_regression_sample(endpoint_name=Doctor_Model_Endpoint_Select[z['Doctor_Model']], instances=[z['Doctor_Instance']] ), axis=1)

    df_result.drop(['Facility_Instance', 'Doctor_Instance'], axis=1, inplace=True)

    ### Assign P/N/D Status on predictions ###
    df_result['Predict_LTR_Doctor_Status'] = df_result['Predict_LTR_Doctor'].apply(assign_status)
    df_result['Predict_LTR_Facility_Status'] = df_result['Predict_LTR_Facility'].apply(assign_status)


    #Write result to BQ        
    job_config = bigquery.LoadJobConfig(
    # BigQuery appends loaded rows to an existing table by default; the
    # disposition is stated explicitly (WRITE_TRUNCATE would replace the table).
    write_disposition='WRITE_APPEND',
    # NOTE(review): partitions on a 'publish_time' field; confirm the loaded
    # frame / destination table carries that column.
    time_partitioning = bigquery.TimePartitioning(type_=bigquery.TimePartitioningType.HOUR, field='publish_time')
    )

    table_ref = bigquery.TableReference(dataset_ref=bigquery.DatasetReference(project=project_id, dataset_id='CX'), table_id='CX_Stream_hour_partition_processed')
    job = bq_client.load_table_from_dataframe(dataframe=df_result, 
                                              destination=table_ref, 
                                              job_config=job_config
                                             )
    job.result()  # Wait for the job to complete.

    table = bq_client.get_table(table_ref)  # Make an API request.
    print('Loaded {} rows and {} columns to {}'.format(table.num_rows, len(table.schema), table_ref))    

    ### Save to bucket as .csv for further processing
    # Layout: ProcessedData/<YYYYMMDD>/<hhhh>/<hhhh>_<YYYYMMDD>.csv
    df_result.to_csv(f"gs://curious-skyline/Projects/CX/CXpipeline/ProcessedData/{TIMESTAMP[:8]}/{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}/{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}_{TIMESTAMP[:8]}.csv")

In [21]:
@component(#output_component_file="NLP_sentiment.yaml", 
           base_image="python:3.7",
           packages_to_install = [
                                  "pandas", 
                                  "db-dtypes", 
                                  "google-cloud-bigquery", 
                                  "google-cloud-aiplatform", 
                                  "datetime", 
                                  "fsspec", 
                                  "gcsfs",
           ])
def topic_modeling(hour_pair:list):
    """
    Pipeline component (work in progress): load the processed CSV of one hour pair.

    hour_pair = two hour labels, e.g. [' 0', ' 1']; a space is replaced by '0'

    Currently only reads the file into a DataFrame; nothing is returned yet.
    """

    ### Imports ###
    import pandas as pd
    from datetime import datetime

    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

    ### Load most recently created, processed hour pair data
    # Same layout the sentiment component writes:
    #   ProcessedData/<YYYYMMDD>/<hhhh>/<hhhh>_<YYYYMMDD>.csv
    # NOTE(review): the date comes from "now", so the file must have been
    # written on the same calendar day as this step runs.
    hour_pair_str = f"{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}"
    df_data = pd.read_csv(f"gs://curious-skyline/Projects/CX/CXpipeline/ProcessedData/{TIMESTAMP[:8]}/{hour_pair_str}/{hour_pair_str}_{TIMESTAMP[:8]}.csv")
    
        
    

In [9]:
# Read one processed result file (name prefix 0001 plus a run timestamp) from the pipeline bucket
df_test = pd.read_csv("gs://curious-skyline/Projects/CX/CXpipeline/ProcessedData/0001_20220907185538")

In [31]:
# Display the processed data, including the predicted LTR values and statuses
df_test

Unnamed: 0.1,Unnamed: 0,Market,Ministry,Department,Type_Of_Visit,Survey_Completion_Date,Survey_Type,Survey_Project,Survey_ID,Gender,...,Improve_Stay_sentiment_score,Improve_Stay_sentiment_magnitude,Facility_Model,Doctor_Model,LTR_Doctor_Status,LTR_Facility_Status,Predict_LTR_Facility,Predict_LTR_Doctor,Predict_LTR_Doctor_Status,Predict_LTR_Facility_Status
0,0,TN,TXAUS,22884,Virtual,08-26-2022,HCAHPS,14201,22379,Male,...,-0.7,0.7,Comments_and_LTR,Comments_and_LTR,Promoter,Detractor,4.707859,5.866497,Detractor,Detractor
1,1,MI,TNNAS,26089,Virtual,08-26-2022,HCAHPS,11027,15600,Male,...,-0.1,0.1,Comments_and_LTR,Comments_and_LTR,Detractor,Detractor,4.793215,5.167792,Detractor,Detractor
2,2,TX,FLJAC,15936,Virtual,08-26-2022,HCAHPS,13852,12294,Male,...,-0.1,0.1,Comments_and_LTR,Comments_and_LTR,Detractor,Detractor,4.897707,5.300736,Detractor,Detractor
3,3,MI,MIDET,24274,Virtual,08-26-2022,HCAHPS,25211,12925,Male,...,0.0,0.0,Comments_and_LTR,Comments_and_LTR,Promoter,Detractor,4.906943,6.324744,Detractor,Detractor
4,4,TN,FLJAC,27288,Virtual,08-26-2022,ED,11273,12745,Female,...,0.1,0.3,Comments_and_LTR,Comments_and_LTR,Promoter,Detractor,4.960391,6.141251,Detractor,Detractor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1065,FL,FLJAC,15746,Virtual,09-02-2022,ED,22577,21026,Female,...,0.0,0.0,Comments_and_LTR,Comments_and_LTR,Detractor,Detractor,4.886615,4.992750,Detractor,Detractor
1066,1066,FL,TXAUS,10701,Virtual,09-02-2022,ED,15666,25560,Female,...,0.0,0.0,Comments_and_LTR,Comments_and_LTR,Detractor,Neutral,4.952349,5.679932,Detractor,Detractor
1067,1067,TN,TXAUS,18090,Virtual,09-02-2022,ED,22580,22710,Female,...,0.1,0.4,Comments_and_LTR,Comments_and_LTR,Neutral,Neutral,5.047003,5.260039,Detractor,Detractor
1068,1068,TN,TNNAS,14968,Virtual,09-02-2022,OAS,10795,29894,Female,...,0.1,0.1,Comments_and_LTR,Comments_and_LTR,Neutral,Detractor,4.829739,5.462848,Detractor,Detractor


In [11]:
# Work on an independent deep copy so df_test itself stays untouched
df_nlp = df_test.copy(deep=True)

In [50]:
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt
from plotly import graph_objects as go

import gcsfs

In [13]:
#https://github.com/hrishi-ds/Medium/blob/main/Visualize-Confusion-Matrix-Using-Sankey-Diagram/visualise-confusion-matrix-using-sankey.ipynb

def transform_confusion_matrix(cf_matrix_, targets_list=None):
    """
    function to transform confusion matrix to dataframe needed to plot Sankey chart
    
    returns a dataframe and list of unique labels for Sankey chart nodes
    
    Parameters
    --------------
    cf_matrix_ : numpy.ndarray
        The (square) confusion matrix to be visualised
    targets_list : {'list', 'numpy.ndarray'}, optional
        List of unique classes, in the row/column order of the matrix. When
        omitted, nodes are named "True Class-i" / "Predicted Class-i".

    Returns
    --------------
    df : pandas.DataFrame
        One row per matrix cell with columns source, target (node indices
        into labels), value, colour and tooltip
    labels : array-like of str
        Node labels: all source labels followed by all target labels
    """
    # green marks correct predictions, red marks incorrect predictions
    correct_colour = "rgba(211,255,216,0.6)"
    incorrect_colour = "rgba(245,173,168,0.6)"

    # name the rows (true side) and columns (predicted side) of the matrix
    if targets_list is None:
        n_classes = cf_matrix_.shape[0]
        row_names = [f"True Class-{i+1}" for i in range(n_classes)]
        column_names = [f"Predicted Class-{i+1}" for i in range(n_classes)]
    else:
        row_names = [f"Rating {i}" for i in targets_list]
        column_names = [f"Sentiment {i}" for i in targets_list]

    # one row per (source, target) cell of the matrix
    df = pd.DataFrame(data=cf_matrix_, index=row_names, columns=column_names).stack().reset_index()
    df.columns = ['source', 'target', 'value']

    # a link is "correct" when both labels agree once their first word is dropped
    source_classes = [' '.join(name.split()[1:]) for name in df['source']]
    target_classes = [' '.join(name.split()[1:]) for name in df['target']]
    is_correct = [src == tgt for src, tgt in zip(source_classes, target_classes)]

    df["colour"] = [correct_colour if ok else incorrect_colour for ok in is_correct]

    # extract unique values from source and target columns
    labels = pd.concat([df.source, df.target]).unique()

    # replace each label by its position in labels (per-column map instead of
    # the deprecated DataFrame.applymap)
    labels_indices = {label: index for index, label in enumerate(labels)}
    df["source"] = df["source"].map(labels_indices)
    df["target"] = df["target"].map(labels_indices)

    # text displayed when hovering over a link
    df["tooltip"] = [
        f"{value} {src} instances correctly classified as {tgt}" if ok
        else f"{value} {src} instances misclassified as {tgt}"
        for value, src, tgt, ok in zip(df['value'], source_classes, target_classes, is_correct)
    ]

    return df, labels
    

def plot_sankey_for_confusion_matrix(df, labels, title):
    """
    Build a plotly Sankey figure from the link table and node labels.

    df = link table with columns source, target, value, colour and tooltip
    labels = node labels
    title = figure title

    Returns the plotly figure (510 x 450, font size 13).
    """
    # Node styling; the hover template shows the node label and its total flow
    node_spec = dict(
        pad = 20,
        thickness = 20,
        line = dict(color = "black", width = 1.0),
        label = labels,
        hovertemplate = "%{label} has total %{value:d} instances<extra></extra>",
    )

    # One link per row of df; the hover text is taken from the tooltip column
    link_spec = dict(
        source = df.source,
        target = df.target,
        value = df.value,
        color = df.colour,
        customdata = df['tooltip'],
        hovertemplate = "%{customdata}<extra></extra>",
    )

    sankey_figure = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
    sankey_figure.update_layout(title_text=title, font_size=13, width=510, height=450)

    return sankey_figure

In [83]:
from google.cloud import storage

In [125]:
# Upload location for the evaluation artefacts. Defined here so that the cell
# runs on a fresh kernel (neither name is bound by an earlier cell).
# NOTE(review): hour pair 00/01 is assumed from the "0001_..." file loaded into df_test - confirm
hour_pair = (' 0', ' 1')
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Confusion matrix of the surveyed doctor status (rows) vs. the predicted status (columns)
cf_matrix = confusion_matrix(df_nlp['LTR_Doctor_Status'], df_nlp['Predict_LTR_Doctor_Status'])

# Render the matrix as a Sankey diagram and upload it as PNG to the pipeline bucket
df,labels = transform_confusion_matrix(cf_matrix, ['Detractor', 'Neutral', 'Promoter']) 
plot_ = plot_sankey_for_confusion_matrix(df, labels, "LTR Doctor: Rating vs. Feedback Sentiment")
storage.Client(project=project_id).bucket("curious-skyline").blob(f"Projects/CX/CXpipeline/ProcessedData/{TIMESTAMP[:8]}/{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}/ltr_doctor_clf_snakey.png").upload_from_string(plot_.to_image(format = 'png'))


In [90]:
from io import BytesIO

In [126]:
# Heatmap of the doctor-status confusion matrix plus a classification report,
# both uploaded to the pipeline bucket.
# NOTE(review): relies on cf_matrix, TIMESTAMP and hour_pair already existing in the kernel.
fig = plt.figure(figsize=(6,4))

categories = ["Detractor", "Neutral", "Promoter"]
# Each cell is annotated with its count and its share of all instances
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_counts,group_percentages)]
# The 3x3 shape is hard-coded to match the three categories
labels = np.asarray(labels).reshape(3,3)

# fmt="" because the annotations are ready-made strings
sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='Blues', cbar=False, xticklabels=categories, yticklabels=categories )

plt.ylabel("Patient NPS")
plt.xlabel("NLP Predicted NPS")
# Write the figure into an in-memory buffer instead of a local file
buffer_ = BytesIO()
_ = plt.savefig(buffer_, format='png')
#Close to prevent display
plt.close(fig)
storage.Client(project=project_id).bucket("curious-skyline").blob(f"Projects/CX/CXpipeline/ProcessedData/{TIMESTAMP[:8]}/{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}/ltr_doctor_clf_matrix.png").upload_from_string(buffer_.getvalue())

report = classification_report(df_nlp['LTR_Doctor_Status'], df_nlp['Predict_LTR_Doctor_Status'],  output_dict=True, zero_division=0)
# Replace the scalar accuracy entry by a row-shaped dict so it fits the per-class table:
# the accuracy goes into the f1-score column, support is copied from the macro average
report.update({"accuracy": {"precision": None, "recall": None, "f1-score": report["accuracy"], "support": report['macro avg']['support']}})

# One row per class / aggregate, one column per metric
report_ = pd.DataFrame(report).T
# Upload the report as CSV text
storage.Client(project=project_id).bucket("curious-skyline").blob(f"Projects/CX/CXpipeline/ProcessedData/{TIMESTAMP[:8]}/{hour_pair[0].replace(' ','0')}{hour_pair[1].replace(' ','0')}/ltr_doctor_clf_report.csv").upload_from_string(report_.to_csv())

In [128]:
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
hours = [' 0', ' 1', ' 2', ' 3', ' 4', ' 5', ' 6', ' 7', ' 8', ' 9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']

# create working dir to pass to job spec
WORKING_DIR = f"{pipeline_root_path}/{TIMESTAMP}"


# Compile one pipeline spec per two-hour pair: cx-pipeline-nlp-sentiment-hours-<hhhh>.json
for pair in zip(hours[: :2], hours[1: :2]):

    hour_pair_str = pair[0].replace(' ', '0')+pair[1].replace(' ', '0')

    # Decorator from the v2 namespace, matching the v2 compiler and the v2
    # @component used for the steps (the v1 kfp.dsl namespace triggers a
    # deprecation warning from the v2 compiler)
    @kfp.v2.dsl.pipeline(
        name=f"cx-pipeline-nlp-sentiment-hours-{hour_pair_str}",
        description="NLP sentiment process",
        pipeline_root=pipeline_root_path)
    def pipeline(project_id:str, 
                 hour_pair:list):
        # NOTE(review): project_id is declared as a pipeline parameter but not
        # forwarded to any step.

        #Cloud Natural Language API sentiment score and sentiment magnitude
        #Model selection task based on available comments and scores
        #Component inputs are passed by keyword
        nlp_sentiment_and_model_selection = get_overall_sentiment(hour_pair=hour_pair, target_column="Anything_Outstanding", chunk_size=599)

    compiler.Compiler().compile(
        pipeline_func=pipeline,
        package_path=f'cx-pipeline-nlp-sentiment-hours-{hour_pair_str}.json'
    )


APIs imported from the v1 namespace (e.g. kfp.dsl, kfp.components, etc) will not be supported by the v2 compiler since v2.0.0



In [129]:
# Values for the two parameters declared by the compiled pipeline function
pipeline_parameters = {
    'project_id': project_id,
    'hour_pair': [' 0', ' 1'],
}

# Submit the spec compiled for the hour pair 00/01
job = aip.PipelineJob(
    display_name="cx-pipeline-nlp-sentiment-hours-0001",
    template_path='cx-pipeline-nlp-sentiment-hours-0001.json',
    pipeline_root=pipeline_root_path,
    parameter_values=pipeline_parameters,
)

job.submit()

Creating PipelineJob
PipelineJob created. Resource name: projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-nlp-sentiment-hours-0001-20220907221022
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/348611359036/locations/us-central1/pipelineJobs/cx-pipeline-nlp-sentiment-hours-0001-20220907221022')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/cx-pipeline-nlp-sentiment-hours-0001-20220907221022?project=348611359036
