# Kubeflow Pipelines Introduction

### Import Libraries

In [415]:
import pandas as pd
import numpy as np
import kfp
import seaborn as sb
from kfp import compiler
import kfp.components as comp
import kfp.dsl as dsl


### Kubeflow Client

In [468]:
# Connect to the Kubeflow Pipelines API at the given host URL. `client` is
# reused further down to create the experiment and to submit the pipeline run.
client = kfp.Client(host='https://18bbb292bf653151-dot-us-central2.pipelines.googleusercontent.com')

### Pipeline Parameters

In [434]:
# Staging location and container image settings for the pipeline.
# NOTE(review): neither constant is referenced by the cells shown in this file.
# The component functions hard-code their own bucket paths and the component
# factories pass base_image='python:3.7'. Confirm whether these are still needed.
OUTPUT_DIR = 'gcs://crazy-hippo-01/kubeflow_staging/'
BASE_IMAGE = 'gcr.io/deeplearning-platform-release/tf2-cpu.2-4'

### Pipeline Component - READ & TRANSFORM

In [435]:
from typing import NamedTuple
def read_transform_data() -> NamedTuple(
      'ComponentOutputs',
      [
        ('training_data', str),
        ('test_data', str),
        ('validation_data', str),
      ]):
    """Query the census table, split it, and stage the splits as CSV files.

    Reads the selected columns from BigQuery, drops rows containing nulls,
    holds out 20% of the rows for testing and then 20% of the remainder for
    validation, and writes each split to the staging bucket.

    All imports live inside the function because it is turned into a
    self-contained lightweight component.

    Returns:
        A ``ComponentOutputs`` namedtuple with the paths of the training,
        test and validation CSV files, in that order.
    """
    import time
    from collections import namedtuple

    from google.cloud.bigquery import Client
    from sklearn.model_selection import train_test_split

    #Initiate BigQuery Client
    client = Client()

    query = """SELECT age, workclass, occupation, education_num, marital_status, capital_gain, income_bracket
    FROM `crazy-hippo-01.census_data_us.census_raw` 
    """
    #Run Query
    job = client.query(query)
    df = job.to_dataframe()

    #Drop null values in dataset
    df = df.dropna()

    #Create training, test and validation datasets
    train, test = train_test_split(df, test_size=0.20, random_state=42)
    train, val = train_test_split(train, test_size=0.20, random_state=42)

    #Define Staging Bucket in GCS
    BUCKET = 'gcs://crazy-hippo-01/kubeflow_staging/'

    # Take the timestamp once so that the three files written by one run
    # always share the same suffix, even if the clock ticks over in between.
    run_stamp = str(int(time.time()))

    #Define Datasets Names
    TRAIN_DATA = BUCKET + 'datasets/training/training{}'.format(run_stamp) + '.csv'
    TEST_DATA = BUCKET + 'datasets/testing/test{}'.format(run_stamp) + '.csv'
    VALIDATION_DATA = BUCKET + 'datasets/validation/validation{}'.format(run_stamp) + '.csv'

    #Write data to GCS Storage
    train.to_csv(TRAIN_DATA, index=False, header=True)
    test.to_csv(TEST_DATA, index=False, header=True)
    val.to_csv(VALIDATION_DATA, index=False, header=True)

    #Define outputs with namedtuple
    return_values = namedtuple(
      'ComponentOutputs',
        ['training_data', 'test_data', 'validation_data'])

    return return_values(TRAIN_DATA, TEST_DATA, VALIDATION_DATA)

In [436]:
# Package read_transform_data as a reusable pipeline component and also write
# its specification to read_transform_data.yaml.
read_transform_comp = comp.create_component_from_func(
    read_transform_data, 
    base_image='python:3.7',
    output_component_file='read_transform_data.yaml',
    packages_to_install=['pandas==0.24', 
                         'google-cloud-bigquery', 
                         'pyarrow' , 
                         'gcsfs', 
                         # 'sklearn' on PyPI is a deprecated placeholder whose
                         # installation fails; the real distribution that
                         # provides the `sklearn` module is 'scikit-learn'.
                         'scikit-learn']
)

### Pipeline Component - PREPROCESS and TRAIN MODEL

In [437]:
def train_model(TRAIN_DATA, TEST_DATA, VALIDATION_DATA) -> NamedTuple(
      'ComponentOutputs',
      [
        ('model_path', str)
      ]):
    """Train a binary classifier with built-in preprocessing layers and save it.

    The three CSV files are loaded with pandas. Normalization and category
    encoding layers are adapted on the training split and wired into a small
    dense network, which is trained for 10 epochs and saved under the models
    folder of the staging bucket.

    All imports and helpers live inside the function because it is turned
    into a self-contained lightweight component.

    Args:
        TRAIN_DATA: Path of the training CSV file.
        TEST_DATA: Path of the test CSV file (only loaded and previewed here).
        VALIDATION_DATA: Path of the validation CSV file.

    Returns:
        A ``ComponentOutputs`` namedtuple whose ``model_path`` is the
        location the trained model was saved to.
    """
    import time
    from collections import namedtuple

    import pandas as pd
    import tensorflow as tf
    from tensorflow.keras.layers.experimental import preprocessing

    #VARIABLES AND TRAINING PARAMETERS
    # Separate local names are used for the loaded frames so the path
    # arguments are not overwritten.
    TRAINING_DF = pd.read_csv(TRAIN_DATA)
    TESTING_DF = pd.read_csv(TEST_DATA)
    VALIDATION_DF = pd.read_csv(VALIDATION_DATA)

    BATCH_SIZE = 32

    print(tf.__version__)

    print(TRAINING_DF.head())

    print(TESTING_DF.head())

    print(VALIDATION_DF.head())

    #TENSORFLOW DATASET FUNCTION
    def helperfunc_create_dataset(dataframe, shuffle=True, batch_size=5):
        """Build a batched (features dict, label) dataset from a DataFrame."""
        dataframe = dataframe.copy()
        labels = dataframe.pop('income_bracket')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        ds = ds.batch(batch_size)
        ds = ds.prefetch(batch_size)
        return ds

    #NORMALIZATION FUNCTION
    def helperfunc_get_normalization_layer(name, dataset):
        """Return a Normalization layer adapted to feature `name` of `dataset`."""
        # Create a Normalization layer for our feature.
        normalizer = preprocessing.Normalization()

        # Prepare a Dataset that only yields our feature.
        feature_ds = dataset.map(lambda x, y: x[name])

        # Learn the statistics of the data.
        normalizer.adapt(feature_ds)

        return normalizer

    #CATEGORY ENCODING FUNCTION
    def helperfunc_get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
        """Return a callable that looks up and encodes feature `name`."""
        # Create a lookup layer which will turn raw values into integer indices
        if dtype == 'string':
            index = preprocessing.StringLookup(max_tokens=max_tokens)
        else:
            index = preprocessing.IntegerLookup(max_values=max_tokens)

        # Prepare a Dataset that only yields our feature
        feature_ds = dataset.map(lambda x, y: x[name])

        # Learn the set of possible values and assign them a fixed integer index.
        index.adapt(feature_ds)

        # Create the encoding layer for our integer indices.
        encoder = preprocessing.CategoryEncoding(max_tokens=index.vocab_size())

        # Prepare a Dataset that only yields the indexed feature.
        feature_ds = feature_ds.map(index)

        # Learn the space of possible indices.
        encoder.adapt(feature_ds)

        # Apply the encoding to our indices. The lambda function captures the
        # layers so we can use them, or include them in the functional model later.
        return lambda feature: encoder(index(feature))

    #CREATE TENSORFLOW DATASETS
    TRAIN_DS = helperfunc_create_dataset(TRAINING_DF, batch_size=BATCH_SIZE)
    VALIDATION_DS = helperfunc_create_dataset(VALIDATION_DF, shuffle=False, batch_size=BATCH_SIZE)

    #CREATE PREPROCESSING LAYERS
    ALL_INPUTS = []
    ENCODED_FEATURES = []

    NUMERICAL = ['age' , 'capital_gain']
    CATEGORICAL_INT_COLS = ['education_num']
    CATEGORICAL_STRING_COLS = ['occupation', 'workclass', 'marital_status']

    # Numeric features.
    for header in NUMERICAL:
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = helperfunc_get_normalization_layer(header, TRAIN_DS)
        encoded_numeric_col = normalization_layer(numeric_col)
        ALL_INPUTS.append(numeric_col)
        ENCODED_FEATURES.append(encoded_numeric_col)

    # Categorical features encoded as integers.
    for header in CATEGORICAL_INT_COLS:
        categorical_int_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
        encoding_layer = helperfunc_get_category_encoding_layer(header, TRAIN_DS, dtype='int64', max_tokens=5)
        encoded_categorical_int_col = encoding_layer(categorical_int_col)
        ALL_INPUTS.append(categorical_int_col)
        ENCODED_FEATURES.append(encoded_categorical_int_col)

    # Categorical features encoded as string.
    for header in CATEGORICAL_STRING_COLS:
        categorical_string_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
        encoding_layer = helperfunc_get_category_encoding_layer(header, TRAIN_DS, dtype='string', max_tokens=5)
        encoded_categorical_string_col = encoding_layer(categorical_string_col)
        ALL_INPUTS.append(categorical_string_col)
        ENCODED_FEATURES.append(encoded_categorical_string_col)

    #CREATE and COMPILE MODEL
    all_features = tf.keras.layers.concatenate(ENCODED_FEATURES)
    x = tf.keras.layers.Dense(32, activation="relu")(all_features)
    x = tf.keras.layers.Dropout(0.5)(x)
    output = tf.keras.layers.Dense(1)(x)
    model = tf.keras.Model(ALL_INPUTS, output)
    # The final Dense layer has no activation, so the model emits logits.
    # The plain "accuracy" string resolves to binary accuracy with a 0.5
    # threshold, which is only right for probabilities; for logits the
    # decision boundary is 0.0. The metric keeps the name 'accuracy'.
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

    #TRAIN MODEL
    model.fit(TRAIN_DS, epochs=10, validation_data=VALIDATION_DS)

    #Define Bucket in GCS for Model Storage
    BUCKET = 'gs://crazy-hippo-01/kubeflow_staging/models/'

    #Define Model Name
    MODEL_PATH = BUCKET + 'earnings_model{}'.format(str(int(time.time())))

    #Save model to Artifact Store for Project
    model.save(MODEL_PATH)

    print('Model saved to: ' + MODEL_PATH)

    #Define outputs with namedtuple
    return_values = namedtuple(
      'ComponentOutputs',
        ['model_path'])

    return return_values(MODEL_PATH)
    
    

In [438]:
# Package train_model as a pipeline component: the function runs in a
# python:3.7 container after the listed packages are installed, and the
# component specification is also written to train_model.yaml.
train_comp = comp.create_component_from_func(
    train_model,
    packages_to_install=[
        'pandas==0.24',
        'pyarrow',
        'gcsfs',
        'google-cloud-bigquery-storage',
        'tensorflow==2.3.2',
    ],
    output_component_file='train_model.yaml',
    base_image='python:3.7',
)

### Evaluate and Validate Model

In [475]:
def evaluate_validate_model(saved_model, test_dataset, pipeline, framework) -> NamedTuple(
      'ComponentOutputs',
      [
        ('mlpipeline_metrics', 'Metrics')  
      ]):
    """Evaluate a saved model on the test set and record its metrics.

    Loads the model from ``saved_model``, evaluates it on the CSV at
    ``test_dataset``, inserts one row with the results into the BigQuery
    metrics history table, and returns the metrics as a JSON string.

    All imports and helpers live inside the function because it is turned
    into a self-contained lightweight component.

    Args:
        saved_model: Location of the trained model to load.
        test_dataset: Path of the test CSV file.
        pipeline: Pipeline name stored alongside the metrics.
        framework: Framework name stored alongside the metrics.

    Returns:
        A ``ComponentOutputs`` namedtuple whose ``mlpipeline_metrics`` field
        is a JSON document holding the accuracy and loss.
    """
    import json
    from collections import namedtuple

    import pandas as pd
    import tensorflow as tf
    from google.cloud.bigquery import Client, QueryJobConfig, ScalarQueryParameter

    print(saved_model, test_dataset, pipeline, framework)

    #HELPER FUNCTION - TENSORFLOW DATASET FUNCTION
    def helperfunc_create_dataset(dataframe, shuffle=True, batch_size=5):
        """Build a batched (features dict, label) dataset from a DataFrame."""
        dataframe = dataframe.copy()
        labels = dataframe.pop('income_bracket')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        ds = ds.batch(batch_size)
        ds = ds.prefetch(batch_size)
        return ds

    #LOAD TRAINED MODEL FROM ARTIFACT STORE
    reloaded_model = tf.keras.models.load_model(saved_model)

    #READ TESTING DATASET
    TESTING_DATA = pd.read_csv(test_dataset)

    #SET BATCH SIZE
    BATCH_SIZE = 32

    #CALL HELPER FUNCTION TO CREATE TENSORFLOW DATASET
    TESTING_DS = helperfunc_create_dataset(TESTING_DATA, shuffle=False, batch_size=BATCH_SIZE)

    #EVALUATE MODEL WITH TEST DATA
    loss, accuracy = reloaded_model.evaluate(TESTING_DS)

    # Convert both values to built-in floats so they serialise to JSON and
    # bind as query parameters regardless of the numeric type returned.
    loss = float(loss)
    accuracy = float(accuracy)

    #PRINT ACCURACY METRIC
    print("Accuracy", accuracy)
    print("Loss", loss)

    metrics = {
      'metrics': [{
          'name': 'accuracy',
          'numberValue':  accuracy,
          'format': "PERCENTAGE",
        },{
          'name': 'loss',
          'numberValue':  loss,
    }]}

    #Write Metrics to BigQuery Table for Validation and possible promotion to Deployment

    #Initiate BigQuery Client
    client = Client()

    #Define DML Query to Insert Metrics into BigQuery
    # Values are bound as query parameters rather than formatted into the
    # SQL text, so quotes or other special characters in the model path,
    # pipeline name or framework name cannot break or alter the statement.
    query = """INSERT `crazy-hippo-01.census_data_us.model_metrics_history` (model_name, pipeline, framework, accuracy, loss)
    VALUES (@model_name, @pipeline, @framework, @accuracy, @loss)
    """
    job_config = QueryJobConfig(
        query_parameters=[
            ScalarQueryParameter('model_name', 'STRING', saved_model),
            ScalarQueryParameter('pipeline', 'STRING', pipeline),
            ScalarQueryParameter('framework', 'STRING', framework),
            ScalarQueryParameter('accuracy', 'FLOAT64', accuracy),
            ScalarQueryParameter('loss', 'FLOAT64', loss),
        ]
    )

    #Run Query
    job = client.query(query, job_config=job_config)
    # Block until the insert finishes so a failed job raises here instead of
    # going unnoticed.
    job.result()

    #Define outputs with namedtuple
    return_values = namedtuple(
      'ComponentOutputs',
        ['mlpipeline_metrics'])

    return return_values(json.dumps(metrics))

    




In [477]:
# Package evaluate_validate_model as a pipeline component and also write its
# specification to evaluate_model.yaml.
evaluate_validate_comp = comp.create_component_from_func(
    evaluate_validate_model, 
    base_image='python:3.7',
    output_component_file='evaluate_model.yaml',
    packages_to_install=['pandas==0.24',
                         'google-cloud-bigquery',
                         'pyarrow', 
                         'gcsfs',
                         # Pinned to the version the training component
                         # installs, so the saved model (which uses
                         # experimental preprocessing layers) is reloaded by
                         # the same TensorFlow release that wrote it.
                         'tensorflow==2.3.2']
)

### Create Pipeline

In [478]:
@dsl.pipeline(
  name='Tensorflow DL model with Integrated Preprocessing (Version 1)',
  description='Binary Classification Model with Tensorflow Deep Learning and Connected Pre-processing Layers'
)
def binary_classifier_earnings(
    pipeline = 'Tensorflow DL model with Integrated Preprocessing (Version 1)',
    framework = 'Tensorflow'
    ):
    # Step 1: read the source data and stage the train/test/validation splits.
    ingest_task = read_transform_comp()
    split_paths = ingest_task.outputs

    # Step 2: train the model on the staged splits.
    training_task = train_comp(
        split_paths['training_data'],
        split_paths['test_data'],
        split_paths['validation_data'],
    )

    # Step 3: evaluate the trained model on the test split and record the
    # metrics together with the pipeline and framework names.
    evaluation_task = evaluate_validate_comp(
        training_task.outputs['model_path'],
        split_paths['test_data'],
        pipeline,
        framework,
    )

    # Give every step a zero-length cache staleness window ("P0D").
    for task in (ingest_task, training_task, evaluation_task):
        task.execution_options.caching_strategy.max_cache_staleness = "P0D"



### Arguments

In [479]:
# No overrides are supplied, so the pipeline function's default `pipeline`
# and `framework` parameter values apply.
arguments = {}

### Run Pipeline

In [480]:
# Compile the pipeline function and submit it as a run under the experiment.
# NOTE: EXPERIMENT_NAME is assigned in the "Create Experiment" section further
# down this file; that cell has to be executed before this one.
client.create_run_from_pipeline_func(binary_classifier_earnings, arguments=arguments, experiment_name=EXPERIMENT_NAME)

RunPipelineResult(run_id=ec1b4a70-cf21-4143-b38e-4bb9429c3146)

### Create Experiment

In [236]:
# Name of the experiment that pipeline runs are grouped under; it is passed
# both to create_experiment and to create_run_from_pipeline_func.
EXPERIMENT_NAME = 'Earnings per Year - Binary Classifier'

In [237]:
# Get or create an experiment with the configured name. The returned object
# is not used again in the cells shown here; the run is submitted by name.
experiment = client.create_experiment(EXPERIMENT_NAME)

2021-03-28 11:05:56:INFO:Creating experiment Earnings per Year - Binary Classifier.
