# Classification Model Pipeline (Earnings Model) - AI Platform Pipelines

### Kubeflow workflow orchestration and TensorFlow model creation

In [None]:
!pip3 install kfp

In [6]:
#import libraries
import kfp
from typing import NamedTuple
from google.cloud import bigquery
import logging
import datetime
import pandas as pd
import gcsfs

In [3]:
import tensorflow
from tensorflow.python.lib.io import file_io

In [4]:
tensorflow.__version__

'2.3.2'

In [7]:
def read_from_storage(raw_storage_input) -> NamedTuple('step_one_output', [('rows', float),
                                                                           ('step_one_output_data', str)
                                                                          ]):
    """Stage the raw CSV in the project bucket and report how many rows it has.

    Args:
        raw_storage_input: Path or URL of the raw CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(rows, step_one_output_data)`` tuple: the number of rows read and
        the ``gs://`` location the staged copy was written to.
    """
    # Dependencies are installed and imported inside the body so the function
    # is self-contained when it is turned into a pipeline component.
    import subprocess
    import sys
    for package in ('fs-gcsfs', 'gcsfs', 'pandas'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    import pandas as pd
    import gcsfs

    staging_path = 'gs://crazy-hippo-01/dataset/census_step_one.csv'

    raw_frame = pd.read_csv(raw_storage_input)
    row_count = len(raw_frame)

    # Persist an untouched copy (header kept, index dropped) for the next step.
    raw_frame.to_csv(staging_path, index=False, header=True)

    return (row_count, staging_path)
    
    

In [9]:
# Wrap the step-one function as a KFP container component. The name
# `read_from_storage` is rebound to the result of func_to_container_op, and
# `output_component_file` names the file the component definition is saved to.
read_from_storage = kfp.components.func_to_container_op(read_from_storage, 
  output_component_file='./pipeline-components/step_1_read.component')

In [10]:
def clean_data(step_one_output_data, raw_rows) -> NamedTuple('step_two_output', [('clean_rows', int),
                                                                                 ('data_preparation_data', str),
                                                                                 ('data_validation_artifact', str)
                                                                                ]):
    """Drop rows without a ``workclass`` value and keep only the modelling columns.

    Args:
        step_one_output_data: Location of the staged CSV written by step one.
        raw_rows: Row count reported by step one. It is accepted so the value
            flows through the pipeline but is not used in this function.

    Returns:
        A ``(clean_rows, data_preparation_data, data_validation_artifact)``
        tuple: the number of rows left after removing null ``workclass``
        values, the ``gs://`` path of the cleaned CSV (with header) and the
        ``gs://`` path of the validation artifact (same data, no header).
    """
    # Dependencies are installed and imported inside the body so the function
    # is self-contained when it is turned into a pipeline component.
    import sys, subprocess
    for package in ('fs-gcsfs', 'gcsfs', 'pandas'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    import pandas as pd
    import gcsfs

    data_preparation_data = 'gs://crazy-hippo-01/dataset/census_step_two.csv'
    data_validation_artifact = 'gs://crazy-hippo-01/dataset/artifact_validation.csv'
    kept_columns = ['age', 'workclass', 'gender', 'occupation', 'education_num', 'marital_status',
                    'relationship', 'capital_gain', 'income_bracket']

    data = pd.read_csv(step_one_output_data)

    # Keep rows whose workclass is present. `notna()` builds the mask directly
    # instead of negating a boolean Series with unary minus, which NumPy does
    # not support for boolean data.
    non_null_rows = data[data['workclass'].notna()]
    rows_clean_data = int(len(non_null_rows))

    # Restrict the frame to the features and the label used downstream.
    final_df = non_null_rows[kept_columns]

    # Same data twice: with a header for the next step, without for validation.
    final_df.to_csv(data_preparation_data, index=False, header=True)
    final_df.to_csv(data_validation_artifact, index=False, header=False)

    return (rows_clean_data, data_preparation_data, data_validation_artifact)
    

In [12]:
# Wrap the step-two function as a KFP container component. The name
# `clean_data` is rebound to the result of func_to_container_op, and
# `output_component_file` names the file the component definition is saved to.
clean_data = kfp.components.func_to_container_op(clean_data, 
  output_component_file='./pipeline-components/step_2_clean.component')

In [13]:
def data_preparation(clean_rows, data_preparation_data, data_validation_artifact) -> NamedTuple('step_three_output', 
                                                                 [('train_rows', int), 
                                                                  ('test_rows', int), 
                                                                  ('columns_number', int),
                                                                  ('x_train', str),
                                                                  ('x_test', str),
                                                                  ('y_train', str),
                                                                  ('y_test', str),
                                                                  ('x_val', str),
                                                                  ('y_val', str)
                                                                 ]):
    """Encode, scale and split the cleaned census data into train/validation/test CSVs.

    Args:
        clean_rows: Row count reported by the cleaning step (not used here).
        data_preparation_data: Location of the cleaned CSV to prepare.
        data_validation_artifact: Location of the validation artifact written
            by the cleaning step (not used here).

    Returns:
        A tuple ``(train_rows, test_rows, columns_number, x_train, x_test,
        y_train, y_test, x_val, y_val)``: the number of training rows, test
        rows and feature columns after encoding, followed by the ``gs://``
        paths of the six CSV files that were written.
    """
    # Dependencies are installed and imported inside the body so the function
    # is self-contained when it is turned into a pipeline component.
    import sys, subprocess
    for package in ('fs-gcsfs', 'gcsfs', 'pandas', 'scikit-learn'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    import pandas as pd
    import gcsfs
    from sklearn.model_selection import train_test_split
    from sklearn import preprocessing

    categorical_columns = ['workclass', 'gender', 'occupation', 'marital_status', 'relationship']
    numeric_columns = ['age', 'education_num', 'capital_gain']
    feature_columns = ['age', 'workclass', 'gender', 'occupation', 'education_num',
                       'marital_status', 'relationship', 'capital_gain']

    # Import data
    data = pd.read_csv(data_preparation_data)

    # Separate features (X) and label (y)
    X = data[feature_columns]
    y = data[['income_bracket']]

    # One-hot encode the categorical columns. Naming them explicitly keeps each
    # prefix tied to its column instead of relying on dtype inference matching
    # the length and order of the prefix list.
    X = pd.get_dummies(X, columns=categorical_columns, prefix=categorical_columns)

    # Scale the numeric columns to [0, 1].
    # NOTE(review): the scaler is fitted on the full dataset before splitting,
    # so validation/test rows influence the scaling range. Consider fitting on
    # the training split only.
    scaler = preprocessing.MinMaxScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

    # 80/20 train/test split, then 80/20 train/validation split of the remainder.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)
    print(X_val.shape)
    print(y_val.shape)

    # Sizes reported to downstream steps
    train_rows = int(len(X_train))
    test_rows = int(len(X_test))
    columns_number = int(len(X_train.columns))

    # Write every split to the bucket and remember where each one went.
    splits = {
        'x_train': X_train,
        'x_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'x_val': X_val,
        'y_val': y_val,
    }
    paths = {}
    for split_name, frame in splits.items():
        paths[split_name] = 'gs://crazy-hippo-01/dataset/{}.csv'.format(split_name)
        frame.to_csv(paths[split_name], index=False, header=True)

    return (train_rows, test_rows, columns_number,
            paths['x_train'], paths['x_test'],
            paths['y_train'], paths['y_test'],
            paths['x_val'], paths['y_val'])

    

In [14]:
# Wrap the step-three function as a KFP container component. The name
# `data_preparation` is rebound to the result of func_to_container_op, and
# `output_component_file` names the file the component definition is saved to.
data_preparation = kfp.components.func_to_container_op(data_preparation, 
  output_component_file='./pipeline-components/step_3_prep.component')

In [46]:
def model_training(EPOCHS : int,
                     BATCH_SIZE : int,
                     VERBOSE : int,
                     NB_CLASSES : int,
                     N_HIDDEN :int,
                     INPUT_SHAPE : int,
                     x_train : str, 
                     x_test : str, 
                     y_train : str, 
                     y_test : str,
                     x_val : str,
                     y_val : str,
                     train_rows : int, 
                     test_rows : int) -> NamedTuple('step_four_output', [('test_loss', float), 
                                                                          ('test_acc', float),
                                                                          ('export_path', str),
                                                                          ('mlpipeline_ui_metadata', 'UI_metadata')]):  
    """Train a small Keras classifier on the prepared splits and evaluate it.

    Args:
        EPOCHS: Maximum number of training epochs (early stopping may end sooner).
        BATCH_SIZE: Batch size used for the training, validation and test datasets.
        VERBOSE: Verbosity level forwarded to ``model.fit``.
        NB_CLASSES: Accepted for pipeline compatibility; not used by this function.
        N_HIDDEN: Accepted for pipeline compatibility; not used by this function.
        INPUT_SHAPE: Number of feature columns; sets the width of the input layer.
        x_train, x_test, y_train, y_test, x_val, y_val: Locations of the CSV
            files holding the feature and label splits.
        train_rows: Accepted for pipeline compatibility; not used by this function.
        test_rows: Accepted for pipeline compatibility; not used by this function.

    Returns:
        A ``step_four_output`` named tuple with the test loss, the test
        accuracy, the ``gs://`` path the model was saved to, and the
        TensorBoard UI metadata serialized as a JSON string.
    """
    # Dependencies are installed and imported inside the body so the function
    # is self-contained when it is turned into a pipeline component.
    import sys, subprocess
    for package in ('fs-gcsfs', 'gcsfs', 'pandas', 'scikit-learn', 'tensorflow==2.0.0b0'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', package])
    from collections import namedtuple
    import datetime
    import json
    import gcsfs
    import pandas as pd
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from tensorflow.python.lib.io import file_io

    print('imports done')

    print(tf.__version__)

    print('Loading data...')
    train_features = pd.read_csv(x_train)
    test_features = pd.read_csv(x_test)
    train_labels = pd.read_csv(y_train)
    test_labels = pd.read_csv(y_test)
    val_features = pd.read_csv(x_val)
    val_labels = pd.read_csv(y_val)
    print('Data uploaded')

    print(test_features.shape, test_labels.shape)

    # Create shuffled, batched TensorFlow datasets for each split
    print('Creating tensorflow datasets')
    training_dataset = tf.data.Dataset.from_tensor_slices((train_features.values, train_labels.values))
    test_dataset = tf.data.Dataset.from_tensor_slices((test_features.values, test_labels.values))
    validation_dataset = tf.data.Dataset.from_tensor_slices((val_features.values, val_labels.values))

    training_dataset = training_dataset.shuffle(len(train_features)).batch(BATCH_SIZE)
    test_dataset = test_dataset.shuffle(len(test_features)).batch(BATCH_SIZE)
    validation_dataset = validation_dataset.shuffle(len(val_features)).batch(BATCH_SIZE)

    # TensorBoard log location
    log_dir = "gs://crazy-hippo-01/dataset/logs/"

    # Build the model. The input width follows INPUT_SHAPE (the column count
    # reported by data preparation) rather than a fixed number, so a change in
    # the one-hot encoded columns does not break the first layer.
    # NOTE(review): the hidden width (5) and output width (2) are fixed here;
    # N_HIDDEN and NB_CLASSES are not applied.
    model = tf.keras.Sequential([
        layers.Dense(5, input_shape=(int(INPUT_SHAPE),), activation='relu'),
        layers.Dense(2, name='dense_layer_output', activation='sigmoid')
      ])

    # The output layer already applies a sigmoid, so the loss must treat the
    # predictions as probabilities (from_logits=False), not as raw logits.
    model.compile(optimizer=tf.keras.optimizers.Adam(0.01),
                  loss=keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])

    # Train model; logs go to a per-day subdirectory of log_dir.
    model.fit(training_dataset,
              epochs=EPOCHS,
              verbose=VERBOSE,
              validation_data=validation_dataset,
              callbacks=[
                  tf.keras.callbacks.TensorBoard(log_dir=log_dir + datetime.datetime.now().date().__str__()),
                  tf.keras.callbacks.EarlyStopping(patience=5, monitor='val_loss')
              ])

    # Metadata that points the pipeline UI at the TensorBoard logs
    metadata = {'outputs' : [{
        'type': 'tensorboard',
        'source': log_dir,
        }]
    }
    metadata_json = json.dumps(metadata)

    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        f.write(metadata_json)

    # Save model in bucket
    export_path = 'gs://crazy-hippo-01/dataset/census_tf_model'
    model.save(export_path)

    print('Model Saved.')

    # Evaluate on the whole test dataset (no step limit) so the reported
    # metrics cover every test row rather than a fixed number of batches.
    test_loss, test_acc = model.evaluate(test_dataset)

    test_loss = float(test_loss)
    test_acc = float(test_acc)

    # One value per declared output; the UI metadata must be a JSON string.
    step_four_output = namedtuple(
        'step_four_output',
        ['test_loss', 'test_acc', 'export_path', 'mlpipeline_ui_metadata'])
    return step_four_output(test_loss, test_acc, export_path, metadata_json)


In [47]:
# Wrap the step-four function as a KFP container component. The name
# `model_training` is rebound to the result of func_to_container_op, and
# `output_component_file` names the file the component definition is saved to.
model_training = kfp.components.func_to_container_op(model_training, 
  output_component_file='./pipeline-components/step_4_training.component')

In [50]:
def model_evaluation(test_loss, test_acc, export_path) -> NamedTuple('MyDivmodOutput', [('mlpipeline_metrics', 'Metrics')]):
    """Publish the test loss and accuracy as pipeline metrics.

    Args:
        test_loss: Test loss from the training step. The parameter is untyped,
            so the value may arrive as a string; it is converted with ``float``.
        test_acc: Test accuracy from the training step; converted with ``float``.
        export_path: Location of the saved model (not used by this function).

    Returns:
        A one-field named tuple whose ``mlpipeline_metrics`` value is the
        metrics dictionary serialized as a JSON string.

    Raises:
        ValueError: If ``test_loss`` or ``test_acc`` cannot be converted to float.
    """
    # Imported inside the body so the function is self-contained when it is
    # turned into a pipeline component. Only the standard library is needed.
    import json
    from collections import namedtuple

    # Creating Metrics
    print("Creating JSON dump")
    metrics = {
        'metrics': [
            {
                'name': 'loss-value',  # Column name shown for the metric.
                'numberValue': float(test_loss),  # Must be numeric.
                'format': "RAW",  # Displayed as a raw number.
            },
            {
                'name': 'accuracy-score',
                'numberValue': float(test_acc),
                'format': "PERCENTAGE",  # Displayed as a percentage.
            },
        ]
    }
    metrics_json = json.dumps(metrics)

    # Also write the metrics to the local file. The returned value carries the
    # same content, so a failed write is reported instead of aborting the step.
    try:
        with open('/mlpipeline-metrics.json', 'w') as f:
            f.write(metrics_json)
    except OSError as err:
        print("Could not write /mlpipeline-metrics.json: {}".format(err))

    print("JSON dump done")

    # Exactly one value for the single declared output, already JSON-encoded.
    evaluation_output = namedtuple('MyDivmodOutput', ['mlpipeline_metrics'])
    return evaluation_output(metrics_json)
    

In [51]:
# Wrap the step-five function as a KFP container component. The name
# `model_evaluation` is rebound to the result of func_to_container_op, and
# `output_component_file` names the file the component definition is saved to.
model_evaluation = kfp.components.func_to_container_op(model_evaluation, 
  output_component_file='./pipeline-components/step_5_evaluation.component')

## Pipeline definition

In [52]:
@kfp.dsl.pipeline(
  name='Classification Pipeline - Earnings Prediction',
  description='Pipeline will read files from GCS, perform data cleaning and preparation and finally train and evaluate the model.'
)
def tensorflow_census(
        raw_storage_input_data = 'gs://crazy-hippo-01/dataset/census_train.csv',
        cleaning_input_data = 'gs://crazy-hippo-01/dataset/census_step_one.csv',
        prepp_input_data = 'gs://crazy-hippo-01/dataset/census_step_two.csv',
        EPOCHS : int = 20,
        BATCH_SIZE : int = 32,
        VERBOSE : int = 1,
        NB_CLASSES : int = 2,
        N_HIDDEN : int = 10,
        VALIDATION_SPLIT : float = 0.2
    ):
    """Chain the five components: read, clean, prepare, train, evaluate.

    Only ``raw_storage_input_data`` and the training hyper-parameters
    (``EPOCHS``, ``BATCH_SIZE``, ``VERBOSE``, ``NB_CLASSES``, ``N_HIDDEN``)
    are forwarded to a step. ``cleaning_input_data``, ``prepp_input_data``
    and ``VALIDATION_SPLIT`` are declared but not passed to any step.
    """
    def _no_cache(task):
        # A maximum cache staleness of "P0D" is set on every step.
        task.execution_options.caching_strategy.max_cache_staleness = "P0D"
        return task

    # Step 1: stage the raw CSV.
    read_task = _no_cache(read_from_storage(raw_storage_input_data))

    # Step 2: clean the staged data.
    clean_task = _no_cache(clean_data(
        read_task.outputs['step_one_output_data'],
        read_task.outputs['rows'],
    ))
    clean_task.after(read_task)

    # Step 3: encode, scale and split.
    prep_task = _no_cache(data_preparation(
        clean_task.outputs['clean_rows'],
        clean_task.outputs['data_preparation_data'],
        clean_task.outputs['data_validation_artifact'],
    ))
    prep_task.after(clean_task)

    # Step 4: train the model; the column count feeds the INPUT_SHAPE argument.
    prep_out = prep_task.outputs
    train_task = _no_cache(model_training(
        EPOCHS,
        BATCH_SIZE,
        VERBOSE,
        NB_CLASSES,
        N_HIDDEN,
        prep_out['columns_number'],
        prep_out['x_train'],
        prep_out['x_test'],
        prep_out['y_train'],
        prep_out['y_test'],
        prep_out['x_val'],
        prep_out['y_val'],
        prep_out['train_rows'],
        prep_out['test_rows'],
    ))
    train_task.after(prep_task)

    # Step 5: publish the evaluation metrics.
    eval_task = _no_cache(model_evaluation(
        train_task.outputs['test_loss'],
        train_task.outputs['test_acc'],
        train_task.outputs['export_path'],
    ))
    eval_task.after(train_task)

## Compile Pipeline & Run Pipeline

In [53]:
# Compile the pipeline function into a package file that is submitted in the
# next cell under the same file name.
kfp.compiler.Compiler().compile(tensorflow_census,  
  'earnings-pipeline-ver2.zip')

In [54]:
# Build a run name from the current local time, e.g. "tensorflow_earnings.2021 Jan 05 14:30".
time = datetime.datetime.now()
run_id = 'tensorflow_earnings.{}'.format(time.strftime("%Y %b %d %H:%M"))
# Connect to the pipelines endpoint (an alternative host is kept commented out).
#client = kfp.Client(host='6442b053d2d40df7-dot-us-east1.pipelines.googleusercontent.com')
client = kfp.Client(host='18bbb292bf653151-dot-us-central2.pipelines.googleusercontent.com')
# Create the experiment, then submit the compiled package as a run under it.
classification_experiment = client.create_experiment(name='Earnings Prediction Pipeline', description='ML pipeline with Tensorflow')
my_run = client.run_pipeline(classification_experiment.id, run_id, 'earnings-pipeline-ver2.zip')