# Install Kubeflow Pipelines SDK

In [44]:
# Install the SDK
!pip3 install 'kfp>=0.1.31.2' --quiet

In [45]:
!which dsl-compile  # Check that the install succeeded: prints the path of the dsl-compile executable

/usr/local/bin/dsl-compile


# Build the Components

In [46]:
import kfp
import kfp.components as comp

In [47]:
# Mount your Google Drive folder on Colab at /content/gdrive.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [48]:
# Directory on the mounted Google Drive where the local train/predict runs
# write their outputs (saved model, pickled test data, result file).
out_dir = "/content/gdrive/My Drive/"

In [49]:
def train(data_path, model_file):
    """Train a binary classifier on the campaign-finance data and save it.

    Downloads the dataset CSV, makes an 80/20 train/test split, fits a small
    dense Keras network and writes two artefacts into ``data_path``:

    * the trained model, saved under the name ``model_file``;
    * a pickle named ``test_data`` holding the ``(X_test, y_test)`` tuple,
      which the predict step loads.

    All imports are kept inside the function because it is also wrapped with
    ``func_to_container_op`` and therefore has to be self-contained.

    Args:
        data_path: Directory the model and the pickled test split are written to.
        model_file: File name of the saved model inside ``data_path``.

    Returns:
        None. The results are only available on disk.
    """
    import pickle
    import pandas as pd
    from sklearn.model_selection import train_test_split
    import tensorflow as tf

    # One seed for both the split and TensorFlow so that weight initialisation
    # and shuffling are repeatable between runs.
    seed = 11
    tf.random.set_seed(seed)

    data_url = "https://raw.githubusercontent.com/Fitzpatrique/stage-f-09-campaign-finance/master/data/new_project_data2.csv"
    df = pd.read_csv(data_url)

    feature_columns = [
        'can_off_dis', 'can_zip', 'ind_con', 'net_ope_exp', 'tot_con',
        'tot_dis', 'net_con', 'ope_exp', 'tot_rec', 'can_off_id', 'can_nam_id',
        'can_off_sta_id', 'can_par_aff_id', 'can_inc_cha_ope_sea_id',
        'can_cit_id', 'can_sta_id', 'cov_dur',
    ]
    X = df[feature_columns]
    y = df[['winner_id']]

    # Perform train test split on the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed)

    # Define the model. Each sample is a flat vector with one value per feature,
    # so the declared input shape is (n_features,); the previous (1, 17) described
    # a 3-D batch that does not match the 2-D data passed to fit().
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(len(feature_columns),)),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    num_epochs = 150

    # NOTE(review): the held-out test split doubles as validation data here.
    model.fit(X_train, y_train, epochs=num_epochs,
              validation_data=(X_test, y_test))

    # Save the model to the designated location
    model.save(f'{data_path}/{model_file}')

    # Save the test_data as a pickle file to be used by the predict component.
    with open(f'{data_path}/test_data', 'wb') as f:
        pickle.dump((X_test, y_test), f)
  

In [50]:
# Run the training step locally; the model and pickled test data are written to out_dir.
# NOTE: train() has no return statement, so `classifier` is bound to None here;
# the trained model is only available from the saved file.
classifier = train(out_dir, "model")

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [51]:
def predict(data_path, model_file):
    """Load the saved model, predict on the stored test split and save the result.

    Reads the Keras model ``model_file`` and the ``test_data`` pickle from
    ``data_path``, thresholds the predicted probabilities at 0.5 and writes the
    predictions next to the actual labels into ``result.txt`` in the same
    directory.

    All imports are kept inside the function because it is also wrapped with
    ``func_to_container_op`` and therefore has to be self-contained.

    Args:
        data_path: Directory containing the saved model and ``test_data``;
            ``result.txt`` is written here as well.
        model_file: File name of the saved model inside ``data_path``.
    """
    import pickle
    from tensorflow import keras

    # Load the saved Keras model
    classifier = keras.models.load_model(f'{data_path}/{model_file}')

    # Load and unpack the test_data.
    # NOTE: pickle.load can execute arbitrary code; only use it on the file
    # written by the train step, never on data from an untrusted source.
    with open(f'{data_path}/test_data', 'rb') as f:
        test_data = pickle.load(f)
    # Separate the X_test from y_test.
    X_test, y_test = test_data

    # make predictions.
    y_pred = classifier.predict(X_test)

    # create a threshold
    y_pred = (y_pred > 0.5)

    # Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and
    # removed in 1.24, where it raises AttributeError.
    with open(f'{data_path}/result.txt', 'w') as result:
        result.write(" Prediction: {}, Actual: {} ".format(y_pred, y_test.astype(bool)))

    print('Prediction has be saved successfully!')

In [52]:
# Run the prediction step locally against the model and test data saved in out_dir.
predict(out_dir, "model")

Prediction has be saved successfully!


In [53]:
# Create train and predict lightweight components.
# train() imports pandas and scikit-learn, and predict() unpickles the pandas
# objects written by train(); the TensorFlow base image is not guaranteed to
# provide either package, so they are installed explicitly when the container starts.
train_op = comp.func_to_container_op(
    train,
    base_image="tensorflow/tensorflow:latest-gpu-py3",
    packages_to_install=["pandas", "scikit-learn"],
)
predict_op = comp.func_to_container_op(
    predict,
    base_image="tensorflow/tensorflow:latest-gpu-py3",
    packages_to_install=["pandas"],
)

# Build a Kubeflow Pipeline

In [54]:
import kfp.dsl as dsl
# Pipeline: create a shared volume, train the model, run the predictions and
# finally print the stored result. Every step mounts the same volume at
# `data_path`, and each step takes the volume handle from the previous step.
@dsl.pipeline(
    name='Campaign finance pipeline',
    description='A classification pipeline that performs predictions on electoral results.'
)
def camp_pipeline(
    data_path: str,
    model_file: str
):
    # Volume used to pass the model and test data from one step to the next.
    shared_volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Training step, with the freshly created volume mounted at data_path.
    train_task = train_op(data_path, model_file)
    train_task = train_task.add_pvolumes({data_path: shared_volume_op.volume})

    # Prediction step, mounting the volume as left by the training step.
    predict_task = predict_op(data_path, model_file)
    predict_task = predict_task.add_pvolumes({data_path: train_task.pvolume})

    # Final step: print the result file written by the prediction step.
    dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: predict_task.pvolume},
        arguments=['cat', f'{data_path}/result.txt'],
    )

In [55]:
# Value passed as the pipeline's `data_path` argument: the path at which the
# shared volume is mounted in every step.
DATA_PATH = '/mnt'
# Value passed as the pipeline's `model_file` argument: a file name that is
# joined onto data_path when saving/loading (despite the name, not a full path).
MODEL_PATH='churn_classifier.h5'

In [56]:
# Alias for the pipeline function; the compile step below refers to it by this name.
pipeline_func = camp_pipeline

In [57]:
experiment_name = 'campaign_finance_kubeflow'
run_name = f'{pipeline_func.__name__} run'

# Values for the two pipeline parameters.
arguments = dict(data_path=DATA_PATH, model_file=MODEL_PATH)

# Compile the pipeline into a compressed package named after the experiment.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')