# In [8]:

import kfp.dsl as dsl
import yaml
from kubernetes import client as k8s


@dsl.pipeline(
    name='Image Caption',
    description='A pipeline demonstrating reproducible steps for image caption',
)
def image_caption_pipeline(
        dataset_path="/mnt/ms-coco",
        num_examples=30000,
        epochs=20,
        training_batch_size=64,
        hidden_state_size=512,
        vocab_size=5000,
        embedding_dim=256,
        preprocessing_batch_size=16,
        preprocessing_output_dir='default',
        tokenizing_output_dir='default',
        training_output_dir='default',
        validation_output_dir='default',
):
    """Define the image-caption pipeline: download, preprocess, tokenize, train.

    A single volume is created and mounted at ``/mnt`` in the download
    step; every later step mounts the ``pvolume`` handed on by the step
    before it, so the four containers form a linear chain over one shared
    mount.

    Args:
        dataset_path: Accepted but not referenced by any step below.
        num_examples: Accepted but not referenced by any step below.
        epochs: Forwarded to the train step as ``--epochs``.
        training_batch_size: Forwarded to the train step as ``--batch-size``.
        hidden_state_size: Accepted but not referenced by any step below.
        vocab_size: Accepted but not referenced by any step below.
        embedding_dim: Accepted but not referenced by any step below.
        preprocessing_batch_size: Forwarded to the preprocess step as
            ``--batch-size``.
        preprocessing_output_dir: Accepted but not referenced by any step
            below.
        tokenizing_output_dir: Accepted but not referenced by any step below.
        training_output_dir: Accepted but not referenced by any step below.
        validation_output_dir: Accepted but not referenced by any step below.
    """
    entry_script = "/microservice/pipeline_step.py"
    mount_point = "/mnt"

    def python_step(step_name, image, upstream_volume, *cli_flags):
        # Every step runs the same in-image entry script with `python` and
        # differs only in its name, image, extra flags and incoming volume.
        return dsl.ContainerOp(
            name=step_name,
            image=image,
            command="python",
            arguments=[entry_script, *cli_flags],
            pvolumes={mount_point: upstream_volume},
        )

    # NOTE(review): the claim is fixed at 1Gi; confirm that is enough for
    # the data the downloader writes under /mnt.
    shared_claim = dsl.VolumeOp(
        name='my-pvc',
        resource_name="my-pvc",
        modes=["ReadWriteMany"],
        size="1Gi",
    )

    downloader = python_step(
        'data_downloader',
        'chaowen/image_caption_data_downloader:latest',
        shared_claim.volume,
    )

    preprocessor = python_step(
        'preproces',
        'chaowen/img_caption_preprocess:latest',
        downloader.pvolume,
        "--batch-size", preprocessing_batch_size,
    )

    tokenizer = python_step(
        'tokenize',
        'chaowen/img_caption_tokenize:latest',
        preprocessor.pvolume,
    )

    trainer = python_step(
        'train',
        'chaowen/img_caption_train:latest',
        tokenizer.pvolume,
        "--epochs", epochs,
        "--batch-size", training_batch_size,
    )

    # Only the training container is forced to re-pull its image; the other
    # steps do not set a pull policy here.
    trainer.container.image_pull_policy = "Always"


#     try:
#         seldon_config = yaml.load(open("../deploy_pipeline/seldon_production_pipeline.yaml"))
#     except:
#         # If this file is run from the project core directory 
#         seldon_config = yaml.load(open("deploy_pipeline/seldon_production_pipeline.yaml"))

#     deploy_step = dsl.ResourceOp(
#         name="seldondeploy",
#         k8s_resource=seldon_config,
#         attribute_outputs={"name": "{.metadata.name}"})

#     deploy_step.after(predict_step)

if __name__ == '__main__':
    # Script entry point: compile the pipeline function into a package
    # archive written to the current working directory.
    import kfp.compiler as compiler

    output_package = 'image_caption_volume.tar.gz'
    pipeline_compiler = compiler.Compiler()
    pipeline_compiler.compile(image_caption_pipeline, output_package)
