In [1]:
# Importing required Libraries.

In [1]:
# Import libraries

In [1]:
import getpass
import os

import kfp
import kfp.components as comp
import kfp.dsl as dsl
import requests

from kfp.components import InputPath, OutputPath, create_component_from_func

In [2]:
# Download Dataset, Preprocess Data, Load Data

In [3]:
def download_data(load_data_path: comp.OutputPath(str())):
    """Download the IMDB review archive and save raw train/validation/test datasets.

    The archive is fetched into ``load_data_path`` (skipped when
    ``aclImdb_v1.tar.gz`` is already present), the unused ``train/unsup``
    folder is deleted, and three batched ``tf.data`` datasets are written to
    ``raw_train``, ``raw_val`` and ``raw_test`` below ``load_data_path``.

    Imports stay inside the function because it is turned into a standalone
    component with ``create_component_from_func``.

    Args:
        load_data_path: Output directory supplied by Kubeflow Pipelines.
    """
    import os
    import shutil
    import tensorflow as tf

    # The directory paths are needed by the dataset builders below regardless
    # of whether the output directory already existed, so compute them first.
    os.makedirs(load_data_path, exist_ok=True)

    dataset_file = os.path.join(load_data_path, 'aclImdb_v1.tar.gz')
    dataset_dir = os.path.join(load_data_path, 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    test_dir = os.path.join(dataset_dir, 'test')

    # Load the data: IMDB movie review sentiment classification.
    if not os.path.exists(dataset_file):
        url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
        tf.keras.utils.get_file("aclImdb_v1", url,
                                untar=True, cache_dir=load_data_path,
                                cache_subdir='')

    # Remove the unused directory so that only the labelled class folders
    # remain under `train`. Checked on every run to keep the step idempotent.
    remove_dir = os.path.join(train_dir, 'unsup')
    if os.path.isdir(remove_dir):
        shutil.rmtree(remove_dir)

    # Build the raw datasets. The same seed is used for both subsets so the
    # training/validation split is consistent.
    batch_size = 32
    raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset="training",
        seed=1337,
    )
    raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
        train_dir,
        batch_size=batch_size,
        validation_split=0.2,
        subset="validation",
        seed=1337,
    )
    raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
        test_dir, batch_size=batch_size
    )

    tf.data.Dataset.save(raw_train_ds, f'{load_data_path}/raw_train')
    tf.data.Dataset.save(raw_val_ds, f'{load_data_path}/raw_val')
    tf.data.Dataset.save(raw_test_ds, f'{load_data_path}/raw_test')

In [4]:
# Wrap `download_data` as a reusable component and write its YAML definition.
# `pathlib` ships with Python 3, so the obsolete PyPI backport is not installed.
download_op = kfp.components.create_component_from_func(
    download_data,
    output_component_file='load_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow'],
)

In [5]:
# Vectorize Data

In [27]:
def vectorize_data(max_features: int, embedding_type: str, embedding_dim: int, sequence_length: int,
                 load_data_path: comp.InputPath(str()),
                 vectorize_data_path: comp.OutputPath(str())):
    """Turn the raw text datasets into integer sequences and persist the vectorizer.

    Loads ``raw_train``/``raw_val``/``raw_test`` from ``load_data_path``, adapts
    a ``TextVectorization`` layer on the training text, saves the vectorized
    datasets as ``train``/``val``/``test`` under ``vectorize_data_path`` and
    pickles the layer state to ``layer.pkl`` in the same directory.

    Args:
        max_features: Vocabulary size limit (``max_tokens`` of the layer).
        embedding_type: Not used by this step.
        embedding_dim: Not used by this step.
        sequence_length: Fixed length each sequence is padded/truncated to.
        load_data_path: Input directory holding the raw datasets.
        vectorize_data_path: Output directory supplied by Kubeflow Pipelines.
    """
    import os
    import pickle
    import tensorflow as tf

    from tensorflow.keras.layers import TextVectorization

    raw_train_ds = tf.data.Dataset.load(f'{load_data_path}/raw_train')
    raw_val_ds = tf.data.Dataset.load(f'{load_data_path}/raw_val')
    raw_test_ds = tf.data.Dataset.load(f'{load_data_path}/raw_test')

    # Build Vectorization Layer
    vectorization_layer = TextVectorization(
        standardize=None,
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # The vocabulary is learned from the training text only (labels dropped).
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorization_layer.adapt(text_ds)

    def vectorize_text(text, label):
        # Add a trailing axis so each review is passed as a single-column row.
        text = tf.expand_dims(text, -1)
        return vectorization_layer(text), label

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)

    # Do async prefetching / buffering of the data for best performance on GPU.
    train_ds = train_ds.cache().prefetch(buffer_size=10)
    val_ds = val_ds.cache().prefetch(buffer_size=10)
    test_ds = test_ds.cache().prefetch(buffer_size=10)

    # Create the output directory before writing anything into it.
    os.makedirs(vectorize_data_path, exist_ok=True)

    tf.data.Dataset.save(train_ds, f'{vectorize_data_path}/train')
    tf.data.Dataset.save(val_ds, f'{vectorize_data_path}/val')
    tf.data.Dataset.save(test_ds, f'{vectorize_data_path}/test')

    # 'config' and 'weights' are kept for existing readers of this file.
    # NOTE(review): recent TensorFlow releases appear not to expose the adapted
    # vocabulary through get_weights(); it is therefore stored explicitly too,
    # so the layer can be rebuilt with set_vocabulary(). Confirm on the
    # TensorFlow version actually installed in the image.
    layer_state = {
        'config': vectorization_layer.get_config(),
        'weights': vectorization_layer.get_weights(),
        'vocabulary': vectorization_layer.get_vocabulary(),
    }
    # Context manager guarantees the pickle file is flushed and closed.
    with open(f'{vectorize_data_path}/layer.pkl', "wb") as layer_file:
        pickle.dump(layer_state, layer_file)

In [28]:
# Component wrapper for `vectorize_data`; also emits the component YAML file.
vectorize_op = create_component_from_func(
    func=vectorize_data,
    output_component_file='vectorize_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow'],
)

In [29]:
# Embeddings

In [101]:
def embedding_data(max_features: int, sequence_length: int, embedding_type: str, embedding_dim: int,
                   vectorize_data_path: comp.InputPath(str()),
                   embedding_data_path: comp.OutputPath(str())):
    """Build the embedding front-end of the network and save it as ``model.h5``.

    Two variants are supported:

    * ``"standard"`` - a trainable ``Embedding`` layer.
    * ``"glove"`` - a frozen ``Embedding`` layer initialised from the
      ``glove.840B.300d`` vectors, aligned with the vocabulary of the pickled
      ``TextVectorization`` layer found in ``vectorize_data_path/layer.pkl``.

    Args:
        max_features: Number of rows of the embedding matrix (vocabulary size).
        sequence_length: Passed to the ``Embedding`` layer as ``input_length``.
        embedding_type: ``"standard"`` or ``"glove"``.
        embedding_dim: Width of the embedding vectors (must be 300 for GloVe).
        vectorize_data_path: Input directory holding ``layer.pkl``.
        embedding_data_path: Output directory supplied by Kubeflow Pipelines.

    Raises:
        ValueError: If ``embedding_type`` is unknown, or GloVe is requested with
            an ``embedding_dim`` other than 300.
    """
    import os
    import pickle
    import tempfile
    import zipfile

    import numpy as np
    import tensorflow as tf

    from tensorflow.keras import layers
    from tensorflow.keras.layers import TextVectorization

    glove_vector_size = 300  # dimensionality of the glove.840B.300d vectors

    def load_vectorization_layer():
        """Rebuild the TextVectorization layer pickled by the vectorize step."""
        # SECURITY: pickle.load can execute arbitrary code. The file is expected
        # to come from the upstream pipeline step only.
        with open(f'{vectorize_data_path}/layer.pkl', "rb") as layer_file:
            from_disk = pickle.load(layer_file)

        layer = TextVectorization.from_config(from_disk['config'])
        vocabulary = from_disk.get('vocabulary')
        if vocabulary is not None:
            # Preferred path when the pickle carries an explicit vocabulary.
            layer.set_vocabulary(vocabulary)
        else:
            # Legacy path: adapt on dummy data so the layer has weights to set.
            # NOTE(review): may not restore the vocabulary on recent TensorFlow.
            layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
            layer.set_weights(from_disk['weights'])
        return layer

    def tf_standard(max_features, embedding_dim, sequence_length):
        """Return a model mapping vocabulary indices to trainable embeddings."""
        inputs = tf.keras.Input(shape=(None,), dtype="int64")
        embeddings = layers.Embedding(input_dim=max_features,
                                      output_dim=embedding_dim,
                                      input_length=sequence_length)(inputs)
        return tf.keras.Model(inputs, embeddings)

    def download_glove():
        """Download and unpack the GloVe archive; return the path of the .txt file."""
        # Scratch space: the multi-gigabyte archive must not end up in the
        # component's output artifact.
        glove_path = os.path.join(tempfile.gettempdir(), 'glove')
        glove_dir = os.path.join(glove_path, 'glove.840B.300d')
        glove_file = os.path.join(glove_dir, 'glove.840B.300d.txt')

        if not os.path.exists(glove_file):
            url = "https://nlp.stanford.edu/data/glove.840B.300d.zip"
            os.makedirs(glove_path, exist_ok=True)
            # The archive is a zip, so it is extracted with zipfile rather than
            # get_file's untar option.
            archive = tf.keras.utils.get_file("glove.840B.300d.zip", url,
                                              cache_dir=glove_path,
                                              cache_subdir='')
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(glove_dir)

        return glove_file

    def load_glove_vectors(glove_file, vectorization_layer, max_features, embedding_dim):
        """Return the embedding matrix whose row ``i`` is the GloVe vector of
        vocabulary entry ``i`` (zeros for words without a pre-trained vector)."""
        voc = vectorization_layer.get_vocabulary()
        word_index = dict(zip(voc, range(len(voc))))

        if max_features is not None:
            vocab_len = max_features
        else:
            vocab_len = len(word_index) + 1
        embedding_matrix = np.zeros((vocab_len, embedding_dim))

        with open(glove_file, encoding="utf8") as f:
            for line in f:
                # Split from the right: some GloVe tokens contain spaces, so
                # only the last `embedding_dim` fields are numbers.
                word, *coefs = line.rstrip().rsplit(' ', embedding_dim)
                idx = word_index.get(word)
                # Only vectors of in-vocabulary words are kept, which avoids
                # holding the whole GloVe table in memory.
                if idx is not None and idx < vocab_len:
                    embedding_matrix[idx] = np.asarray([float(val) for val in coefs])

        return embedding_matrix

    def glove(max_features, embedding_dim, sequence_length, vectorization_layer):
        """Return a model mapping vocabulary indices to frozen GloVe embeddings."""
        if embedding_dim != glove_vector_size:
            raise ValueError(
                f"GloVe 840B vectors are {glove_vector_size}-dimensional, "
                f"got embedding_dim={embedding_dim}")

        glove_file = download_glove()
        embedding_matrix = load_glove_vectors(glove_file, vectorization_layer,
                                              max_features, embedding_dim)

        inputs = tf.keras.Input(shape=(None,), dtype="int64")
        embeddings = layers.Embedding(input_dim=max_features,
                                      output_dim=embedding_dim,
                                      input_length=sequence_length,
                                      trainable=False,
                                      weights=[embedding_matrix])(inputs)
        return tf.keras.Model(inputs, embeddings)

    def choose_embeddings(max_features, embedding_dim, sequence_length, embedding_type):
        """Dispatch on ``embedding_type``; fail loudly on unknown values."""
        if embedding_type == "glove":
            # The vectorizer is only needed to align GloVe with the vocabulary.
            return glove(max_features, embedding_dim, sequence_length,
                         load_vectorization_layer())
        if embedding_type == "standard":
            return tf_standard(max_features, embedding_dim, sequence_length)
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected 'glove' or 'standard'")

    embedding_layer = choose_embeddings(max_features, embedding_dim, sequence_length, embedding_type)

    # Create the output directory and save the embedding layer as a model.
    os.makedirs(embedding_data_path, exist_ok=True)
    embedding_layer.save(f'{embedding_data_path}/model.h5')

In [102]:
# Component wrapper for `embedding_data`; also emits the component YAML file.
embedding_op = create_component_from_func(
    func=embedding_data,
    output_component_file='embedding_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow', 'numpy', 'keras'],
)

In [86]:
# Build model

In [134]:
def build_model(hidden_layers: int, rec_units: int, dense_units: int, dropout: float,
                max_features: int, embedding_dim: int, sequence_length: int,
                vectorize_data_path: comp.InputPath(str),
                embedding_data_path: comp.InputPath(str),
                load_data_path: comp.InputPath(str()),
                model_path: comp.OutputPath(str())):
    """Assemble and compile the untrained classifier on top of the saved embedding.

    The embedding model is loaded from ``embedding_data_path/model.h5`` and
    extended with dropout, two bidirectional GRU layers, a dense layer and a
    single sigmoid output. The compiled model is written to
    ``model_path/model.h5``.

    Args:
        hidden_layers: Currently unused; the recurrent depth is fixed at two.
            NOTE(review): confirm whether this should drive the layer count.
        rec_units: Units of each GRU layer.
        dense_units: Units of the dense layer.
        dropout: Dropout rate applied after the embedding and the dense layer.
        max_features: Not used by this step.
        embedding_dim: Not used by this step.
        sequence_length: Not used by this step.
        vectorize_data_path: Not read by this step (kept for the interface).
        embedding_data_path: Input directory holding the embedding ``model.h5``.
        load_data_path: Not read by this step (kept for the interface).
        model_path: Output directory supplied by Kubeflow Pipelines.
    """
    import os
    import tensorflow as tf

    from tensorflow.keras import layers
    from tensorflow.keras.layers import GRU, Bidirectional

    # The datasets are not needed to define the architecture, so they are no
    # longer loaded here. `tf.keras` is used throughout so that loading matches
    # the API the embedding model was saved with.
    embedding_layer = tf.keras.models.load_model(f'{embedding_data_path}/model.h5')

    # Create Model
    x = layers.Dropout(dropout)(embedding_layer.layers[-1].output)

    x = Bidirectional(GRU(units=rec_units, activation="relu", return_sequences=True))(x)
    x = Bidirectional(GRU(units=rec_units, activation="relu", return_sequences=False))(x)

    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(dropout)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    model = tf.keras.Model(inputs=embedding_layer.inputs, outputs=predictions)

    # Compile the model with binary crossentropy loss and an adam optimizer.
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Create the output directory and save the compiled model.
    os.makedirs(model_path, exist_ok=True)
    model.save(f'{model_path}/model.h5')

In [135]:
# Wrap `build_model` as a reusable component and write its YAML definition.
# `pathlib` ships with Python 3, so the obsolete PyPI backport is not installed.
build_model_op = kfp.components.create_component_from_func(
    build_model,
    output_component_file='build_model_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow', 'numpy', 'keras'],
)

In [136]:
# Train model

In [137]:
def train_model(epochs: int, batch_size: int, es: bool,
                model_path: comp.InputPath(str()),
                vectorize_data_path: comp.InputPath(str()), 
                train_path: comp.OutputPath(str())):
    """Train the compiled model, evaluate it on the test set and save it.

    Args:
        epochs: Maximum number of training epochs.
        batch_size: Kept for interface compatibility but not forwarded to
            ``fit``: the loaded datasets are already batched, and Keras
            documents that ``batch_size`` must not be given for dataset input.
        es: When true, stop early on ``val_loss`` (patience 5) and restore the
            best weights.
        model_path: Input directory holding the compiled ``model.h5``.
        vectorize_data_path: Input directory holding the ``train``/``val``/
            ``test`` datasets.
        train_path: Output directory supplied by Kubeflow Pipelines; receives
            the trained ``model.h5``.
    """
    import os
    import tensorflow as tf
    from tensorflow.keras.callbacks import EarlyStopping

    model = tf.keras.models.load_model(f'{model_path}/model.h5')

    train_ds = tf.data.Dataset.load(f'{vectorize_data_path}/train')
    val_ds = tf.data.Dataset.load(f'{vectorize_data_path}/val')
    test_ds = tf.data.Dataset.load(f'{vectorize_data_path}/test')

    # `fit` expects a list of callbacks, so build one (empty when `es` is off).
    callbacks = []
    if es:
        callbacks.append(
            EarlyStopping(monitor="val_loss", patience=5, verbose=1, restore_best_weights=True)
        )

    # Fit the model using the training and validation datasets.
    model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=callbacks)

    # Evaluate the model on the test set.
    model.evaluate(test_ds)

    # Create the output directory and save the trained model.
    os.makedirs(train_path, exist_ok=True)
    model.save(f'{train_path}/model.h5')

In [138]:
# Component wrapper for `train_model`; also emits the component YAML file.
train_op = create_component_from_func(
    func=train_model,
    output_component_file='train_model_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow', 'keras'],
)

In [139]:
# Build final Model

In [140]:
def build_final_model(train_path: comp.InputPath(str()),
                      final_model_path: comp.OutputPath(str())):
    
    import os
    import keras
    import tensorflow as tf
    
    model = keras.models.load_model(f'{train_path}/model.h5')
    
    # A string input
    inputs = tf.keras.Input(shape=(1,), dtype="string")
    # Turn strings into vocab indices
    indices = vectorize_layer(inputs)
    # Turn vocab indices into predictions
    outputs = model(indices)

    # Our end to end model
    model = tf.keras.Model(inputs, outputs)
    model.compile(
        loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    
    #creating the preprocess directory
    os.makedirs(final_model_path, exist_ok = True)
    
    #saving the model 
    model.save(f'{final_model_path}/model.h5')

In [141]:
build_final_model_op = kfp.components.create_component_from_func(build_final_model,
                                                                 output_component_file='build_final_model_component.yaml',
                                                                 base_image="python:3.8",
                                                                 packages_to_install=['tensorflow', 'keras'])

In [142]:
# Play with model

In [143]:
def play_with_model(final_model_path: comp.InputPath(str())):
    
    import numpy as np
    import keras
    
    model = keras.models.load_model(f'{final_model_path}/model.h5')
    
    sample_text = 'Tesla is nice as shit'

    sentiment = model.predict(np.array([sample_text]))

    # If the prediction is >= 0.0, it is positive else it is negative.
    print(sentiment)

In [144]:
play_op = kfp.components.create_component_from_func(play_with_model,
                                                    base_image="python:3.8",
                                                    packages_to_install=['tensorflow', 'numpy', 'keras'])

In [145]:
@dsl.pipeline(
   name='Trial pipeline',
   description='An example pipeline that performs pd formation and plotting class distibution.'
)
def trial_pipeline(
                   epochs:int, 
                   batch_size:int, 
                   es:bool,
                   hidden_layers:int, 
                   rec_units:int, 
                   dense_units:int,
                   dropout:float,
                   max_features:int,
                   embedding_type: str,
                   embedding_dim: int,
                   sequence_length: int,
                   load_data_path: str,
                   vectorize_data_path:str,
                   embedding_data_path:str,
                   model_path:str,
                   train_path:str,
                   final_model_path:str,
                  ):
    download_container = download_op()
    vectorize_container = vectorize_op(max_features, embedding_type, embedding_dim, sequence_length, download_container.output)
    embedding_container = embedding_op(max_features, sequence_length, embedding_type, embedding_dim, vectorize_container.output)
    model_container = build_model_op(hidden_layers, rec_units, dense_units, dropout,  max_features, embedding_dim, sequence_length, vectorize_container.output, embedding_container.output, download_container.output)
    trained_container = train_op(epochs, batch_size, es, model_container.output, vectorize_container.output)
    final_container = build_final_model_op(trained_container.output)
    play_container = play_op(final_container.output)

In [146]:
# --- Run configuration -------------------------------------------------------

# Text vectorization and embedding
max_features, sequence_length = 10000, 500
embedding_type, embedding_dim = "standard", 300

# Network architecture
hidden_layers, rec_units, dense_units = 1, 128, 128
dropout = 0.5

# Training
epochs, batch_size = 100, 512
es = True

# Values submitted for the pipeline's path parameters
load_data_path = "/mnt"
vectorize_data_path = "vectorize_data"
embedding_data_path = "embedding_data"
model_path = "model"
train_path = "train"
final_model_path = "final_model"

In [147]:
# Credentials are read from the environment instead of being committed with the
# notebook; the password is prompted for when KF_PASSWORD is not set.
USERNAME = os.environ.get("KF_USERNAME", "user@example.com")
PASSWORD = os.environ.get("KF_PASSWORD") or getpass.getpass(f"Password for {USERNAME}: ")
NAMESPACE = "kubeflow-user-example-com"
HOST = 'http://istio-ingressgateway.istio-system.svc.cluster.local:80'

# The first request is redirected to the login page; its final URL is where the
# credentials are posted.
session = requests.Session()
response = session.get(HOST)

headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}

data = {"login": USERNAME, "password": PASSWORD}
session.post(response.url, headers=headers, data=data)

# Fail with an explicit message when the login did not yield a session cookie.
session_cookie = session.cookies.get_dict().get("authservice_session")
if session_cookie is None:
    raise RuntimeError(
        f"Login to {HOST} failed: no 'authservice_session' cookie was returned")

client = kfp.Client(
    host=f"{HOST}/pipeline",
    cookies=f"authservice_session={session_cookie}",
    namespace=NAMESPACE,  # run in the user's profile namespace
)

arguments = {
    "epochs": epochs,
    "batch_size": batch_size,
    "es": es,
    "hidden_layers": hidden_layers,
    "rec_units": rec_units,
    "dense_units": dense_units,
    "dropout": dropout,
    "max_features": max_features,
    "embedding_type": embedding_type,
    "embedding_dim": embedding_dim,
    "sequence_length": sequence_length,
    "vectorize_data_path": vectorize_data_path,
    "embedding_data_path": embedding_data_path,
    "load_data_path": load_data_path,
    "model_path": model_path,
    "train_path": train_path,
    "final_model_path": final_model_path,
}

client.create_run_from_pipeline_func(pipeline_func=trial_pipeline, arguments=arguments)

RunPipelineResult(run_id=0e71f873-bc86-4fd2-9afd-912ab959e19b)