In [1]:
# Import libraries

In [2]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
import requests

from kfp.components import InputPath, OutputPath, create_component_from_func
from typing import NamedTuple

In [3]:
# Download Datasets (IMDB reviews and Financial PhraseBank)

In [4]:
def download_data_IMDB(load_data_path: comp.OutputPath(str())):
    """Download the IMDB review archive and unpack it below ``load_data_path``.

    The archive is fetched with ``tf.keras.utils.get_file`` using
    ``load_data_path`` as cache directory. After extraction the ``unsup``
    sub-folder of ``aclImdb/train`` is deleted if it exists.

    Nothing is downloaded when ``aclImdb_v1.tar.gz`` is already present in
    ``load_data_path``.

    Args:
        load_data_path: Output directory of the component; created if missing.
    """
    # Imports live inside the function because KFP serialises only the
    # function body into the component.
    import os
    import shutil
    import tensorflow as tf

    os.makedirs(load_data_path, exist_ok=True)

    train_dir = os.path.join(load_data_path, 'aclImdb', 'train')
    dataset_file = os.path.join(load_data_path, 'aclImdb_v1.tar.gz')

    # Archive already present: nothing to do.
    if os.path.exists(dataset_file):
        return

    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    tf.keras.utils.get_file("aclImdb_v1", url,
                            untar=True, cache_dir=load_data_path,
                            cache_subdir='')

    # Remove the directory that is not used by the later pipeline steps.
    remove_dir = os.path.join(train_dir, 'unsup')
    if os.path.isdir(remove_dir):
        shutil.rmtree(remove_dir)
                
   
    

In [5]:
# Component for the IMDB download step.
# The spec is written to its own file: the Financial PhraseBank component
# also writes 'load_data_component.yaml', so sharing that name would let the
# later cell silently overwrite this spec.
# 'pathlib' is part of the standard library on Python 3.8 and must not be
# installed from PyPI; TensorFlow is pinned like in the other components.
download_IMDB_op = kfp.components.create_component_from_func(
    download_data_IMDB,
    output_component_file='load_data_IMDB_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10'],
)

In [6]:
def download_data_FP(load_data_path: comp.OutputPath(str())):
    """Download Financial PhraseBank and store it in the ``aclImdb`` folder layout.

    The dataset is loaded with ``datasets.load_dataset`` (configuration
    ``sentences_allagree``), dumped to ``financial_phrasebank.csv`` and then
    written as one text file per sentence into
    ``aclImdb/{train,test}/{pos,neg}`` below ``load_data_path``.

    Rows with label ``2`` go to ``pos``, rows with label ``0`` go to ``neg``;
    rows with any other label are not written. Within each class the rows
    alternate between the test folder (even position) and the train folder
    (odd position).

    Nothing is downloaded or written when the CSV file already exists.

    Args:
        load_data_path: Output directory of the component; created if missing.
    """
    import os
    import pandas as pd

    from datasets import load_dataset, DownloadMode

    # Folder layout (same directory names as the IMDB data set uses).
    dataset_dir = os.path.join(load_data_path, 'aclImdb')
    train_pos_dir = os.path.join(dataset_dir, 'train', 'pos')
    train_neg_dir = os.path.join(dataset_dir, 'train', 'neg')
    test_pos_dir = os.path.join(dataset_dir, 'test', 'pos')
    test_neg_dir = os.path.join(dataset_dir, 'test', 'neg')

    # Creating the leaf folders also creates every parent folder.
    for directory in (train_pos_dir, train_neg_dir, test_pos_dir, test_neg_dir):
        os.makedirs(directory, exist_ok=True)

    dataset_file = os.path.join(load_data_path, 'financial_phrasebank.csv')

    def write_txt(frame, train_dir, test_dir):
        """Write column 0 of every row to 'FP<i>.txt', alternating test/train."""
        for i, row in enumerate(frame.values):
            target_dir = train_dir if i % 2 == 1 else test_dir
            filename = os.path.join(target_dir, 'FP' + str(i) + '.txt')
            # The context manager closes the file even if the write fails.
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(row[0])

    def write_to_folder():
        """Split the CSV by label and write both classes to disk."""
        dataset = pd.read_csv(dataset_file)
        write_txt(dataset[dataset['label'] == 2], train_pos_dir, test_pos_dir)
        write_txt(dataset[dataset['label'] == 0], train_neg_dir, test_neg_dir)

    if not os.path.exists(dataset_file):
        dataset = load_dataset(path='financial_phrasebank',
                               name='sentences_allagree',
                               download_mode=DownloadMode.FORCE_REDOWNLOAD)

        # NOTE(review): every split is written to the same file, so with more
        # than one split only the last one would survive - confirm that this
        # configuration only ever provides a single split.
        for split, data in dataset.items():
            data.to_csv(dataset_file, index=None)

        write_to_folder()

In [7]:
# Component for the Financial PhraseBank download step.
# 'pathlib' is not listed: it is part of the standard library on Python 3.8
# and the PyPI package of that name is only an obsolete backport.
download_FP_op = kfp.components.create_component_from_func(
    download_data_FP,
    output_component_file='load_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['datasets', 'pandas'],
)

In [8]:
# Merge Datasets

In [9]:
def merge_data(load_data_path_IMDB: comp.InputPath(str()),
               load_data_path_FP: comp.InputPath(str()),
               merge_data_path: comp.OutputPath(str())):
    """Copy both downloaded data sets into one directory tree.

    The IMDB tree is copied first, the Financial PhraseBank tree second.
    Existing directories are merged; a file that exists in both trees under
    the same relative path ends up with the Financial PhraseBank content.

    Args:
        load_data_path_IMDB: Output directory of the IMDB download step.
        load_data_path_FP: Output directory of the Financial PhraseBank step.
        merge_data_path: Output directory of this component.
    """
    import shutil
    import os
    import stat

    def copytree(src, dst, symlinks=False, ignore=None):
        """Recursively copy ``src`` into ``dst``; ``dst`` may already exist."""
        if not os.path.exists(dst):
            os.makedirs(dst)
            shutil.copystat(src, dst)
        lst = os.listdir(src)
        if ignore:
            excl = ignore(src, lst)
            lst = [x for x in lst if x not in excl]
        for item in lst:
            s = os.path.join(src, item)
            d = os.path.join(dst, item)
            if symlinks and os.path.islink(s):
                if os.path.lexists(d):
                    os.remove(d)
                os.symlink(os.readlink(s), d)
                try:
                    st = os.lstat(s)
                    mode = stat.S_IMODE(st.st_mode)
                    os.lchmod(d, mode)
                except (AttributeError, NotImplementedError, OSError):
                    # Copying the link mode is best effort: os.lchmod does
                    # not exist on every platform. Only these errors are
                    # ignored so that anything unexpected still surfaces.
                    pass
            elif os.path.isdir(s):
                copytree(s, d, symlinks, ignore)
            else:
                shutil.copy2(s, d)

    os.makedirs(merge_data_path, exist_ok=True)

    # Both calls use the defaults, i.e. links are followed and nothing is
    # excluded.
    copytree(load_data_path_IMDB, merge_data_path)
    copytree(load_data_path_FP, merge_data_path)

In [10]:
# Component for the merge step; it needs no extra packages.
merge_op = comp.create_component_from_func(
    merge_data,
    output_component_file='merge_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=[],
)

In [11]:
# Preprocessing Data

In [12]:
def preprocess_data(merge_data_path: comp.InputPath(str()),
                    preprocess_data_path: comp.OutputPath(str())):
    """Clean every review text and store the train/validation/test data sets.

    All files in ``aclImdb/{train,test}/{pos,neg}`` below ``merge_data_path``
    are rewritten in place with their cleaned text. Afterwards the folders are
    read with ``text_dataset_from_directory`` (batch size 32, 20 % of the
    training folder used for validation, seed 1337) and the three resulting
    data sets are saved to ``raw_train``, ``raw_val`` and ``raw_test`` below
    ``preprocess_data_path``.

    Args:
        merge_data_path: Directory produced by the merge step. Its text files
            are overwritten by this function.
        preprocess_data_path: Output directory of the component.
    """
    import tensorflow as tf
    import contractions
    import emoji
    import nltk
    import os
    import re

    from string import punctuation
    from nltk.stem import WordNetLemmatizer
    from spellchecker import SpellChecker
    from typing import List, Union
    from nltk import word_tokenize

    dataset_dir = os.path.join(merge_data_path, 'aclImdb')
    train_dir = os.path.join(dataset_dir, 'train')
    test_dir = os.path.join(dataset_dir, 'test')

    # Download data for preprocessing
    def downloadNLTK():
        nltk.download('punkt')
        # Newer NLTK releases load the sentence tokenizer from 'punkt_tab'
        # instead of 'punkt'. nltk.download only reports a failure, it does
        # not raise, so requesting both is safe for every version.
        nltk.download('punkt_tab')
        nltk.download('wordnet')
        nltk.download('omw-1.4')

    downloadNLTK()

    # Created once instead of once per file.
    lemmatizer = WordNetLemmatizer()
    spell_checkers = {}

    def get_spell_checker(lang):
        """Return the cached spell checker for ``lang``, creating it on first use."""
        if lang not in spell_checkers:
            checker = SpellChecker(language=lang, distance=1)
            checker.word_frequency.load_words(["Elon", "Musk"])
            spell_checkers[lang] = checker
        return spell_checkers[lang]

    # Preprocess
    def preprocess(text):
        """Apply all cleaning steps to one document and return the result."""
        text = replace_companies(text)
        text = remove_html_tags(text)
        text = replace_url(text)
        text = replace_atUser(text)
        # Smileys must be handled before replace_emojis: that step turns
        # every ':' into a blank, after which ':)' and friends cannot be
        # recognised any more.
        text = replace_smiley(text)
        text = replace_emojis(text)
        text = remove_leetspeak(text)
        # NOTE(review): check_spelling tokenises with word_tokenize before
        # replace_contractions runs - confirm that contractions still reach
        # replace_contractions in one piece.
        text = check_spelling(text, lang='en')
        text = replace_contractions(text)
        text = remove_punct(text)
        text = replace_numbers(text)
        text = to_lower(text)
        text = lemmatize(text)
        text = clean_white_space(text)

        return text

    def remove_html_tags(text):
        return re.sub('<[^<]+?>', '', text)

    def replace_url(text):
        """Replace links by 'url' and strip the '#' from hashtags."""
        text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
        text = re.sub(r'#([^\s]+)', r'\1', text)
        return text

    def replace_atUser(text):
        text = re.sub(r'@[^\s]+', 'atUser', text)
        return text

    def remove_leetspeak(text):
        """Delete tokens that mix letters and digits."""
        return re.sub(r"[A-Za-z]+\d+[A-Za-z]+|\d+[A-Za-z]+\d+|[A-Za-z]+\d+|\d+[A-Za-z]+", '', text).strip()

    def replace_numbers(text):
        return re.sub(r"\b\d+\b", "number", text)

    def replace_contractions(text):
        return ' '.join(contractions.fix(word) for word in text.split())

    def remove_punct(text):
        return ''.join(c for c in text if c not in punctuation)

    def lemmatize(text):
        lemmatized_word = [lemmatizer.lemmatize(word) for sent in nltk.sent_tokenize(text) for word in
                           nltk.word_tokenize(sent)]
        return " ".join(lemmatized_word)

    def check_spelling(input_text_or_list: Union[str, List[str]], lang='en'):
        """Lower-case the input and correct the spelling of its tokens.

        Every occurrence of a misspelled token is replaced. A token for which
        the spell checker offers no correction is kept unchanged.
        """
        if input_text_or_list is None or len(input_text_or_list) == 0:
            return ''
        spelling_checker = get_spell_checker(lang)

        if isinstance(input_text_or_list, str):
            tokens = word_tokenize(input_text_or_list.lower())
        else:
            tokens = [token.lower() for token in input_text_or_list if token is not None and len(token) > 0]

        # correction() may return None; fall back to the original word so
        # that the word is not lost.
        corrections = {word: spelling_checker.correction(word) or word
                       for word in spelling_checker.unknown(tokens)}
        tokens = [corrections.get(token, token) for token in tokens]

        return ' '.join(token for token in tokens if token).strip()

    def replace_emojis(text):
        # Group by same meaning
        text = re.sub('[\U0001F550-\U0001F567]', " of the clock ", text)

        # Replace emojis by their short text
        text = emoji.demojize(text)

        # Remove everything between emoji
        text = re.sub(
            r"(?<=:[a-zA-Z])(.*?)(?=:)",
            lambda g: "{}".format(re.sub(r"[^a-zA-Z]", "", g.group(1))),
            text,
        )

        # Remove : at the beginning and the end of an emoji
        text = text.replace(":", " ")

        return text

    # Smiley -> replacement word ('' removes the smiley).
    # Sources: https://de.wiktionary.org/wiki/Verzeichnis:International/Smileys
    #          https://en.wiktionary.org/wiki/Appendix:Emoticons
    SMILEYS = {
        ":)": "smile",
        ":-)": "smile",
        ":^)": "",
        ":-]": "smile",
        "=]": "smile",
        ":]": "smile",
        ":D": "",
        ":-D": "",
        ":))": "",
        ";-]": "",
        ";o)": "",
        "¦)": "",
        "=:)": "",
        ":9": "",
        "c:": "",
        ":'D": "",
        "xD": "",
        "XD": "",
        "B)": "",
        "B-)": "",
        "8)": "",
        "8-)": "",
        "=8)": "",
        "=8^)": "",
        "=B)": "",
        "=B^)": "",
        "~8D": "",
        "y=)": "",
        ">:)": "",
        ">:D": "",
        ">:>": "",
        ">:[]": "",
        "^_^": "",
        "^-^": "",
        "^.^": "",
        "^,^": "",
        "^^": "",
        "^^'": "",
        "^^°": "",
        "^////^": "",
        "^o^": "",
        "^O^": "",
        "^0^": "",
        "\\o/": "",
        "<o/": "",
        "<(^.^)>": "",
        "-^_^-": "",
        "*(^_^)*": "",
        "*0*": "",
        "Ü": "",
        "*~*": "",
        ":>": "",
        ":i": "",
        "l:": "",
        ":(": "sad",
        ":c": "sad",
        ":[": "sad",
        "=(": "sad",
        "=[": "sad",
        ":'(": "",
        ":,(": "",
        ";(": "",
        ";_;": "",
        "T.T": "",
        "T_T": "",
        "Q_Q": "",
        ":S": "",
        ":-/": "",
        ":/": "",
        ":-I": "",
        ">:(": "",
        ">:o": "",
        ">:O": "",
        ">:@": "",
        "DX": "",
        ":-E3": "",
        "x_X": "",
        "X_x": "",
        "x_x": "",
        "x.X": "",
        "X.x": "",
        "x.x": "",
        "°_°": "",
        ">.<": "",
        ">,<": "",
        "-.-": "",
        "-,-": "",
        "-_-": "",
        "._.": "",
        "^_°'": "",
        "^,°'": "",
        "Oo": "",
        "oO": "",
        "O.o'": "",
        "cO": "",
        "ô_o": "",
        "Ô_ô": "",
        "D:": "",
        "D8<": "",
        "O_O": "surprised",
        "Ò_Ó": "",
        "U_U": "",
        "v_v": "",
        ":<": "",
        "m(": "",
        "°^°": "",
        "(@_@)": "",
        ";.;": "",
        ";)": "",
        ";-)": "",
        "^.-": "",
        ":§": "",
        ";D": "",
        ";-D": "",
        ":P": "",
        ":p": "",
        "c[=": "",
        ":p~~~~~~": "",
        ":-*": "kiss",
        ":*": "kiss",
        ";*": "",
        ":-x": "",
        "C:": "",
        ":o": "",
        ":-o": "",
        ":O": "",
        "0:-)": "",
        "O:-)": "",
        "3:)": "",
        "3:D": "",
        "-.-zZz": "",
        "(o)_(o)": "",
        "($)_($)": "",
        "^_-": "",
        "//.o": "",
        "^w^": "",
        "=^_^=": "",
        "x3": "",
        "*_*": "",
        "#-)": "",
        "`*,...ò_Ó...,*´": "",
        ":-{}": "",
        ":ö": "",
        "û_û": "",
        "Ö_Ö": "",
        ":o)": "",
        "cB": "",
        "BD": "",
        "Y_": "",
        ":-€": "",
        ":3": "",
        "x'DD": "",
        "l/l": "",
        ":o)>": "",
        "(_8(I)": "",
        "//:=|": "",
        "<3": "",
        "</3": "",
        "<'3": "",
        "<°(((><": "",
        "<°{{{><": "",
        "<°++++<": "",
        ">)))°>": "",
        "o=(====>": "",
        "@>--}---": "rose",
        "@>-`-,--": "rose",
        "(_|::|_)": "",
        "c(_)": "",
        "[:|]": "",
        "(°oo°)": "",
        "(.)(.)": "",
        "( . Y . )": "",
        "( . )": "",
        "| . |": "",
        ").(": "",
        "(_i_)": "",
        "( Y )": "",
        "8===D": "penis"
    }

    def replace_smiley(text):
        """Replace every whitespace-delimited token that is a known smiley."""
        reformed = [SMILEYS[word] if word in SMILEYS else word for word in text.split()]
        return " ".join(reformed)

    COMPANIES = {
        "Apple": "company",
        "Biohit": "company",
        "Componenta": "company",
        "Facebook": "company",
        "Finnish Aktia Group": "company",
        "Finnish Bank of +àland": "company",
        "Finnlines": "company",
        "Fiskars": "company",
        "Google": "company",
        "HELSINKI ( AFX )": "company",
        "HKScan": "company",
        "Kemira": "company",
        "MegaFon": "company",
        "Metso Minerals": "company",
        "Microsoft": "company",
        "Nokia Corp.": "company",
        "Nordea Group": "company",
        "Ponsse": "company",
        "Ramirent": "company",
        "Ruukki": "company",
        "Sanoma Oyj HEL": "company",
        "Talentum": "company",
        "Teleste Oyj HEL": "company",
        "TeliaSonera TLSN": "company",
        "Tesla": "company",
        "Tiimari": "company",
        "Vaahto Group": "company",
    }
    # A name only matches when it is delimited by whitespace or the text
    # boundary on both sides. Longer names come first so that a multi-word
    # name wins over any shorter alternative starting at the same position.
    company_pattern = re.compile(
        r'(?<!\S)(?:'
        + '|'.join(re.escape(name) for name in sorted(COMPANIES, key=len, reverse=True))
        + r')(?!\S)'
    )

    def replace_companies(text):
        """Replace known company names, including names made of several words.

        Whitespace is collapsed to single blanks first, so the multi-word
        names (which a token-by-token lookup can never find) match as well.
        """
        text = " ".join(text.split())
        return company_pattern.sub(lambda match: COMPANIES[match.group(0)], text)

    def to_lower(text):
        return text.lower()

    def clean_white_space(text):
        return re.sub(' +', ' ', text)

    # Change in all folders
    def preprocess_all_folders(dataset_dir):
        """Clean the files of train/pos, train/neg, test/pos and test/neg."""
        for split in ('train', 'test'):
            for label in ('pos', 'neg'):
                folder = os.path.join(dataset_dir, split, label)
                preprocess_all_files(folder, os.listdir(folder))

    def preprocess_all_files(path, files):
        for file in files:
            replaceAll(os.path.join(path, file))

    def replaceAll(file_name):
        """Overwrite ``file_name`` with its cleaned content."""
        with open(file_name, 'r', encoding="utf-8") as file:
            text = file.read()  # read file into memory

        text = preprocess(text)  # make replacements

        with open(file_name, 'w', encoding="utf-8") as file:
            file.write(text)  # rewrite the file

    preprocess_all_folders(dataset_dir)

    # Split Datasets
    batch_size = 32
    raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
                    train_dir,
                    batch_size=batch_size,
                    validation_split=0.2,
                    subset="training",
                    seed=1337
    )

    # Same seed and split as above so that both subsets do not overlap.
    raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
                    train_dir,
                    batch_size=batch_size,
                    validation_split=0.2,
                    subset="validation",
                    seed=1337
    )

    raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
                     test_dir, batch_size=batch_size
    )

    os.makedirs(preprocess_data_path, exist_ok=True)

    tf.data.Dataset.save(raw_train_ds, f'{preprocess_data_path}/raw_train')
    tf.data.Dataset.save(raw_val_ds, f'{preprocess_data_path}/raw_val')
    tf.data.Dataset.save(raw_test_ds, f'{preprocess_data_path}/raw_test')

In [13]:
# Component for the preprocessing step.
# Only packages that preprocess_data actually imports are installed:
# 'typing' is part of the standard library on Python 3.8, and 'spacy' and
# 'gensim' are not imported by the function.
preprocess_op = kfp.components.create_component_from_func(
    preprocess_data,
    output_component_file='preprocess_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10', 'nltk', 'contractions', 'emoji',
                         'pyspellchecker'],
)

In [14]:
# Vectorize Data

In [15]:
def vectorize_data(max_features: int, embedding_type: str, embedding_dim: int, sequence_length: int,
                   preprocess_data_path: comp.InputPath(str()),
                   vectorize_data_path: comp.OutputPath(str())):
    """Turn the raw text data sets into integer sequences.

    A ``TextVectorization`` layer is adapted on the training texts and then
    applied to the train, validation and test data sets. The vectorized data
    sets are saved to ``train``, ``val`` and ``test`` below
    ``vectorize_data_path``; the layer state is pickled to ``layer.pkl``.

    Args:
        max_features: Maximum vocabulary size of the layer.
        embedding_type: Not used by this step.
        embedding_dim: Not used by this step.
        sequence_length: Length every sequence is padded/truncated to.
        preprocess_data_path: Directory with ``raw_train``, ``raw_val`` and
            ``raw_test`` from the preprocessing step.
        vectorize_data_path: Output directory of the component.
    """
    import os
    import pickle
    import tensorflow as tf

    from tensorflow.keras.layers import TextVectorization

    raw_train_ds = tf.data.Dataset.load(f'{preprocess_data_path}/raw_train')
    raw_val_ds = tf.data.Dataset.load(f'{preprocess_data_path}/raw_val')
    raw_test_ds = tf.data.Dataset.load(f'{preprocess_data_path}/raw_test')

    # Build Vectorization Layer (the layer itself applies no standardization).
    vectorization_layer = TextVectorization(
            standardize=None,
            max_tokens=max_features,
            output_mode="int",
            output_sequence_length=sequence_length,
    )

    # The vocabulary is learned from the training texts only (labels dropped).
    text_ds = raw_train_ds.map(lambda x, y: x)
    vectorization_layer.adapt(text_ds)

    # Vectorize the data.
    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorization_layer(text), label

    train_ds = raw_train_ds.map(vectorize_text)
    val_ds = raw_val_ds.map(vectorize_text)
    test_ds = raw_test_ds.map(vectorize_text)

    # Do async prefetching / buffering of the data for best performance on GPU.
    train_ds = train_ds.cache().prefetch(buffer_size=10)
    val_ds = val_ds.cache().prefetch(buffer_size=10)
    test_ds = test_ds.cache().prefetch(buffer_size=10)

    # creating the vectorize directory
    os.makedirs(vectorize_data_path, exist_ok=True)

    tf.data.Dataset.save(train_ds, f'{vectorize_data_path}/train')
    tf.data.Dataset.save(val_ds, f'{vectorize_data_path}/val')
    tf.data.Dataset.save(test_ds, f'{vectorize_data_path}/test')

    # 'config' and 'weights' are kept for existing readers of the pickle.
    # The vocabulary is stored explicitly as well.
    # NOTE(review): depending on the TensorFlow version, get_weights() may
    # not contain the adapted vocabulary - confirm for the pinned version.
    layer_state = {'config': vectorization_layer.get_config(),
                   'weights': vectorization_layer.get_weights(),
                   'vocabulary': vectorization_layer.get_vocabulary()}
    # The context manager makes sure the file is flushed and closed.
    with open(f'{vectorize_data_path}/layer.pkl', "wb") as layer_file:
        pickle.dump(layer_state, layer_file)

In [16]:
# Component for the vectorization step.
vectorize_op = create_component_from_func(
    vectorize_data,
    output_component_file='vectorize_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10'],
)

In [17]:
# Embeddings

In [18]:
def embedding_data(max_features: int, sequence_length: int, embedding_type: str, embedding_dim: int,
                   vectorize_data_path: comp.InputPath(str()),
                   embedding_data_path: comp.OutputPath(str())):
    """Build the embedding layer and save it as a small Keras model.

    For ``embedding_type == "standard"`` a trainable ``Embedding`` layer is
    created. For ``"glove"`` the GloVe 840B.300d vectors are downloaded into
    ``embedding_data_path`` and used as frozen weights. The resulting model
    (input -> embedding) is saved to ``model.h5`` in ``embedding_data_path``.

    Args:
        max_features: Vocabulary size, used as ``input_dim`` of the layer.
        sequence_length: Passed to the layer as ``input_length``.
        embedding_type: ``"glove"`` or ``"standard"``.
        embedding_dim: Size of the embedding vectors; must be 300 for GloVe.
        vectorize_data_path: Directory containing ``layer.pkl`` from the
            vectorization step.
        embedding_data_path: Output directory of the component.

    Raises:
        ValueError: If ``embedding_type`` is unknown, or if GloVe is requested
            with an ``embedding_dim`` other than 300.
    """
    import os
    import zipfile
    import numpy as np
    import tensorflow as tf
    import pickle

    from tensorflow.keras import layers
    from tensorflow.keras.layers import TextVectorization

    # Dimensionality of the vectors in glove.840B.300d.
    GLOVE_DIM = 300

    def load_vocabulary():
        """Return the vocabulary of the vectorization layer as a list."""
        # NOTE(review): pickle.load can execute arbitrary code; only use it
        # on files written by the vectorization step of this pipeline.
        with open(f'{vectorize_data_path}/layer.pkl', "rb") as layer_file:
            from_disk = pickle.load(layer_file)

        # Prefer an explicitly stored vocabulary when the pickle has one.
        vocabulary = from_disk.get('vocabulary')
        if vocabulary is not None:
            return list(vocabulary)

        # Otherwise rebuild the layer from its config and weights.
        vectorization_layer = TextVectorization.from_config(from_disk['config'])
        # You have to call `adapt` with some dummy data (BUG in Keras)
        vectorization_layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
        vectorization_layer.set_weights(from_disk['weights'])
        return vectorization_layer.get_vocabulary()

    def choose_embeddings(max_features, embedding_dim, sequence_length, embedding_type):
        """Dispatch on ``embedding_type``; unknown values are an error."""
        if embedding_type == "glove":
            return glove(max_features, embedding_dim, sequence_length)
        if embedding_type == "standard":
            return tf_standard(max_features, embedding_dim, sequence_length)
        # Fail here with a clear message instead of returning None and
        # crashing later when the model is saved.
        raise ValueError(
            f"Unknown embedding_type {embedding_type!r}; expected 'glove' or 'standard'")

    def tf_standard(max_features, embedding_dim, sequence_length):
        """Model with a trainable, randomly initialised embedding layer."""
        # A integer input for vocab indices.
        inputs = tf.keras.Input(shape=(None,), dtype="int64")

        # Map the vocab indices into a space of dimensionality 'embedding_dim'.
        embeddings = layers.Embedding(input_dim=max_features,
                                      output_dim=embedding_dim,
                                      input_length=sequence_length)(inputs)

        return tf.keras.Model(inputs, embeddings)

    def glove(max_features, embedding_dim, sequence_length):
        """Model with a frozen embedding layer initialised from GloVe."""
        if embedding_dim != GLOVE_DIM:
            raise ValueError(
                f"glove.840B.300d provides {GLOVE_DIM}-dimensional vectors, "
                f"got embedding_dim={embedding_dim}")

        glove_file = download_glove()

        # A integer input for vocab indices.
        inputs = tf.keras.Input(shape=(None,), dtype="int64")

        embedding_matrix = load_glove_vectors(glove_file, load_vocabulary(),
                                              max_features, embedding_dim)

        embeddings = layers.Embedding(input_dim=max_features,
                                      output_dim=embedding_dim,
                                      input_length=sequence_length,
                                      trainable=False,
                                      weights=[embedding_matrix])(inputs)

        return tf.keras.Model(inputs, embeddings)

    def download_glove():
        """Download and unzip GloVe if needed; return the path of the .txt file."""
        # creating the embedding directory
        os.makedirs(embedding_data_path, exist_ok=True)

        glove_dir = os.path.join(embedding_data_path, 'glove.840B.300d')
        vectors_file = os.path.join(glove_dir, 'glove.840B.300d.txt')

        if not os.path.exists(vectors_file):
            url = "https://nlp.stanford.edu/data/glove.840B.300d.zip"

            # The download is a zip archive, so it is only fetched here and
            # unpacked with zipfile below.
            # NOTE(review): archive and vectors stay in the component output
            # directory - confirm that this is wanted.
            archive_file = tf.keras.utils.get_file("glove.840B.300d.zip", url,
                                                   cache_dir=embedding_data_path,
                                                   cache_subdir='')

            with zipfile.ZipFile(archive_file, 'r') as zip_ref:
                zip_ref.extractall(glove_dir)

        return vectors_file

    def load_glove_vectors(glove_file, vocabulary, max_features, embedding_dim):
        """Return the embedding matrix for ``vocabulary``.

        Row ``i`` holds the GloVe vector of the word with index ``i``; words
        without a GloVe vector keep an all-zero row.
        """
        word_index = dict(zip(vocabulary, range(len(vocabulary))))

        if max_features is not None:
            vocab_len = max_features
        else:
            vocab_len = len(word_index) + 1
        embedding_matrix = np.zeros((vocab_len, embedding_dim))

        # The file is streamed and only vectors of vocabulary words are
        # parsed, so the whole GloVe table never has to fit into memory.
        with open(glove_file, encoding="utf8") as f:
            for line in f:
                values = line.rstrip().split(' ')
                # The last `embedding_dim` fields are the vector; everything
                # before is the token. Taking only the first field as token
                # would break on tokens that contain blanks themselves.
                word = ' '.join(values[:-embedding_dim])
                idx = word_index.get(word)
                if idx is None or idx >= vocab_len:
                    continue
                embedding_matrix[idx] = np.asarray(values[-embedding_dim:], dtype=float)

        return embedding_matrix

    embedding_layer = choose_embeddings(max_features, embedding_dim, sequence_length, embedding_type)

    # creating the embedding directory
    os.makedirs(embedding_data_path, exist_ok=True)

    # save the layer as model
    embedding_layer.save(f'{embedding_data_path}/model.h5')

In [19]:
# Component for the embedding step.
embedding_op = comp.create_component_from_func(
    embedding_data,
    output_component_file='embedding_data_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10', 'numpy', 'keras==2.10'],
)

In [20]:
# Build model

In [21]:
def build_model(hidden_layers: int, rec_units: int, dense_units: int, dropout: float,
                max_features: int, embedding_dim: int, sequence_length: int,
                learning_rate: float, epsilon: float, clipnorm: float,
                vectorize_data_path: comp.InputPath(str),
                embedding_data_path: comp.InputPath(str),
                preprocess_data_path: comp.InputPath(str()),
                model_path: comp.OutputPath(str())):
    """Build and compile the LSTM classifier on top of the saved embedding model.

    Architecture: embedding -> dropout -> ``hidden_layers`` LSTM layers (all
    but the last return sequences) -> dense -> dropout -> one sigmoid unit.
    The compiled, untrained model is saved to ``model.h5`` in ``model_path``.

    Args:
        hidden_layers: Number of stacked LSTM layers (at least one is built).
        rec_units: Units of every LSTM layer.
        dense_units: Units of the dense layer after the LSTM stack.
        dropout: Dropout rate used before the LSTM stack and after the dense
            layer.
        max_features: Not used by this step.
        embedding_dim: Not used by this step.
        sequence_length: Not used by this step.
        learning_rate: Learning rate of the Adam optimizer.
        epsilon: Epsilon of the Adam optimizer.
        clipnorm: Gradient clipping norm of the Adam optimizer.
        vectorize_data_path: Not used by this step.
        embedding_data_path: Directory containing ``model.h5`` from the
            embedding step.
        preprocess_data_path: Not used by this step.
        model_path: Output directory of the component.
    """
    import os
    import tensorflow as tf
    import keras

    from tensorflow.keras import layers

    # Load Embedding Layer as model
    embedding_layer = keras.models.load_model(f'{embedding_data_path}/model.h5')

    # Create Model
    x = layers.Dropout(dropout)(embedding_layer.layers[-1].output)

    # All LSTM layers except the last one return the full sequence so that
    # the next LSTM layer can consume it. `rec_units` is the function
    # argument; there is no `self` in this plain function.
    for _ in range(1, hidden_layers):
        x = layers.LSTM(units=rec_units, activation="relu", return_sequences=True)(x)

    x = layers.LSTM(units=rec_units, activation="relu")(x)

    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(dropout)(x)

    # We project onto a single unit output layer, and squash it with a sigmoid:
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    model = tf.keras.Model(inputs=embedding_layer.inputs, outputs=predictions)

    # The output layer already applies a sigmoid, so the model emits
    # probabilities: the loss must not treat them as logits, and precision
    # and recall have to threshold at 0.5 (a threshold of 0 would count
    # every sample as positive).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=clipnorm),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=[tf.metrics.BinaryAccuracy(),
                           tf.keras.metrics.Precision(thresholds=0.5),
                           tf.keras.metrics.Recall(thresholds=0.5)])

    # creating the model directory
    os.makedirs(model_path, exist_ok=True)

    model.save(f'{model_path}/model.h5')

In [22]:
# Component for the model building step.
# 'pathlib' is not listed: it is part of the standard library on Python 3.8
# and the PyPI package of that name is only an obsolete backport.
build_model_op = kfp.components.create_component_from_func(
    build_model,
    output_component_file='build_model_component.yaml',
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10', 'numpy', 'keras==2.10'],
)

In [23]:
# Train model

In [24]:
def train_model(epochs: int, batch_size: int, es: bool,
                model_path: comp.InputPath(str()),
                vectorize_data_path: comp.InputPath(str()), 
                train_path: comp.OutputPath(str())) -> NamedTuple(
                'VisualizationOutput', [
                    ('echo', 'string'), 
                    ('mlpipeline_ui_metadata', 'UI_metadata')
                ]):
    """Fit the compiled model, save it, and report test-set metrics.

    Args:
        epochs: Maximum number of training epochs.
        batch_size: Kept for interface compatibility only. It is not forwarded
            to ``fit`` because Keras documents that ``batch_size`` must not be
            specified when the input is a ``tf.data.Dataset`` (the dataset
            yields its own batches).
        es: When true, train with early stopping on ``val_loss``.
        model_path: Directory containing the compiled ``model.h5``.
        vectorize_data_path: Directory containing the saved ``train``, ``val``
            and ``test`` datasets.
        train_path: Output directory; the trained model is written to
            ``model.h5`` inside it.

    Returns:
        A ``VisualizationOutput`` named tuple whose ``mlpipeline_ui_metadata``
        field is a JSON string describing an inline HTML table with binary
        accuracy, precision, recall and F1 score on the test set.
    """
    import json
    import os
    from collections import namedtuple

    import keras
    import tensorflow as tf
    from tensorflow.keras.callbacks import EarlyStopping

    model = keras.models.load_model(f'{model_path}/model.h5')

    train_ds = tf.data.Dataset.load(f'{vectorize_data_path}/train')
    val_ds = tf.data.Dataset.load(f'{vectorize_data_path}/val')
    test_ds = tf.data.Dataset.load(f'{vectorize_data_path}/test')

    # ## Train the model

    # Keras expects `callbacks` to be a list, so always build one.
    callbacks = []
    if es:
        callbacks.append(EarlyStopping(monitor="val_loss", patience=5, verbose=1,
                                       restore_best_weights=True))

    # Fit on the training split and validate on the validation split.
    # `batch_size` is intentionally omitted: see the docstring.
    model.fit(train_ds, validation_data=val_ds, epochs=epochs, callbacks=callbacks)

    # Create the output directory for the trained model.
    os.makedirs(train_path, exist_ok=True)

    model.save(f'{train_path}/model.h5')

    # ## Evaluate the model on the test set
    # The unpacking order follows the metrics the loaded model was compiled with.
    # NOTE(review): assumes exactly loss + three metrics - confirm against build_model.
    loss, binary_accuracy, precision, recall = model.evaluate(test_ds)

    # Guard the harmonic mean: with no true positives both terms are 0.
    pr_sum = precision + recall
    f1_score = 2 * ((precision * recall) / pr_sum) if pr_sum else 0.0

    # visualization of the results
    visualization = f"""
    <html>
        <body>
            <table>
                <thead>
                  <tr>
                    <th>Binary Accuracy</th>
                    <th>"{binary_accuracy}"</th>
                  </tr>
                </thead>
                <tbody>
                  <tr>
                    <td>Precision<br></td>
                    <td>"{precision}"</td>
                  </tr>
                  <tr>
                    <td>Recall<br></td>
                    <td>"{recall}"</td>
                  </tr>
                  <tr>
                    <td>F1 Score<br></td>
                    <td>"{f1_score}"</td>
                  </tr>
                </tbody>
            </table>
        </body>
    </html>
    """
    metadata = {
        'outputs': [{
            'storage': 'inline',
            'source': visualization,
            'type': 'web-app',
        }]
    }
    output = namedtuple('VisualizationOutput', ['echo', 'mlpipeline_ui_metadata'])
    return output('Visualization', json.dumps(metadata))

In [25]:
# Wrap `train_model` as a Kubeflow Pipelines component and also write its
# specification to `train_model_component.yaml`.
train_op = kfp.components.create_component_from_func(
    func=train_model,
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10', 'keras==2.10'],
    output_component_file='train_model_component.yaml',
)

In [26]:
# Build final Model

In [27]:
def build_final_model(learning_rate: float, epsilon: float, clipnorm: float,
                      preprocess_data_path: comp.InputPath(str()),
                      vectorize_data_path: comp.InputPath(str()), 
                      train_path: comp.InputPath(str())):
    """Assemble the end-to-end (raw string -> prediction) model and pickle it.

    The pickled ``TextVectorization`` layer is restored, placed in front of the
    trained model, and the combined model is compiled and written to
    ``/mnt/model.pickle``.

    Args:
        learning_rate: Adam learning rate used when compiling the final model.
        epsilon: Adam epsilon.
        clipnorm: Adam gradient-norm clipping value.
        preprocess_data_path: Not read by this function; kept so the component
            interface (and the pipeline wiring) stays unchanged.
        vectorize_data_path: Directory containing ``layer.pkl`` with the
            vectorization layer's ``config`` and ``weights``.
        train_path: Directory containing the trained ``model.h5``.
    """
    import os
    import pickle

    import keras
    import tensorflow as tf
    from tensorflow.keras.layers import TextVectorization

    # Load Vectorization Layer.
    # SECURITY: pickle.load can execute arbitrary code. This is only acceptable
    # because the file is expected to come from the pipeline's own vectorize
    # step; never point this at untrusted data.
    with open(f'{vectorize_data_path}/layer.pkl', "rb") as layer_file:
        from_disk = pickle.load(layer_file)
    vectorization_layer = TextVectorization.from_config(from_disk['config'])
    # You have to call `adapt` with some dummy data (BUG in Keras)
    vectorization_layer.adapt(tf.data.Dataset.from_tensor_slices(["xyz"]))
    vectorization_layer.set_weights(from_disk['weights'])

    trained_model = keras.models.load_model(f'{train_path}/model.h5')

    # A string input
    inputs = tf.keras.Input(shape=(1,), dtype="string")
    # Turn strings into vocab indices
    indices = vectorization_layer(inputs)
    # Turn vocab indices into predictions
    outputs = trained_model(indices)

    # Our end to end model
    model = tf.keras.Model(inputs, outputs)
    # The network built earlier in the pipeline ends in a sigmoid unit, so its
    # outputs are probabilities, not logits: use from_logits=False and the
    # default 0.5 decision threshold (a threshold of 0 would count every
    # sigmoid output as a positive prediction).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon, clipnorm=clipnorm),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=[tf.metrics.BinaryAccuracy(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])

    # Make sure the mount point for the exported model exists.
    os.makedirs('/mnt', exist_ok=True)

    # Save the end-to-end model.
    with open('/mnt/model.pickle', 'wb') as file:
        pickle.dump(model, file)

In [28]:
# Wrap `build_final_model` as a Kubeflow Pipelines component and also write
# its specification to `build_final_model_component.yaml`.
build_final_model_op = kfp.components.create_component_from_func(
    func=build_final_model,
    base_image="python:3.8",
    packages_to_install=['tensorflow==2.10', 'keras==2.10'],
    output_component_file='build_final_model_component.yaml',
)

In [29]:
@dsl.pipeline(
    name='LSTM pipeline',
    description='An example pipeline that performs for a sentiment model',
)
def lstm_pipeline(
    epochs: int,
    batch_size: int,
    es: bool,
    hidden_layers: int,
    rec_units: int,
    dense_units: int,
    dropout: float,
    learning_rate: float,
    epsilon: float,
    clipnorm: float,
    max_features: int,
    embedding_type: str,
    embedding_dim: int,
    sequence_length: int,
    load_data_path: str,
    merge_data_path: str,
    preprocess_data_path: str,
    vectorize_data_path: str,
    embedding_data_path: str,
    model_path: str,
    train_path: str,
):
    """Wire the component ops into the sentiment-model pipeline.

    Steps, in order: download both datasets, merge, preprocess, vectorize,
    build the embedding, build the model, train it, and build the final model
    with the "model-volume" PVC mounted at /mnt.

    The ``*_path`` parameters are accepted but not referenced in this body;
    step outputs are passed between tasks directly.
    """
    # Data acquisition: the two downloads have no dependency on each other.
    imdb_task = download_IMDB_op()
    fp_task = download_FP_op()

    # Merge both raw datasets once both downloads have finished.
    merge_task = (
        merge_op(imdb_task.output, fp_task.output)
        .after(imdb_task)
        .after(fp_task)
    )

    # Text preparation.
    preprocess_task = preprocess_op(merge_task.output)
    vectorize_task = vectorize_op(
        max_features, embedding_type, embedding_dim, sequence_length,
        preprocess_task.output,
    )
    embedding_task = embedding_op(
        max_features, sequence_length, embedding_type, embedding_dim,
        vectorize_task.output,
    )

    # Model construction and training.
    model_task = build_model_op(
        hidden_layers, rec_units, dense_units, dropout,
        max_features, embedding_dim, sequence_length,
        learning_rate, epsilon, clipnorm,
        vectorize_task.output, embedding_task.output, preprocess_task.output,
    )
    train_task = train_op(
        epochs, batch_size, es,
        model_task.output, vectorize_task.output,
    )

    # Export step: runs with the persistent volume claim mounted at /mnt.
    export_task = build_final_model_op(
        learning_rate, epsilon, clipnorm,
        preprocess_task.output, vectorize_task.output,
        train_task.outputs["train"],
    )
    final_container = export_task.add_pvolumes(
        {"/mnt": dsl.PipelineVolume(pvc="model-volume")}
    )

In [32]:
# Run configuration: every value below is forwarded to the pipeline as a
# run argument.

# Vectorization / embedding settings
max_features = 25_000
embedding_type = "glove"
embedding_dim = 300
sequence_length = 500

# Network architecture
hidden_layers = 1
rec_units = 256
dense_units = 256
dropout = 0.5

# Adam optimizer settings
learning_rate = 3e-5
epsilon = 1e-8
clipnorm = 1.0

# Training loop (`es` toggles early stopping)
epochs = 100
batch_size = 512
es = True

# Path-like pipeline arguments
load_data_path = "/mnt"
merge_data_path = "merge"
preprocess_data_path = "preprocess_data"
vectorize_data_path = "vectorize_data"
embedding_data_path = "embedding_data"
model_path = "model"
train_path = "train"

In [33]:
# Authenticate against the Kubeflow gateway and submit a pipeline run.
import getpass
import os

# Credentials must not live in the notebook: the password is read from the
# KUBEFLOW_PASSWORD environment variable, falling back to an interactive
# prompt. The user name can be overridden with KUBEFLOW_USERNAME.
USERNAME = os.environ.get("KUBEFLOW_USERNAME", "user@example.com")
PASSWORD = os.environ.get("KUBEFLOW_PASSWORD") or getpass.getpass(
    f"Kubeflow password for {USERNAME}: "
)
NAMESPACE = "kubeflow-user-example-com"
HOST = 'http://istio-ingressgateway.istio-system.svc.cluster.local:80'

# The initial GET is redirected to the login form; the POST below submits the
# credentials to that redirected URL.
session = requests.Session()
response = session.get(HOST)

headers = {
    "Content-Type": "application/x-www-form-urlencoded",
}

data = {"login": USERNAME, "password": PASSWORD}
session.post(response.url, headers=headers, data=data)

# A missing cookie means the login did not succeed; fail with a clear message
# instead of a bare KeyError.
session_cookie = session.cookies.get_dict().get("authservice_session")
if session_cookie is None:
    raise RuntimeError(
        f"Login to {HOST} failed: no 'authservice_session' cookie was returned. "
        "Check the user name and password."
    )

client = kfp.Client(
    host=f"{HOST}/pipeline",
    cookies=f"authservice_session={session_cookie}",
)

# Run arguments, keyed by the parameter names of `lstm_pipeline`.
arguments = {"epochs":epochs,
             "batch_size":batch_size,
             "es":es,
             "hidden_layers":hidden_layers,
             "rec_units":rec_units,
             "dense_units":dense_units,
             "dropout":dropout,
             "max_features":max_features,
             "learning_rate": learning_rate,
             "epsilon": epsilon,
             "clipnorm": clipnorm,
             "embedding_type":embedding_type,
             "embedding_dim":embedding_dim,
             "sequence_length":sequence_length,
             "vectorize_data_path":vectorize_data_path,
             "embedding_data_path":embedding_data_path,
             "load_data_path":load_data_path,
             "merge_data_path":merge_data_path,
             "preprocess_data_path":preprocess_data_path,
             "model_path":model_path,
             "train_path":train_path,
            }

client.create_run_from_pipeline_func(pipeline_func=lstm_pipeline, arguments=arguments)

RunPipelineResult(run_id=907939bd-9fc7-4f40-9720-212bb374ce20)