# This pipeline is meant to be run on a Kubernetes cluster only. Do not run it directly on your local machine, to avoid causing complications in your development environment.

# Import the needed libraries

In [None]:
# Upgrade pip in the user site-packages before installing anything else.
!python -m pip install --user --upgrade pip

# Download spaCy's small English model, which data_transformation loads by name.
# NOTE(review): spaCy itself is not part of the install list below — presumably
# it is preinstalled in the notebook image; confirm, otherwise this command fails.
!python -m spacy download en_core_web_sm

# Install the pinned versions of the data-science libraries into the user site-packages.
!pip3 install pandas==0.24.2 matplotlib==3.2.2 scipy==1.4.1 statsmodels==0.12.0 scikit-learn==0.23.1 tensorflow==2.1.0 keras==2.3.1 --user

In [None]:
# Install the Kubeflow Pipelines SDK (kfp) into the user site-packages,
# or upgrade it if it is already present.

!pip3 install kfp --upgrade --user

In [None]:
# Check that the install was successful: `which` prints the path of the
# `dsl-compile` executable if it is on PATH (it is expected to be provided
# by the kfp package installed above).

!which dsl-compile

In [None]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

# Directory where every step reads and writes its intermediate pickle files
# and where the final result.txt is stored.
# NOTE(review): this is a relative path, and the same value is later used as
# the volume mount path in spam_pipeline — confirm both resolve to the same
# directory inside the step containers.
data_path = "pipe_data"

# Data Ingestion

In [None]:
def data_injestion(data_path):
    """Download the two SMS spam CSV files and pickle them for the next step.

    All imports (and the pandas install) live inside the body because this
    function is also wrapped as a standalone container component with
    ``func_to_container_op``, where module-level names are not available.

    Args:
        data_path: Directory that receives the ``inj_data`` pickle. It is
            created if it does not exist yet.

    Returns:
        None. The tuple ``(data, data2)`` of DataFrames is written to
        ``{data_path}/inj_data``.
    """
    import os
    import pickle
    import subprocess
    import sys

    # pandas is installed at run time, presumably because the base image used
    # for the component does not provide it. The call is best-effort: a failed
    # install surfaces as an ImportError on the next line.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.24.2'])

    import pandas as pd

    data = pd.read_csv('https://raw.githubusercontent.com/Chizzy-codes/g02-sms-spam/master/data/spam.csv', usecols=['v1', 'v2'], encoding='latin-1')
    data2 = pd.read_csv('https://raw.githubusercontent.com/Chizzy-codes/g02-sms-spam/master/data/spam_additional.csv', usecols=['Text', 'v3'], encoding='latin-1')

    # Without this, a first run against a directory that does not exist yet
    # fails with FileNotFoundError in the open() below.
    os.makedirs(data_path, exist_ok=True)

    # Save the ingested data as a pickle file to be used by the data
    # transformation component.
    with open(f'{data_path}/inj_data', 'wb') as f:
        pickle.dump((data, data2), f)

In [None]:
# Run the ingestion step once in the notebook kernel. data_injestion has no
# return statement, so idata is None; the data lands in {data_path}/inj_data.
idata = data_injestion(data_path)

# Data Transformation

In [None]:
def data_transformation(data_path):
    """Turn the ingested SMS messages into padded integer sequences and labels.

    Reads the ``(data, data2)`` tuple pickled at ``{data_path}/inj_data``,
    builds a Keras ``Tokenizer`` vocabulary from the lemmatized, stop-word
    filtered messages of ``data``, encodes the messages as sequences padded
    or truncated to 150 tokens, and maps the labels ``ham``/``spam`` to 1/0.

    All imports and installs live inside the body because this function is
    also wrapped as a standalone container component with
    ``func_to_container_op``.

    Args:
        data_path: Directory holding ``inj_data``; ``preprocessed_data`` is
            written to the same directory.

    Returns:
        None. The tuple ``(padded, labels)`` of NumPy arrays is pickled to
        ``{data_path}/preprocessed_data``.
    """
    import subprocess
    import sys

    # Best-effort run-time installs; a failed install surfaces as an
    # ImportError / OSError on the corresponding import or load below.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.24.2'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'spacy'])
    # spacy.load('en_core_web_sm') needs the model package, which is a
    # separate download from the spaCy library itself.
    subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'])

    import pickle

    import numpy as np
    import spacy
    from spacy.lang.en.stop_words import STOP_WORDS
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.preprocessing.text import Tokenizer

    max_length = 150
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"

    # Load and unpack the ingested data. The additional dataset (second
    # element) is not used by this step.
    with open(f'{data_path}/inj_data', 'rb') as f:
        data, _ = pickle.load(f)

    # data_injestion loads spam.csv with its raw column names 'v1' and 'v2'.
    # NOTE(review): assumes v1 holds the ham/spam label and v2 the message
    # text — confirm against the CSV. Frames that already use the descriptive
    # names pass through unchanged.
    data = data.rename(columns={'v1': 'label', 'v2': 'message'})

    # NOTE(review): spaCy is installed unpinned; on spaCy v3 disabling the
    # tagger may degrade lemmatization — confirm against the version in use.
    nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

    # Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
    yoast_stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

    # Merge spaCy's stop words with the Yoast list, then reduce them to
    # lower-case lemmas so they compare equal to the lemmatized tokens.
    # A set keeps the per-token membership test constant-time.
    stop_words = list(set(list(STOP_WORDS) + yoast_stopwords))
    stopwords = {word.lemma_.lower() for word in nlp(' '.join(stop_words))}

    def tokenizer_spacy(text):
        """Return the lower-case lemmas of the alphabetic, non-stop-word tokens."""
        return [
            word.lemma_.lower()
            for word in nlp(text)
            if word.is_alpha and word.lemma_.lower() not in stopwords
        ]

    sentences = data['message']
    # Replace the target column with binary numbers. Only the label column is
    # mapped so a message that is literally "ham" or "spam" stays text.
    labels = data['label'].replace({'ham': 1, 'spam': 0})

    # Drop messages that consist solely of a stop word, then lemmatize and
    # stop-word filter the remaining documents.
    senten = [
        ' '.join(tokenizer_spacy(doc))
        for doc in sentences
        if doc not in stopwords
    ]

    # Build the word index from the cleaned corpus.
    tokenizer = Tokenizer(oov_token=oov_tok)
    tokenizer.fit_on_texts(senten)

    # Convert to numeric sequences and pad.
    # NOTE(review): the vocabulary is fit on the lemmatized corpus, but the
    # raw messages are what gets encoded. This keeps one row per label and is
    # left unchanged because what the saved model expects is not visible
    # here — confirm this is intended.
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    padded = np.array(padded)
    labels = np.array(labels)

    # Save the preprocessed data as a pickle file to be used by the model component.
    with open(f'{data_path}/preprocessed_data', 'wb') as f:
        pickle.dump((padded, labels), f)

In [None]:
# Run the transformation step once in the notebook kernel. data_transformation
# has no return statement, so new_data is None; the output is written to
# {data_path}/preprocessed_data.
new_data = data_transformation(data_path)

# Model Prediction

In [None]:
def predict(data_path):
    """Score the preprocessed messages with the saved Keras model.

    Loads ``model2.h5``, reads ``(padded, labels)`` from
    ``{data_path}/preprocessed_data``, rounds each model output to the
    nearest integer and pickles the resulting list of ints.

    All imports and installs live inside the body because this function is
    also wrapped as a standalone container component with
    ``func_to_container_op``.

    Args:
        data_path: Directory holding ``preprocessed_data``; ``result`` is
            written to the same directory.

    Returns:
        None. The list of integer predictions is pickled to
        ``{data_path}/result``.

    Raises:
        ValueError: If the model emits more than one value per input row.
    """
    import subprocess
    import sys

    # Best-effort run-time install; a failed install surfaces as an
    # ImportError on the keras import below.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'keras==2.3.1'])

    import pickle

    import numpy as np
    from keras.models import load_model

    # NOTE(review): relative path, resolved against the current working
    # directory rather than data_path — confirm the file is present wherever
    # this step runs.
    model = load_model('model2.h5')

    # Load and unpack the preprocessed data; the labels are not needed here.
    with open(f'{data_path}/preprocessed_data', 'rb') as f:
        padded, _ = pickle.load(f)

    scores = np.asarray(model.predict(padded))

    # Flatten to one score per row before converting: calling int() on a
    # one-element array is deprecated in newer NumPy releases, whereas
    # converting scalars is not. The reshape raises if a row holds more than
    # one value instead of silently mixing outputs.
    rounded = np.round(scores).reshape(len(scores))
    prediction = [int(value) for value in rounded]

    with open(f'{data_path}/result', 'wb') as f:
        pickle.dump(prediction, f)

    print('Successfully!')

In [None]:
# Run the prediction step once in the notebook kernel. predict has no return
# statement, so predicted is None; the predictions are written to
# {data_path}/result.
predicted = predict(data_path)

# Showing Result

In [None]:
def result(data_path):
    """Evaluate the stored predictions and write a human-readable report.

    Reads the labels from ``{data_path}/preprocessed_data`` and the
    predictions from ``{data_path}/result``, computes the F1 and precision
    scores, and writes both scores plus the raw predictions to
    ``{data_path}/result.txt``.

    All imports and installs live inside the body because this function is
    also wrapped as a standalone container component with
    ``func_to_container_op``.

    Args:
        data_path: Directory holding ``preprocessed_data`` and ``result``;
            ``result.txt`` is written to the same directory.

    Returns:
        None.
    """
    import subprocess
    import sys

    # The metrics come from scikit-learn, so that is what this step needs.
    # Best-effort install; a failure surfaces as an ImportError just below.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])

    import pickle

    from sklearn.metrics import f1_score, precision_score

    # Only the labels are needed from the preprocessed data.
    with open(f'{data_path}/preprocessed_data', 'rb') as f:
        _, labels = pickle.load(f)

    with open(f'{data_path}/result', 'rb') as f:
        prediction = pickle.load(f)

    # NOTE(review): data_transformation encodes ham as 1 and spam as 0, so
    # with scikit-learn's default pos_label=1 these scores describe the ham
    # class — confirm that is the intended positive class.
    f1 = 'F1 score: {:.4f}'.format(f1_score(labels, prediction))
    precision = 'Precision_score: {:.4f}'.format(precision_score(labels, prediction))

    evaluation = [f1, precision]

    with open(f'{data_path}/result.txt', 'w') as f:
        f.write(" Evaluation: {}".format(evaluation))
        f.write("\n")
        f.write(" Predictions: {}".format(prediction))

    print('Done!!!')


In [None]:
# Run the reporting step once in the notebook kernel. result has no return
# statement, so get is None; the report is written to {data_path}/result.txt.
get = result(data_path)

In [None]:
# Wrap each step function as a container component. All four steps use the
# same base image, so they are built in one pass and unpacked in step order.
inj_op, transformation_op, predict_op, result_op = (
    comp.func_to_container_op(step, base_image="tensorflow/tensorflow:latest-gpu-py3")
    for step in (data_injestion, data_transformation, predict, result)
)

# Build Kubeflow Pipelines

In [None]:
# Create a client to enable communication with the Pipelines API server.
# It is used further down to submit the pipeline run. No arguments are
# passed, so the connection settings are whatever kfp.Client() defaults to.
client = kfp.Client()

In [None]:
# Define the pipeline
@dsl.pipeline(
    name='Sms Spam Classification Pipeline',
    description=
    'A machine learning pipeline that makes predictions on whether on not the sms content of a csv file is spam or not.'
)
def spam_pipeline(data_path: str):
    # Pipeline parameter: data_path is both the argument handed to every step
    # and the path at which the shared volume is mounted in each container.

    # Volume through which the steps hand their files to one another.
    shared_volume_op = dsl.VolumeOp(
        name="create_volume",
        resource_name="data-volume",
        size="1Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    # Each step mounts the volume as left by the previous step, which chains
    # the steps in order: ingestion -> transformation -> prediction -> result.
    ingest_step = inj_op(data_path).add_pvolumes(
        {data_path: shared_volume_op.volume}
    )
    transform_step = transformation_op(data_path).add_pvolumes(
        {data_path: ingest_step.pvolume}
    )
    predict_step = predict_op(data_path).add_pvolumes(
        {data_path: transform_step.pvolume}
    )
    report_step = result_op(data_path).add_pvolumes(
        {data_path: predict_step.pvolume}
    )

    # Final step: print the report file written by the result step.
    print_step = dsl.ContainerOp(
        name="print_validation_result",
        image='library/bash:4.4.23',
        pvolumes={data_path: report_step.pvolume},
        arguments=['cat', f'{data_path}/result.txt'],
    )

In [None]:
# Value passed to the pipeline's data_path parameter when the run is submitted.
DATA_PATH = data_path

# The pipeline function to compile and run.
pipeline_func = spam_pipeline

In [None]:
# Experiment and run names; the run name is derived from the pipeline
# function's name.
experiment_name = 'spam_classification_kubeflow'
run_name = pipeline_func.__name__ + ' run'

# Values for the pipeline parameters.
arguments = {"data_path": DATA_PATH}

# Compile the pipeline into a zip package named after the experiment.
package_path = '{}.zip'.format(experiment_name)
kfp.compiler.Compiler().compile(pipeline_func, package_path)

# Submit a run directly from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    arguments=arguments,
    run_name=run_name,
    experiment_name=experiment_name,
)