In [1]:
import datetime
import os
import warnings
warnings.filterwarnings("ignore")
import kfp
import kfp.dsl as dsl
from kfp.components import create_component_from_func,InputPath,OutputPath,func_to_container_op
from kubernetes.client.models import V1EnvVar

In [2]:
# Proxy endpoint; the same URL is used for both the HTTP and HTTPS settings.
http_proxy = https_proxy = "http://hpeproxy.its.hpecorp.net:443"

# Export both proxy settings into this process's environment in one step.
os.environ.update(HTTP_PROXY=http_proxy, HTTPS_PROXY=https_proxy)

In [31]:
def read_data(output_csv:OutputPath(),
              csv_url: str) -> None:
    """Load a CSV from ``csv_url`` and re-emit it as this step's output file.

    Args:
        output_csv: Destination path the CSV (without the index column) is
            written to.
        csv_url: Location handed straight to ``pandas.read_csv``.
    """
    import pandas as pd

    frame = pd.read_csv(csv_url)
    # Preview of the first rows, printed to the step's stdout.
    print(frame.head())

    with open(output_csv, "w") as sink:
        frame.to_csv(sink, index=False)


def preprocess_data(input_data: InputPath(),
                    output_csv:OutputPath(),
                    scale_factor:OutputPath(),
                    start_date: str ,
                    end_date: str) -> None:
    """Restrict the data to a date window, fill calendar gaps and scale it.

    The CSV at ``input_data`` is filtered to rows whose ``Date`` lies between
    ``start_date`` and ``end_date`` (inclusive), left-joined onto a calendar
    of every day in that window, forward-filled, reduced to the ``Open`` and
    ``Close`` columns and scaled with a ``MinMaxScaler``.

    Args:
        input_data: Path of the CSV to read; must contain ``Date``, ``Open``
            and ``Close`` columns.
        output_csv: Path the scaled ``Open``/``Close`` values are written to
            (the date index is not written).
        scale_factor: Path the fitted scaler is pickled to.
        start_date: First day of the window; compared against the raw
            ``Date`` column values as read from the CSV.
        end_date: Last day of the window.
    """
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    import pickle

    with open(input_data) as src:
        raw = pd.read_csv(src)

    # Window filter happens before the Date column is parsed.
    in_window = (raw["Date"]>=start_date) & (raw["Date"]<=end_date)
    prices = raw.loc[in_window]
    prices['Date'] = pd.to_datetime(prices['Date'],format = "%Y-%m-%d").dt.date
    prices = prices.set_index("Date")
    prices.index.freq = 'D'

    # One row per calendar day of the window, keyed the same way as `prices`.
    all_days = pd.date_range(start=start_date, end=end_date)
    calendar = pd.DataFrame(all_days,columns=["Date"])
    calendar['Date'] = calendar['Date'].dt.date
    calendar = calendar.set_index("Date")
    calendar.index.freq = 'D'

    # Days missing from the input inherit the previous available row; rows
    # that still contain a NaN afterwards are discarded.
    joined = calendar.merge(prices, how='left', left_index=True, right_index=True)
    filled = joined.fillna(method = 'ffill')
    filled = filled.dropna()
    df = filled[['Open','Close']]
    print(df.head())

    # NOTE(review): the scaler is fit on the whole window here, i.e. before
    # the later train/test split - confirm this is intended.
    scale = MinMaxScaler()
    df[df.columns] = scale.fit_transform(df)

    with open(scale_factor,'wb') as scaler_sink:
        pickle.dump(scale, scaler_sink)

    with open(output_csv,"w") as csv_sink:
        df.to_csv(csv_sink, index=False)

    print("preprocessed the data and scale object saved")
    

def get_train_test_data(input_data: InputPath(),
                        train_percent:float,
                        train_data: OutputPath(),
                        test_data: OutputPath()) -> None:
    """Split a CSV into a leading training part and a trailing test part.

    Row order is preserved: the first ``round(len * train_percent)`` rows go
    to the training file and the remainder to the test file.

    Args:
        input_data: Path of the CSV to split.
        train_percent: Fraction of rows assigned to the training part.
        train_data: Path the training rows are written to (no index column).
        test_data: Path the remaining rows are written to (no index column).
    """
    import pandas as pd

    with open(input_data) as src:
        frame = pd.read_csv(src)

    cut = round(len(frame) * train_percent)
    leading = frame.iloc[:cut]
    trailing = frame.iloc[cut:]
    print(leading.head())
    print(trailing.head())

    # Training part is written first, then the test part.
    for part, destination in ((leading, train_data), (trailing, test_data)):
        with open(destination, "w") as sink:
            part.to_csv(sink, index=False)

        
def create_sequence(input_data: InputPath(),
                    seq_data: OutputPath(),
                    label_data: OutputPath(),
                    seq_len: int = 60) -> None:
    """Turn a CSV of rows into sliding windows and their next-row labels.

    For every position ``i`` with ``seq_len <= i < len(rows)`` the window is
    rows ``i - seq_len .. i - 1`` and the label is row ``i``.

    Args:
        input_data: Path of the CSV to read.
        seq_data: Path the stacked windows are saved to with ``numpy.save``.
        label_data: Path the stacked labels are saved to with ``numpy.save``.
        seq_len: Number of consecutive rows per window. Defaults to 60, the
            value that used to be hard-coded.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    import pandas as pd
    import numpy as np

    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # Read through the opened handle, as the other components do, instead of
    # opening the file and then reading it a second time by path.
    with open(input_data,'r') as f:
        dataset = pd.read_csv(f)

    # Slice one ndarray rather than building a DataFrame per window.
    values = dataset.to_numpy()
    stops = range(seq_len, len(values))
    sequences = [values[stop - seq_len:stop] for stop in stops]
    labels = [values[stop] for stop in stops]

    # With seq_len or fewer rows both lists are empty and empty arrays are
    # saved, exactly as before.
    with open(seq_data,'wb') as f:
        np.save(f,np.array(sequences))

    with open(label_data,'wb') as f:
        np.save(f,np.array(labels))
        
        

def model_training(
                   train_seq_data: InputPath(),
                   train_label_data: InputPath(),
                   test_seq_data: InputPath(),
                   test_label_data: InputPath(),
                   loss: str,
                   optimizer: str,
                   metrics: str,
                   epochs:int,
                   trained_model: OutputPath()) -> None:
    """Fit a two-layer LSTM on the training windows and pickle the result.

    Args:
        train_seq_data: Path of the ``.npy`` file with training windows.
        train_label_data: Path of the ``.npy`` file with training labels.
        test_seq_data: Path of the ``.npy`` file with windows used as
            validation data during fitting.
        test_label_data: Path of the ``.npy`` file with the matching labels.
        loss: Loss identifier passed to ``model.compile``.
        optimizer: Optimizer identifier passed to ``model.compile``.
        metrics: Single metric identifier; wrapped in a list for
            ``model.compile``.
        epochs: Number of training epochs.
        trained_model: Path the fitted model is pickled to.
    """
    print("loading libraries")
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, LSTM
    import pickle
    import numpy as np

    print("loaded libraries")

    def _read_array(path):
        # Each input is a single array written with numpy.save.
        with open(path,'rb') as handle:
            return np.load(handle)

    train_seq = _read_array(train_seq_data)
    train_label = _read_array(train_label_data)
    test_seq = _read_array(test_seq_data)
    test_label = _read_array(test_label_data)

    # Input shape is taken from the data: (rows per window, columns per row).
    window_shape = (train_seq.shape[1],train_seq.shape[2])
    model = Sequential([
        LSTM(units=60, return_sequences=True, input_shape = window_shape),
        Dropout(0.1),
        LSTM(units=60),
        Dense(2),  # two output values per prediction
    ])
    model.compile(loss=loss, optimizer=optimizer, metrics=[metrics])

    print("Training the model...")
    model.fit(
        train_seq,
        train_label,
        epochs=epochs,
        validation_data=(test_seq, test_label),
        verbose=1,
    )
    print("Training done")

    with open(trained_model,'wb') as sink:
        pickle.dump(model, sink)

        
def model_evaluate(trained_model:InputPath(),
                   test_seq_data:InputPath(),
                   test_label_data: InputPath(),
                   scale_factor:InputPath(),
                   model_metrices: OutputPath())-> None:
    """Score the trained model on the test windows in the unscaled domain.

    Predictions and labels are both passed through the scaler's
    ``inverse_transform`` before MAE and RMSE are computed.

    Args:
        trained_model: Path of the pickled model.
        test_seq_data: Path of the ``.npy`` file with test windows.
        test_label_data: Path of the ``.npy`` file with test labels.
        scale_factor: Path of the pickled scaler.
        model_metrices: Path the metrics dictionary is pickled to.
    """
    import pickle
    import numpy as np
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    def _unpickle(path):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # these paths are expected to be artifacts of earlier steps.
        with open(path,'rb') as handle:
            return pickle.load(handle)

    def _read_array(path):
        with open(path,'rb') as handle:
            return np.load(handle)

    model = _unpickle(trained_model)
    scale = _unpickle(scale_factor)
    test_seq = _read_array(test_seq_data)
    test_label = _read_array(test_label_data)

    # Undo the scaling on both sides so errors are in the original units.
    predicted = scale.inverse_transform(model.predict(test_seq))
    actual = scale.inverse_transform(test_label)

    mae = mean_absolute_error(actual, predicted)
    rmse = mean_squared_error(actual, predicted, squared=False)

    metric = {"Mean Absolute Error":mae,"Root Mean Square Error":rmse}
    print(metric)

    with open(model_metrices,'wb') as sink:
        pickle.dump(metric, sink)

    
    

def logg_env_function():
    """Log the HTTP_PROXY and HTTPS_PROXY values seen by this process.

    Either value is logged as ``None`` when the variable is not set.
    """
    import os
    import logging
    logging.basicConfig(level=logging.INFO)
    env_http = os.getenv('HTTP_PROXY')
    env_https = os.getenv('HTTPS_PROXY')
    # One placeholder per value: with a single placeholder the HTTPS value
    # was silently dropped from the message.
    logging.info(
        'The environment variables are: HTTP_PROXY=%s, HTTPS_PROXY=%s',
        env_http,
        env_https,
    )
        

# Values shared by the component definitions below.
_BASE_IMAGE = 'python:3.8'
_PANDAS = 'pandas==1.2.4'
_NUMPY = 'numpy==1.22.4'
_SKLEARN = 'scikit-learn==0.24.2'
_TENSORFLOW = 'tensorflow-cpu==2.7.4'

# Diagnostic component: needs nothing beyond the base image.
logg_env_function_op = create_component_from_func(logg_env_function, base_image=_BASE_IMAGE)

# One component per pipeline stage, each installing only what its function imports.
step_read_data = create_component_from_func(
    read_data,
    base_image=_BASE_IMAGE,
    packages_to_install=[_PANDAS],
)

step_preprocess_data = create_component_from_func(
    preprocess_data,
    base_image=_BASE_IMAGE,
    packages_to_install=[_PANDAS, _SKLEARN],
)

step_get_train_test_data = create_component_from_func(
    get_train_test_data,
    base_image=_BASE_IMAGE,
    packages_to_install=[_PANDAS],
)

step_create_sequence = create_component_from_func(
    create_sequence,
    base_image=_BASE_IMAGE,
    packages_to_install=[_PANDAS, _NUMPY],
)

step_model_training = create_component_from_func(
    model_training,
    base_image=_BASE_IMAGE,
    packages_to_install=[_NUMPY, _TENSORFLOW],
)

step_model_evaluate = create_component_from_func(
    model_evaluate,
    base_image=_BASE_IMAGE,
    packages_to_install=[_NUMPY, _SKLEARN, _TENSORFLOW],
)


In [30]:
@dsl.pipeline(
  name='Stock Price Prediction Pipeline',
  description='Stock Price Prediction Pipeline for POC'
)
def stock_pipeline(csv_url:str,
                   start_date:str,
                   end_date:str,
                   train_percent:float,
                   loss: str,
                   optimizer: str,
                   metrics: str,
                   epochs:int):
    """Chain the read, preprocess, split, sequence, train and evaluate steps.

    Every task gets the HTTPS_PROXY and HTTP_PROXY environment variables.

    Args:
        csv_url: Source CSV location for the read step.
        start_date: First day kept by the preprocess step.
        end_date: Last day kept by the preprocess step.
        train_percent: Fraction of rows used for training.
        loss: Loss identifier for the training step.
        optimizer: Optimizer identifier for the training step.
        metrics: Metric identifier for the training step.
        epochs: Number of training epochs.
    """
    proxy_url = 'http://hpeproxy.its.hpecorp.net:443'
    # Applied in this order: HTTPS first, then HTTP.
    proxy_env = (
        V1EnvVar(name='HTTPS_PROXY', value=proxy_url),
        V1EnvVar(name='HTTP_PROXY', value=proxy_url),
    )

    def behind_proxy(task):
        # Attach both proxy variables and hand the same task back.
        for variable in proxy_env:
            task = task.add_env_variable(variable)
        return task

    # Stand-alone task that logs the proxy variables; nothing depends on it.
    behind_proxy(logg_env_function_op())

    #STEP1: Task to read the file from source
    read_data_task = behind_proxy(step_read_data(csv_url=csv_url))

    #STEP2: Task to preprocess the data
    preprocess_data_task = behind_proxy(
        step_preprocess_data(read_data_task.outputs['output_csv'],
                             start_date=start_date,
                             end_date=end_date))
    preprocess_data_task.after(read_data_task)

    #STEP3: Task to split the data into training and test
    train_test_task = behind_proxy(
        step_get_train_test_data(preprocess_data_task.outputs['output_csv'],
                                 train_percent=train_percent))
    train_test_task.after(preprocess_data_task)

    #STEP4 - a: Task to create the input sequence and output label from training data
    train_seq_task = behind_proxy(step_create_sequence(train_test_task.outputs['train_data']))
    train_seq_task.after(train_test_task)

    #STEP4 - b: Task to create the input sequence and output label from test data
    test_seq_task = behind_proxy(step_create_sequence(train_test_task.outputs['test_data']))
    test_seq_task.after(train_test_task)

    #STEP5: Task to define and train the model
    task_model_training = behind_proxy(
        step_model_training(train_seq_task.outputs['seq_data'],
                            train_seq_task.outputs['label_data'],
                            test_seq_task.outputs['seq_data'],
                            test_seq_task.outputs['label_data'],
                            loss = loss,
                            optimizer = optimizer,
                            metrics = metrics,
                            epochs = epochs))
    task_model_training.after(train_seq_task)
    task_model_training.after(test_seq_task)

    #STEP6: Task to evaluate the model on different metrices
    model_evaluate_task = behind_proxy(
        step_model_evaluate(task_model_training.outputs['trained_model'],
                            test_seq_task.outputs['seq_data'],
                            test_seq_task.outputs['label_data'],
                            preprocess_data_task.outputs['scale_factor']))
    model_evaluate_task.after(task_model_training)

    # To disable cache
    # read_data_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # preprocess_data_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # train_test_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # train_seq_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # test_seq_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # task_model_training.execution_options.caching_strategy.max_cache_staleness = "P0D"
    # model_evaluate_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    
    
    

# Experiment is named after today's date; the run after the pipeline function.
experiment_name = f"stock-pipeline {datetime.date.today()}"
run_name = f"{stock_pipeline.__name__}_run"
# NOTE(review): `namespace` is assigned but not passed to the client call
# below - confirm whether the run is meant to target this namespace.
namespace='hpedemo-user01'

# Pipeline parameters; every value is supplied as a string.
arguments = dict(
    csv_url="https://raw.githubusercontent.com/snairharikrishnan/test/main/ASIANPAINT.csv",
    start_date="2014-01-01",
    end_date="2019-12-31",
    train_percent="0.80",
    loss="mean_squared_error",
    optimizer="adam",
    metrics="mean_absolute_error",
    epochs="5",
)

# Submit the pipeline function as a run.
client = kfp.Client()
run_result = client.create_run_from_pipeline_func(
    pipeline_func=stock_pipeline,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)