<a href="https://colab.research.google.com/github/Jyoti1706/Data-Science-Project/blob/master/LSTM_TS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Seed NumPy's global random number generator with a fixed value
# (presumably so that repeated runs of the notebook are reproducible).
# NOTE(review): only NumPy is seeded here; nothing in this cell seeds
# TensorFlow, and the pipeline steps are packaged as separate components,
# so confirm whether this seed actually reaches the training step.
from numpy.random import seed
seed(1)

In [27]:
def prepare_data(trainpath, testpath):
    """Turn the raw sales CSVs into a supervised-learning frame for the LSTM.

    Reads the train and test CSVs, stores copies of both under ``data/`` for
    the later steps, then converts the 2017-and-later part of the training
    data into sliding windows and writes them to ``data/final_df.csv``.

    Each output row holds 29 past sales values, the current sales value and
    the sales value ``lag`` rows ahead, where ``lag`` is the number of days
    between the latest test date and the latest train date.

    Args:
        trainpath: Path of the training CSV. It must contain the columns
            ``date``, ``store``, ``item`` and ``sales``.
        testpath: Path of the test CSV. Only its ``date`` column is used here.

    Returns:
        None. The results are written to ``data/train_df.csv``,
        ``data/test_df.csv`` and ``data/final_df.csv``.
    """
    # Imported locally (and the helper below is nested) because this function
    # is packaged on its own via kfp's create_component_from_func.
    import pandas as pd

    train = pd.read_csv(trainpath, parse_dates=['date'])
    test = pd.read_csv(testpath, parse_dates=['date'])
    train.to_csv('data/train_df.csv', index=False)
    test.to_csv('data/test_df.csv', index=False)
    print("\n ---- data csv is saved to PV location /data/train_df.csv ----")

    # Only the most recent data is used for training.
    train = train[train['date'] >= '2017-01-01']

    # One row per (item, store, date); groupby sorts by those keys, so the
    # rows of each (item, store) series are contiguous and in date order.
    train_gp = (
        train.sort_values('date')
        .groupby(['item', 'store', 'date'], as_index=False)['sales']
        .mean()
    )

    def series_to_supervised(data, window=1, lag=1, dropnan=True):
        """Lay out ``window`` past rows, the current row and the row ``lag``
        steps ahead side by side, one output row per time step."""
        cols, names = [], []
        # Input sequence (t-window, ..., t-1)
        for i in range(window, 0, -1):
            cols.append(data.shift(i))
            names += ['%s(t-%d)' % (col, i) for col in data.columns]
        # Current timestep (t)
        cols.append(data)
        names += ['%s(t)' % col for col in data.columns]
        # Target timestep (t+lag)
        cols.append(data.shift(-lag))
        names += ['%s(t+%d)' % (col, lag) for col in data.columns]

        agg = pd.concat(cols, axis=1)
        agg.columns = names
        # Rows at either end have no complete window/target after shifting.
        if dropnan:
            agg = agg.dropna()
        return agg

    window = 29
    lag = (test['date'].max().date() - train['date'].max().date()).days
    series = series_to_supervised(
        train_gp.drop('date', axis=1), window=window, lag=lag
    )

    # Shifting runs across series boundaries, so keep a row only when its
    # oldest input, its current step AND its target all belong to the same
    # (store, item); otherwise the label would come from another series.
    same_series = (
        (series['store(t)'] == series['store(t-%d)' % window])
        & (series['item(t)'] == series['item(t-%d)' % window])
        & (series['store(t)'] == series['store(t+%d)' % lag])
        & (series['item(t)'] == series['item(t+%d)' % lag])
    )
    series = series[same_series]

    # The identifiers were only needed for the filter above; the model is
    # fed sales values alone.
    columns_to_drop = ['%s(t+%d)' % (col, lag) for col in ['item', 'store']]
    for i in range(window, 0, -1):
        columns_to_drop += ['%s(t-%d)' % (col, i) for col in ['item', 'store']]
    columns_to_drop += ['item(t)', 'store(t)']
    series = series.drop(columns_to_drop, axis=1)

    series.to_csv('data/final_df.csv', index=False)


In [28]:
# Train/validation split
# The label column is separated from the feature columns before splitting.
def train_test_split():
    # Splits the supervised frame written to data/final_df.csv into training
    # and validation arrays and saves the four arrays as .npy files in data/.
    # The label column is 'sales(t+<lag>)', where <lag> is the number of days
    # between the latest date in the saved test copy and the latest date in
    # the saved train copy.
    import pandas as pd
    import numpy as np
    # Aliased so the imported helper is not confused with this function.
    from sklearn.model_selection import train_test_split as split_arrays

    print("---- Inside train_test_split component ----")
    frame = pd.read_csv('data/final_df.csv')
    train_df = pd.read_csv('data/train_df.csv', parse_dates=['date'])
    test_df = pd.read_csv('data/test_df.csv', parse_dates=['date'])

    last_test_day = test_df['date'].max().date()
    last_train_day = train_df['date'].max().date()
    horizon = (last_test_day - last_train_day).days
    target_name = 'sales(t+%d)' % horizon

    targets = frame[target_name].values
    inputs = frame.drop(target_name, axis=1)

    # 60% of the rows go to training, 40% to validation; the fixed
    # random_state keeps the split the same from run to run.
    X_train, X_valid, Y_train, Y_valid = split_arrays(
        inputs, targets, test_size=0.4, random_state=0
    )
    print('Train set shape', X_train.shape)
    print('Validation set shape', X_valid.shape)

    outputs = (
        ('X_train', X_train),
        ('X_valid', X_valid),
        ('y_train', Y_train),
        ('y_valid', Y_valid),
    )
    for stem, payload in outputs:
        np.save('data/%s.npy' % stem, payload)

In [29]:
# Training LSTM Model
def trainModel():
    """Train a one-layer LSTM on the saved split and save the fitted model.

    Loads the four ``.npy`` arrays from ``data/``, reshapes the feature
    matrices to ``(rows, columns, 1)`` so every column becomes one time step
    with a single feature, fits the model for 40 epochs with Adam
    (learning rate 0.0003) on an MSE loss, and writes it to
    ``data/model_lstm.h5``.

    Returns:
        None. The trained model is written to disk.
    """
    import numpy as np
    from tensorflow.keras import optimizers
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, LSTM

    epochs = 40
    lr = 0.0003
    adam = optimizers.Adam(learning_rate=lr)

    # NOTE(review): allow_pickle=True is kept from the original; it is only
    # needed if the arrays were saved with object dtype. These files are
    # produced by the previous step, not by an external party.
    X_train = np.load('data/X_train.npy', allow_pickle=True)
    y_train = np.load('data/y_train.npy', allow_pickle=True)
    X_valid = np.load('data/X_valid.npy', allow_pickle=True)
    y_valid = np.load('data/y_valid.npy', allow_pickle=True)

    # np.load returns plain ndarrays (not DataFrames), so they are reshaped
    # directly; an ndarray has no `.values` attribute.
    X_train_series = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_valid_series = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1))
    print('Train set shape', X_train_series.shape)
    print('Validation set shape', X_valid_series.shape)

    model_lstm = Sequential()
    model_lstm.add(
        LSTM(
            50,
            activation='relu',
            input_shape=(X_train_series.shape[1], X_train_series.shape[2]),
        )
    )
    model_lstm.add(Dense(1))
    model_lstm.compile(loss='mse', optimizer=adam)

    model_lstm.fit(
        X_train_series,
        y_train,
        validation_data=(X_valid_series, y_valid),
        epochs=epochs,
        verbose=2,
    )
    model_lstm.save('data/model_lstm.h5')


In [30]:
def predict_on_test_data():
    """Report the saved LSTM's RMSE on the training and validation arrays.

    Despite its name, this step does not read the test CSV: it loads the
    train/validation ``.npy`` arrays and ``data/model_lstm.h5`` from
    ``data/``, predicts on both feature sets and prints the two RMSE values.

    Returns:
        None. The metrics are printed.
    """
    import numpy as np
    # Both names were used without being imported, which raised NameError.
    from sklearn.metrics import mean_squared_error
    from tensorflow import keras

    print("---- Inside predict_on_test_data component ----")
    X_train = np.load('data/X_train.npy', allow_pickle=True)
    y_train = np.load('data/y_train.npy', allow_pickle=True)
    X_valid = np.load('data/X_valid.npy', allow_pickle=True)
    y_valid = np.load('data/y_valid.npy', allow_pickle=True)

    # np.load returns plain ndarrays, so they are reshaped directly to
    # (rows, columns, 1); an ndarray has no `.values` attribute.
    X_train_series = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_valid_series = X_valid.reshape((X_valid.shape[0], X_valid.shape[1], 1))

    model_lstm = keras.models.load_model('data/model_lstm.h5')
    lstm_train_pred = model_lstm.predict(X_train_series)
    lstm_valid_pred = model_lstm.predict(X_valid_series)
    print('Train rmse:', np.sqrt(mean_squared_error(y_train, lstm_train_pred)))
    print('Validation rmse:', np.sqrt(mean_squared_error(y_valid, lstm_valid_pred)))

# Kubeflow (KF) Pipeline

In [43]:
#pip install kfp==1.8.22

In [31]:
import kfp
import kfp.components as comp
import requests
import kfp.dsl as dsl

In [32]:
# Wrap each step function as a pipeline component. Every component uses the
# python:3.7 base image and installs the listed packages.
create_step_prepare_data = kfp.components.create_component_from_func(
    func=prepare_data,
    base_image='python:3.7',
    packages_to_install=['pandas==1.2.4','numpy==1.21.0', 'scikit-learn==0.24.2']
)
create_step_train_test_split = kfp.components.create_component_from_func(
    func=train_test_split,
    base_image='python:3.7',
    packages_to_install=['pandas==1.2.4','numpy==1.21.0','scikit-learn==0.24.2']
)
# The training and evaluation steps import tensorflow, so it has to be
# installed for them as well.
# NOTE(review): 2.11.0 is chosen as a release that still ships Python 3.7
# wheels and accepts numpy 1.21 - confirm against the cluster's image.
create_step_training_model = kfp.components.create_component_from_func(
    func=trainModel,
    base_image='python:3.7',
    packages_to_install=['pandas==1.2.4','numpy==1.21.0','scikit-learn==0.24.2','tensorflow==2.11.0']
)
create_step_predict_on_test_data = kfp.components.create_component_from_func(
    func=predict_on_test_data,
    base_image='python:3.7',
    packages_to_install=['pandas==1.2.4','numpy==1.21.0','scikit-learn==0.24.2','tensorflow==2.11.0']
)

In [40]:
# Define the pipeline
@dsl.pipeline(
   name='Time Series Forecasting Kubeflow Demo Pipeline',
   description='A sample pipeline that performs Time Series Forecasting'
)
# data_path is the pipeline parameter; it is used as the mount path of the
# shared volume in every step.
def TimeSeries_LSTM_pipeline(data_path: str):
    # Volume shared by all steps so that files written by one step can be
    # read by the next.
    volume_op = dsl.VolumeOp(
        name="t-vol",
        resource_name="t-vol",
        size="2Gi",
        modes=dsl.VOLUME_MODE_RWO,
    )

    def mount_shared_volume(task):
        # Attach the shared volume to `task` at `data_path`.
        return task.add_pvolumes({data_path: volume_op.volume})

    # The four steps run strictly one after another:
    # prepare -> split -> train -> evaluate.
    prepare_step = mount_shared_volume(
        create_step_prepare_data(
            testpath="data/test.csv",
            trainpath="data/train.csv",
        )
    )
    split_step = mount_shared_volume(
        create_step_train_test_split()
    ).after(prepare_step)
    training_step = mount_shared_volume(
        create_step_training_model()
    ).after(split_step)
    evaluation_step = mount_shared_volume(
        create_step_predict_on_test_data()
    ).after(training_step)

    # "P0D" is set as the maximum cache staleness on every step
    # (presumably so that cached results are never reused - confirm).
    for step in (prepare_step, split_step, training_step, evaluation_step):
        step.execution_options.caching_strategy.max_cache_staleness = "P0D"



In [42]:
# Compile the pipeline function into the package file
# 'TimeSeries_LSTM_pipeline1.yaml'.
kfp.compiler.Compiler().compile(
    TimeSeries_LSTM_pipeline,
    'TimeSeries_LSTM_pipeline1.yaml',
)