In [None]:
"""
UNCOMMENT BELOW CODES AND RUN
"""

In [1]:
#!python -m pip install --user --upgrade pip
#!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.3 keras==2.4.3 --user

In [2]:
#!pip3 install kfp --upgrade --user

In [3]:
#!pip install statsmodels

# Pipeline Archetecture
## Components
1. Data Ingestion
2. Data Wrangling
3. Data Cleaning
4. Data Preprocessing
5. Data Modeling
6. Model Deploy
6. Model Servicing
6. Evaluation metrics

# Kubeflow Output Directory

In [4]:
!pwd

/home/jovyan


In [5]:
output_dir = "/home/jovyan/stage-f-05-electric-power/data/out"
#input_path= "https://storage.cloud.google.com/chrisc-bucket/household_power_consumption.zip"

# Data Ingestion Function

In [6]:
def data_ingestion(input_path, output_path):
    import subprocess
    import pickle
    import sys
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "scikit-learn"])

    import pandas as pd
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_row', None)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    
    
    uci_path = "http://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip"
    download_url = input_path if input_path else uci_path
    
    subprocess.run(["wget", "-O", "household_power_consumption.zip", download_url])
    subprocess.run(["unzip", "household_power_consumption.zip"])
    subprocess.run(["unzip", "*.zip"])
    
    print("======Successfully Unzipped the dataset file======")
    
    subprocess.call(["rm", "-r", "*.zip"])
    data_load = pd.read_csv("household_power_consumption.txt", delimiter=';', parse_dates = {"datetime":[0,1]},
                 low_memory = False, infer_datetime_format=True)
    
    print("======Successfully loaded the dataset file======")
    
    with open(f"{output_path}/datai.pkl", "wb") as data:
        pickle.dump(data_load, data)
    
    
    print("=====Data Ingestion Finished Successfully=======")
    return data_load.shape
    

In [7]:
data_ingestion("",output_dir)



(2075259, 8)

# Data Wrangling

In [8]:
def data_wrangling(output_path):
    import subprocess, pickle, sys
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "numpy", "scikit-learn"])
    
    import numpy as np
    import pandas as pd
    
    with open(f"{output_path}/datai.pkl", 'rb') as f:
        data = pickle.load(f)
    print(data.columns)
        
    print("=====Renaming the dataset columns=====")   
    
    cols = {'Global_active_power': 'Global_active_power(KW)',
            'Global_reactive_power':'Global_reactive_power(KW)',
            'Voltage': 'Voltage(V)',
            'Global_intensity': 'Global_intensity(Ampere)',
            'Sub_metering_1': 'Sub_metering_1(WH)',
            'Sub_metering_2': 'Sub_metering_2(WH)',
            'Sub_metering_3': 'Sub_metering_3(WH)'
       }
    data.rename(cols, axis = 1, inplace = True)
    print("=====Dataset columns rename successfully=====","\n")
    
    print("....................................................................")
    print("=====Setting the datetime column to pandas datetime=====")
    print("....................................................................")
    
    data["datetime"]  = pd.to_datetime(data["datetime"])
    data["Year"] = data["datetime"].apply(lambda X:X.year)
    data["Quarter"] = data["datetime"].apply(lambda X:X.quarter)
    data["Month"]  = data["datetime"].apply(lambda X:X.month)
    data['Month_name'] = data["datetime"].apply(lambda X:X.month_name())
    data["dayofweek"] = data["datetime"].apply(lambda X:X.weekday())
    data['day_name'] = data['datetime'].apply(lambda X:X.day_name())
    data = data.set_index("datetime")
    
    print("=====Datetime column successfully converted to pandas datetime=====")
    print("................................................................... ","\n")
    
    print(f"The number of rows and columns is now {data.shape[0]} and {data.shape[1]} respectively","\n")
    
    print("===Unique values in Year, Quarter, Month, Month_name, dayofweek and day_name columns===")
    print("Year column unique values:", data["Year"].unique())
    print("Quater column unique values:",data["Quarter"].unique())
    print("Month column unique values:", data["Month"].unique())
    print("Month_name column unique values:", data['Month_name'].unique())
    print("dayofweek column unique values:", data['dayofweek'].unique())
    print("day_name column unique values:", data['day_name'].unique())
    
    with open(f"{output_path}/dataw.pkl", "wb") as wrangled_data:
        pickle.dump(data,wrangled_data)
    
    print("=====Data Wrangling Finished Successfully=======")
    return data.shape
    

In [9]:
data_wrangling(output_dir)

Index(['datetime', 'Global_active_power', 'Global_reactive_power', 'Voltage',
       'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')
=====Renaming the dataset columns=====
=====Dataset columns rename successfully===== 

....................................................................
=====Setting the datetime column to pandas datetime=====
....................................................................
=====Datetime column successfully converted to pandas datetime=====
...................................................................  

The number of rows and columns is now 2075259 and 13 respectively 

===Unique values in Year, Quarter, Month, Month_name, dayofweek and day_name columns===
Year column unique values: [2006 2007 2008 2009 2010]
Quater column unique values: [4 1 2 3]
Month column unique values: [12  1  2  3  4  5  6  7  8  9 10 11]
Month_name column unique values: ['December' 'January' 'February' 'March' '

(2075259, 13)

# Data Cleaning
1. Handling Null entries
2. Checking and droping duplicate entries

In [10]:
def data_cleaning(output_path):
    import subprocess, sys, pickle
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "scikit-learn"])
    
    import numpy as np
    import pandas as pd
    
    with open(f"{output_path}/dataw.pkl", "rb") as dataset:
        data = pickle.load(dataset)
    
    #Replace ? with np.nan
    data.replace('?', np.nan, inplace = True)
    colums = ['Global_active_power(KW)', 'Global_reactive_power(KW)', 'Voltage(V)',
              'Global_intensity(Ampere)','Sub_metering_1(WH)', 'Sub_metering_2(WH)', 'Sub_metering_3(WH)']
    for cols in colums:
        print('==========',cols,'==========')
        data[cols].fillna(value = 1, inplace = True)
    pass
    
    print("=====Checking for Duplicated Entries=====","\n")
    print(f'Duplicate check shows there are some dupliacte entries with shape: {data[data.duplicated()].shape}')
    
    print("=====Dropping Duplicated Entries=====","\n")
    
    data = data.drop_duplicates()
    print("=====Duplicated Entries Successfully Dropped=====")
    
    with open(f"{output_path}/datac.pkl", "wb") as cleaned_data:
        pickle.dump(data, cleaned_data)
    
    print("=====Data Cleaning Finished Successfully=======")
    return data.shape
    

In [11]:
data_cleaning(output_dir)

=====Checking for Duplicated Entries===== 

Duplicate check shows there are some dupliacte entries with shape: (56666, 13)
=====Dropping Duplicated Entries===== 

=====Duplicated Entries Successfully Dropped=====


(2018593, 13)

# EDA and Stationarity Check
1. EDA
2. Visualization
3. ADF Test

In [12]:
def stationarity_check(output_path):
    import sys, pickle, subprocess
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "matplotlib", "scikit-learn"])
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    #import seaborn as sns
    plt.style.use('ggplot')
    plt.rcParams['font.size'] = 10
    
    #Time series libraries
    from scipy import stats
    from statsmodels.tsa.stattools import adfuller, acf, pacf
    from statsmodels.graphics.tsaplots import pacf, plot_acf,plot_pacf
    from pandas.plotting import autocorrelation_plot
    from statsmodels.tsa.arima_model import  ARIMA
    import statsmodels.api as sm
    
    with open(f"{output_path}/datac.pkl", "rb") as dataset:
        data = pickle.load(dataset)
        
    data_col = data['Global_active_power(KW)']
    data_col = data_col.astype('float')
    original_timeseries = data_col
    rolling_mean = data_col.rolling(window = 30).mean()
    rolling_var =  data_col.rolling(window = 30).var()
    rolling_std =  data_col.rolling(window = 30).std()

    print("==========Plotting of Window Rollings Plots==========",'\n')
#     plt.figure(figsize = (15,10))
#     sns.despine(left = True)
#     original_plot = plt.plot(original_timeseries, label = "Original Timeseries", color = 'purple', marker = 'o')
#     rolling_mean_plot = plt.plot(rolling_mean, color = 'red', label = "rolling_mean", marker = 'x')
#     rolling_var_plot = plt.plot(rolling_var,color = 'green', label = 'rolling variance', marker = '^')
#     rolling_std_plot = plt.plot(rolling_std, label = "rolling_std", color = 'cyan',marker = '*' )
  
#     plt.legend(loc='best'); plt.title('Rolling of Mean & Standard Deviation')
#     plt.show()

    print('\n',"==========Augument Dickey Fuller Test==========")
    print("===The result show the TS is stationary because the ADF TEST STATISTICS value is lower than the Critical values===")
    adf_result = adfuller(original_timeseries.values, autolag = 'AIC')
    adf_output = pd.Series(adf_result[0:4], index= ['ADF Statistic','p-value','#Lags Used','Total used observations'])

    for key, value in adf_result[4].items():
        adf_output['Critical Value (%s)'%key] = value
    print(adf_output)
    
    with open(f"{output_path}/datats.pkl", "wb") as eda_dataset:
        pickle.dump(data, eda_dataset)
        
    print("=====Stationarity check Finished Successfully=======")    
    return data.shape


In [13]:
stationarity_check(output_dir)



===The result show the TS is stationary because the ADF TEST STATISTICS value is lower than the Critical values===


(2018593, 13)

# Data Preprocessing

1. Feature Engineering
2. Normalization

In [14]:
def data_preprocessing(output_path):
    import subprocess, sys, pickle
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "matplotlib", "scikit-learn"])
    
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
    plt.style.use('ggplot')
    plt.rcParams['font.size'] = 10
    
    with open(f"{output_path}/datats.pkl", "rb") as dataset:
        data = pickle.load(dataset)
        
    """
    Since we are using a Recurrent Neural network, it is better to use MinMaxScaler
    """
    data = data['Global_active_power(KW)'].values.reshape(-1,1)
    scaler = MinMaxScaler(feature_range = (0,1))
    
    # The fit_transform method converts the data to numpy array stored in dataset variable
    dataset = scaler.fit_transform(data) 
    print("The type of dataset is {}". format(type(dataset)))
    print("============================Scaled dataset====================",'\n')
    print(dataset[:5])
    print("============================Scaled dataset====================",'\n')
    
    with open(f"{output_path}/datapr.pkl", "wb") as processed_data:
        pickle.dump(dataset, processed_data)
    
    print("=====Preprocessing Finished Successfully=======")    
    return pd.DataFrame(dataset).shape

In [15]:
data_preprocessing(output_dir)

The type of dataset is <class 'numpy.ndarray'>

[[0.37479631]
 [0.47836321]
 [0.47963064]
 [0.48089806]
 [0.32500453]]



(2018593, 1)

# Data Modelling

In [16]:
def data_modelling(output_path):
    import subprocess, sys, pickle
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "scikit-learn","tensorflow==2.3", "keras==2.4.3"])
    
    #Modelling libaries
    import numpy as np
    import tensorflow as tf
    from tensorflow import keras
    import keras
    from keras.models import Sequential
    from keras.layers import Dropout, Dense, LSTM
    from keras.callbacks import EarlyStopping
    
    with open(f"{output_path}/datapr.pkl", "rb") as data:
        dataset = pickle.load(data)
        
    train_size = int(len(dataset)*0.8)
    test_size = int(len(dataset)*0.2)

    train, test = dataset[:train_size,:], dataset[train_size:len(dataset),:]


    print("=====Converting an array of values into a dataset matrix done successfully=====",'\n')
    def create_dataset(dataset, look_back):
        X, Y = [], []
        for i in range(len(dataset)-look_back-1):
            x1 = dataset[i:(i+look_back), 0]
            X.append(x1)
            Y.append(dataset[i + look_back, 0])
        return np.array(X), np.array(Y)
    
    X_train, y_train = create_dataset(train, 30)
    X_test, y_test = create_dataset(test,30)

    print("====Reshaping input to be [samples, no_features, time steps] done successfully====",'\n')
    X_train = X_train.reshape(X_train.shape[0],1,X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    model = Sequential()
    model.add(LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    history = model.fit(X_train, y_train, epochs=5, batch_size=70, validation_data=(X_test, y_test), 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=1, shuffle=False)

    print(model.summary())

    #Save the model to the designated 
    model.save(f'{output_path}/lstm_model.h5')
    
    with open(f"{output_path}/train_data", "wb") as training_data:
        pickle.dump((X_train,y_train), training_data)
    
    with open(f"{output_path}/test_data", "wb") as testing_data:
        pickle.dump((X_test,y_test), testing_data)
        
    return "=====Data Modelling Finished Successfully======="

In [17]:
data_modelling(output_dir)

=====Converting an array of values into a dataset matrix done successfully===== 

====Reshaping input to be [samples, no_features, time steps] done successfully==== 

(1614843, 1, 30) (403688, 1, 30) (1614843,) (403688,)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100)               52400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 52,501
Trainable params: 52,501
Non-trainable params: 0
_________________________________________________________________
None




# Prediction/Forecasting

In [18]:
def forecasting(output_path):
    import sys, subprocess, pickle
    
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas", "scikit-learn", "matplotlib","tensorflow==2.3", "keras==2.4.3"])
    
    from tensorflow import keras
    import keras
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import accuracy_score, mean_squared_error
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
    scaler = MinMaxScaler(feature_range = (0,1))
        
    with open(f"{output_path}/train_data", "rb") as train:
        train_data = pickle.load(train)
        
    X_train, y_train = train_data
    
    with open(f"{output_path}/test_data", "rb") as test:
        test_data = pickle.load(test)
        
    X_test, y_test = test_data
    
    model = keras.models.load_model(f'{output_path}/lstm_model.h5')
    train_prediction = model.predict(X_train)
    test_prediction = model.predict(X_test)
    
    print(train_prediction[:5])
    print(test_prediction[:5])

    print("**********Mean Square Error and RMSE**********")
    print()
    print("===========================The Train Mean Suared Error is:=================================","\n")
    print(round(mean_squared_error(y_train, train_prediction),3),"\n")
    print("===========================The Train Root Mean Suared Error is:=================================","\n")
    print(round(np.sqrt(mean_squared_error(y_train, train_prediction)),3),"n")
    print("===========================The Test Mean Suared Error is:=================================","\n")
    print(round(mean_squared_error(y_test, test_prediction),3),"\n")
    print("===========================The Test Root Mean Suared Error is:=================================","\n")
    print(round(np.sqrt(mean_squared_error(y_test, test_prediction)),3))
    
    
    print("**********comparing the actual and predictions for the last 200 minutes**********")
#     data_list = [x for x in range(200)]
#     plt.figure(figsize=(8,4))
#     plt.plot(data_list, y_test[0][:200], marker='*', label="actual")
#     plt.plot(data_list, test_prediction[:,0][:200], '^', label="prediction")
#     plt.tight_layout()
#     sns.despine(top=True)
#     plt.subplots_adjust(left=0.07)
#     plt.ylabel('Global_active_power(KW)', size=15)
#     plt.xlabel('Time step', size=15)
#     plt.legend(fontsize=15)
#     plt.show();
    
    with open(f"{output_path}/predict.pkl", "wb") as prediction:
        pickle.dump(test_prediction, prediction)
        
    return "======Forecasting Finished Successfully====="

In [19]:
forecasting(output_dir)

[[0.2744965 ]
 [0.24158505]
 [0.3221066 ]
 [0.36155543]
 [0.37524563]]
[[0.2502526 ]
 [0.25677687]
 [0.25488603]
 [0.2554449 ]
 [0.25003466]]
=Since we transformed our train and test data, we need to inverse it to get the real predicted values=
**********Mean Square Error and RMSE**********


0.001 


0.025 n

0.0 


0.02
**********comparing the actual and predictions for the last 200 minutes**********




In [20]:
# def model_deploy(output_pat):
#     import subprocess, sys, pickle
    
#     subprocess([sys.executable, "-m", "pip", "install", "pandas", "scikit-learn"])
    
#     import pandas as pd
#     import numpy as np
    
    
#     ai_platform_deploy_operation = comp.load_component_from_url(
#     "https://storage.googleapis.com/{}/energy_bucket/deploy/model.pkl".format(BUCKET))
#     help(ai_platform_deploy_operation)
    

# Components Building

In [21]:
import kfp
from kfp import dsl
import kfp.components as comp

In [22]:
!which dsl-compile

In [23]:
data_download_comp = comp.func_to_container_op(data_ingestion, base_image= "python:3.7")

data_wrangled_comp = comp.func_to_container_op(data_wrangling, base_image="tensorflow/tensorflow:latest-gpu-py3")

data_cleaning_comp = comp.func_to_container_op(data_cleaning, base_image="tensorflow/tensorflow:latest-gpu-py3")

timeseries_check_comp =  comp.func_to_container_op(stationarity_check,base_image="tensorflow/tensorflow:latest-gpu-py3")

data_preprocess_comp = comp.func_to_container_op(data_preprocessing, base_image="tensorflow/tensorflow:latest-gpu-py3")

data_modelling_comp = comp.func_to_container_op(data_modelling, base_image="tensorflow/tensorflow:latest-gpu-py3")

forecasting_comp = comp.func_to_container_op(forecasting, base_image="tensorflow/tensorflow:latest-gpu-py3")

# Building the Pipeline

In [24]:
@dsl.pipeline(name ="Electric Power Consumption",
        description = "A ML Pipeline that load, clean, preprocess, train and forecast Electric Power Consumption")

def electric_power_consumption(input_path:str,
                               output_path:str):
    
    volume_op = dsl.VolumeOp(
        name="data_volume",
        resource_name="data-volume",
        size="3Gi",
        modes=dsl.VOLUME_MODE_RWO)
        
    download_container = data_download_comp(input_path, output_path)\
                                        .add_pvolumes({output_path: volume_op.volume})
    data_wrangled_container = data_wrangled_comp(output_path)\
                                        .add_pvolumes({output_path:download_container.pvolume})
    data_clean_container = data_cleaning_comp(output_path)\
                                        .add_pvolumes({output_path:data_wrangled_container.pvolume})
    timeseries_check_container = timeseries_check_comp(output_path)\
                                        .add_pvolumes({output_path:data_clean_container.pvolume})
    data_preprocess_container = data_preprocess_comp(output_path)\
                                        .add_pvolumes({output_path: timeseries_check_container.pvolume})
    data_modelling_container = data_modelling_comp(output_path)\
                                        .add_pvolumes({output_path:data_preprocess_container.pvolume})
    forecasting_conatiner = forecasting_comp(output_path)\
                                        .add_pvolumes({output_path:data_modelling_container.pvolume})
    

In [25]:
OUTPUT_PATH = '/mnt'

INPUT_PATH = ""

MODEL_PATH = 'lstm_model.h5'

In [26]:
pipeline_func = electric_power_consumption

In [27]:
client = kfp.Client()

In [28]:
experiment_name = 'Electric_Power_Consumption_Pipeline'
run_name = f'{pipeline_func.__name__} run'

arguments = {"input_path": INPUT_PATH,
             "output_path":OUTPUT_PATH,
             "output_path":MODEL_PATH
             }

# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func, f'{experiment_name}.zip')

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(pipeline_func, 
                                                  experiment_name=experiment_name, 
                                                  run_name=run_name, 
                                                  arguments=arguments)