# Energy Price Forecasting Pipeline

In [1]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

# Project whose container registry (gcr.io/<project>/...) hosts the images
# referenced in this notebook.
PROJECT_NAME = 'energy-forecasting'
# GCS path handed to the image/component builders below.
OUTPUT_DIR = 'gs://pipelinestest/out'
# Base image shared by every component built in this notebook.
EF_IMAGE = 'gcr.io/{}/energy:dev'.format(PROJECT_NAME)

### Create Image

In [2]:
%%docker {EF_IMAGE} {OUTPUT_DIR}
FROM tensorflow/tensorflow:1.10.0-py3
# Refresh the package index and install git in a single layer so a cached,
# stale index is never reused for the install; drop the apt lists afterwards
# to keep the layer small.
RUN apt-get update && \
    apt-get install -y git && \
    rm -rf /var/lib/apt/lists/*
# Python dependencies imported by the pipeline components defined below.
RUN pip3 install google-api-python-client
RUN pip3 install --upgrade google-cloud-bigquery
RUN pip3 install --upgrade google-cloud-storage
RUN pip3 install gitpython

2019-01-10 22:29:15:INFO:Checking path: gs://pipelinestest/out...
2019-01-10 22:29:15:INFO:Generate build files.
2019-01-10 22:29:15:INFO:Start a kaniko job for build.
2019-01-10 22:29:15:INFO:Cannot Find local kubernetes config. Trying in-cluster config.
2019-01-10 22:29:15:INFO:Initialized with in-cluster config.
2019-01-10 22:29:20:INFO:5 seconds: waiting for job to complete
2019-01-10 22:29:25:INFO:10 seconds: waiting for job to complete
2019-01-10 22:29:30:INFO:15 seconds: waiting for job to complete
2019-01-10 22:29:35:INFO:20 seconds: waiting for job to complete
2019-01-10 22:29:40:INFO:25 seconds: waiting for job to complete
2019-01-10 22:29:45:INFO:30 seconds: waiting for job to complete
2019-01-10 22:29:50:INFO:35 seconds: waiting for job to complete
2019-01-10 22:29:55:INFO:40 seconds: waiting for job to complete
2019-01-10 22:30:00:INFO:45 seconds: waiting for job to complete
2019-01-10 22:30:05:INFO:50 seconds: waiting for job to complete
2019-01-10 22:30:10:INFO:55 second

### Create Components

In [3]:
def run_git_python_script(
    inp: str,
    code_repo: str,
    code_folder: str,
    script: str,
    script_args: str) -> str:
    """Clone a git repository and run one of its Python modules.

    The repository is cloned into the current working directory, the
    process then changes into `code_folder` and runs
    `python -m <script> <script_args>` through the shell.

    Args:
        inp: Not read by this function. Presumably present so the step can
            be wired to an upstream output -- confirm against the pipeline.
        code_repo: URL of the git repository to clone.
        code_folder: Directory to change into before running the module.
        script: Dotted module name passed to `python -m`.
        script_args: Command-line arguments, as a single shell-style string.

    Returns:
        The string 'success' when the module exits with status 0.

    Raises:
        RuntimeError: If the module exits with a non-zero status; the
            message contains the process exit code.
    """
    # NOTE(review): imports are kept function-local, presumably because the
    # component builder packages only this function -- confirm before moving.
    import os
    import subprocess
    import git

    git.Git('').clone(code_repo)
    os.chdir(code_folder)
    command = ' '.join([
        'python -m',
        script,
        script_args])
    # subprocess.call returns the child's actual exit code. os.system returns
    # an encoded wait status on Unix (exit code 1 is reported as 256), which
    # made the error message below misleading.
    exit_code = subprocess.call(command, shell=True)
    if exit_code == 0:
        return 'success'
    raise RuntimeError(
        'Script failed. The exit status was: {}'.format(exit_code))
    
# Build the component image that wraps run_git_python_script, on top of the
# shared base image, and keep the resulting op factory for the pipeline.
DataPrepOp = compiler.build_python_component(
    component_func=run_git_python_script,
    staging_gcs_path=OUTPUT_DIR,
    base_image=EF_IMAGE,
    target_image='gcr.io/{}/component-dp:latest'.format(PROJECT_NAME))

2019-01-10 22:31:36:INFO:Build an image that is based on gcr.io/energy-forecasting/energy:dev and push the image to gcr.io/energy-forecasting/component-dp:latest
2019-01-10 22:31:36:INFO:Checking path: gs://pipelinestest/out...
2019-01-10 22:31:36:INFO:Generate entrypoint and serialization codes.
2019-01-10 22:31:36:INFO:Generate build files.
2019-01-10 22:31:36:INFO:Start a kaniko job for build.
2019-01-10 22:31:36:INFO:Cannot Find local kubernetes config. Trying in-cluster config.
2019-01-10 22:31:36:INFO:Initialized with in-cluster config.
2019-01-10 22:31:41:INFO:5 seconds: waiting for job to complete
2019-01-10 22:31:46:INFO:10 seconds: waiting for job to complete
2019-01-10 22:31:51:INFO:15 seconds: waiting for job to complete
2019-01-10 22:31:56:INFO:20 seconds: waiting for job to complete
2019-01-10 22:32:01:INFO:25 seconds: waiting for job to complete
2019-01-10 22:32:06:INFO:30 seconds: waiting for job to complete
2019-01-10 22:32:11:INFO:35 seconds: waiting for job to comple

In [4]:
def export_table(
    inp: str,
    table: str,
    file: str) -> str:
    """Export a BigQuery table to a destination and wait on the export job.

    Args:
        inp: Not read by this function. Presumably present so the step can
            be wired to an upstream output -- confirm against the pipeline.
        table: Source table, passed as-is to `Client.extract_table`.
        file: Export destination, passed as-is to `Client.extract_table`.

    Returns:
        The string 'success' once `result()` on the extract job returns.
    """
    from google.cloud import bigquery

    extract_job = bigquery.Client().extract_table(table, file)
    # result() is called for its side effect; its return value is not used.
    extract_job.result()
    return 'success'
    
# Build the component image that wraps export_table, on top of the shared
# base image, and keep the resulting op factory for the pipeline.
ExpTableOp = compiler.build_python_component(
    component_func=export_table,
    staging_gcs_path=OUTPUT_DIR,
    base_image=EF_IMAGE,
    target_image='gcr.io/{}/component-et:latest'.format(PROJECT_NAME))

2019-01-10 22:32:51:INFO:Build an image that is based on gcr.io/energy-forecasting/energy:dev and push the image to gcr.io/energy-forecasting/component-et:latest
2019-01-10 22:32:51:INFO:Checking path: gs://pipelinestest/out...
2019-01-10 22:32:51:INFO:Generate entrypoint and serialization codes.
2019-01-10 22:32:51:INFO:Generate build files.
2019-01-10 22:32:52:INFO:Start a kaniko job for build.
2019-01-10 22:32:52:INFO:Cannot Find local kubernetes config. Trying in-cluster config.
2019-01-10 22:32:52:INFO:Initialized with in-cluster config.
2019-01-10 22:32:57:INFO:5 seconds: waiting for job to complete
2019-01-10 22:33:02:INFO:10 seconds: waiting for job to complete
2019-01-10 22:33:07:INFO:15 seconds: waiting for job to complete
2019-01-10 22:33:12:INFO:20 seconds: waiting for job to complete
2019-01-10 22:33:17:INFO:25 seconds: waiting for job to complete
2019-01-10 22:33:22:INFO:30 seconds: waiting for job to complete
2019-01-10 22:33:27:INFO:35 seconds: waiting for job to comple

In [5]:
def train_git(
    tr_inp: str,
    va_inp: str,
    code_repo: str,
    code_folder: str,
    project: str,
    bucket: str,
    package_folder: str,
    cmle_folder: str,
    scale_tier: str,
    python_module: str,
    region: str,
    runtime_version: str,
    cmle_args: str) -> str:
    """Package training code from git and submit it as an ML training job.

    Clones `code_repo`, archives `code_folder` into `code.tar.gz`, uploads
    the archive to the given GCS bucket, and creates a training job through
    the 'ml' v1 discovery API that points at the uploaded package.

    Args:
        tr_inp: Not read by this function. Presumably wired to an upstream
            output for step ordering -- confirm against the pipeline.
        va_inp: Not read by this function; see `tr_inp`.
        code_repo: URL of the git repository to clone.
        code_folder: Folder (relative to the working directory) to archive.
        project: Project id; the job is created under `projects/<project>`.
        bucket: Name of the GCS bucket for the package and the job dir.
        package_folder: Object-name prefix for the uploaded package; used
            verbatim, so it should end with '/'.
        cmle_folder: Object-name prefix for the job directory; used
            verbatim, so it should end with '/'.
        scale_tier: Value for the job's `scaleTier`.
        python_module: Value for the job's `pythonModule`.
        region: Value for the job's `region`.
        runtime_version: Value for the job's `runtimeVersion`.
        cmle_args: Whitespace-separated arguments forwarded to the trainer.

    Returns:
        `str()` of the response returned by executing the create request.
    """
    # NOTE(review): imports are kept function-local, presumably because the
    # component builder packages only this function -- confirm before moving.
    import os
    import git
    import tarfile
    import datetime
    from google.cloud import storage
    from googleapiclient import discovery

    # The timestamp-based id names the job, the package and the job dir.
    jobId = 'train' + datetime.datetime.today().strftime('%Y%m%d%H%M%S')
    package_blob = package_folder + jobId + '.tar.gz'

    git.Git('').clone(code_repo)
    with tarfile.open('code.tar.gz', 'w:gz') as tar:
        tar.add(
            code_folder,
            arcname=os.path.basename(code_folder))

    gcs_client = storage.Client()
    gcs_bucket = gcs_client.get_bucket(bucket)
    blob = gcs_bucket.blob(package_blob)
    blob.upload_from_filename('code.tar.gz')

    training_inputs = {
        'scaleTier': scale_tier,
        'pythonModule': python_module,
        # split() without a separator collapses runs of whitespace, so
        # repeated, leading or trailing spaces (or an empty string) no longer
        # produce empty-string arguments for the trainer.
        'args': cmle_args.split(),
        'region': region,
        'packageUris': [
            'gs://' + bucket + '/' + package_blob],
        'jobDir': 'gs://' + bucket + '/' + cmle_folder + jobId,
        'runtimeVersion': runtime_version}
    job_spec = {
        'jobId': jobId,
        'trainingInput': training_inputs}

    cloudml = discovery.build('ml', 'v1')
    project_id = 'projects/{}'.format(project)
    request = cloudml.projects().jobs().create(
        body=job_spec,
        parent=project_id)
    return str(request.execute())

    
# Build the component image that wraps train_git, on top of the shared base
# image, and keep the resulting op factory for the pipeline.
TrainModelOp = compiler.build_python_component(
    component_func=train_git,
    staging_gcs_path=OUTPUT_DIR,
    base_image=EF_IMAGE,
    target_image='gcr.io/{}/component-tm:latest'.format(PROJECT_NAME))

2019-01-10 22:34:08:INFO:Build an image that is based on gcr.io/energy-forecasting/energy:dev and push the image to gcr.io/energy-forecasting/component-tm:latest
2019-01-10 22:34:08:INFO:Checking path: gs://pipelinestest/out...
2019-01-10 22:34:08:INFO:Generate entrypoint and serialization codes.
2019-01-10 22:34:08:INFO:Generate build files.
2019-01-10 22:34:08:INFO:Start a kaniko job for build.
2019-01-10 22:34:08:INFO:Cannot Find local kubernetes config. Trying in-cluster config.
2019-01-10 22:34:08:INFO:Initialized with in-cluster config.
2019-01-10 22:34:13:INFO:5 seconds: waiting for job to complete
2019-01-10 22:34:18:INFO:10 seconds: waiting for job to complete
2019-01-10 22:34:23:INFO:15 seconds: waiting for job to complete
2019-01-10 22:34:28:INFO:20 seconds: waiting for job to complete
2019-01-10 22:34:33:INFO:25 seconds: waiting for job to complete
2019-01-10 22:34:38:INFO:30 seconds: waiting for job to complete
2019-01-10 22:34:43:INFO:35 seconds: waiting for job to comple

### Create pipeline

In [6]:
@dsl.pipeline(
    name='Energy Price Forecasting',
    description='Energy Price Forecasting')
def basic_bq_pipeline(
    project = dsl.PipelineParam(
        'project',
        value='energy-forecasting'),
    bucket = dsl.PipelineParam(
        'bucket',
        value='energyforecast'),
    code_repo = dsl.PipelineParam(
        'code-repo',
        value='https://github.com/GoogleCloudPlatform/professional-services.git'),
    code_folder = dsl.PipelineParam(
        'code-folder',
        value='professional-services/examples/cloudml-energy-price-forecasting'),
    data_prep_script = dsl.PipelineParam(
        'data-prep-script',
        value='data_preparation.data_prep'),
    data_prep_args = dsl.PipelineParam(
        'data-prep-args',
        value=' '.join([
            '--dataset=Energy',
            '--train_table=MLDataTrain',
            '--valid_table=MLDataValid',
            '--test_table=MLDataTest',
            '--prepare_data_file=data_preparation/prepare_data.sql',
            '--weather_mean_std_file=data_preparation/weather_mean_std.sql',
            '--train_from_date="2015-01-05 00:00:00"',
            '--train_to_date="2015-10-04 23:01:00"',
            '--valid_from_date="2015-10-05 00:00:00"',
            '--valid_to_date="2015-10-11 23:01:00"',
            '--test_from_date="2015-10-12 00:00:00"',
            '--test_to_date="2015-10-18 23:01:00"',
            '--price_scaling=0.01',
            '--mean_path=gs://energyforecast/data/pickle/mean.pkl',
            '--std_path=gs://energyforecast/data/pickle/std.pkl'])),
    package_folder = dsl.PipelineParam(
        'package-folder',
        value='package/'),
    cmle_folder = dsl.PipelineParam(
        'cmle-folder',
        value='cmle/'),
    cmle_args = dsl.PipelineParam(
        'cmle-args',
        value=' '.join([
            '--training_path', 'gs://energyforecast/data/csv/MLDataTrain.csv',
            '--validation_path', 'gs://energyforecast/data/csv/MLDataValid.csv',
            '--mean_path', 'gs://energyforecast/data/pickle/mean.pkl',
            '--std_path', 'gs://energyforecast/data/pickle/std.pkl',
            '--dropout' , '0.2',
            '--hour_embedding', '20',
            '--day_embedding', '10',
            '--first_layer_size', '100',
            '--number_layers', '3',
            # The trailing comma matters: without it Python concatenates the
            # adjacent literals into '0.5--learning_rate', corrupting this
            # value and dropping the --learning_rate flag.
            '--layer_reduction_fraction', '0.5',
            '--learning_rate', '0.01',
            '--batch_size', '64',
            '--eval_batch_size', '168',
            '--max_steps', '5000'])),
    scale_tier = dsl.PipelineParam(
        'scale-tier',
        value='BASIC'),
    python_module = dsl.PipelineParam(
        'python-module',
        value='trainer.task'),
    region = dsl.PipelineParam(
        'region',
        value='us-central1'),
    runtime_version = dsl.PipelineParam(
        'runtime-version',
        value='1.10'),
    train_table = dsl.PipelineParam(
        'train-table',
        value='Energy.MLDataTrain'),
    valid_table = dsl.PipelineParam(
        'valid-table',
        value='Energy.MLDataValid'),
    test_table = dsl.PipelineParam(
        'test-table',
        value='Energy.MLDataTest'),
    train_file = dsl.PipelineParam(
        'train-file',
        value='gs://energyforecast/data/csv/MLDataTrain.csv'),
    valid_file = dsl.PipelineParam(
        'valid-file',
        value='gs://energyforecast/data/csv/MLDataValid.csv'),
    test_file = dsl.PipelineParam(
        'test-file',
        value='gs://energyforecast/data/csv/MLDataTest.csv')):
    """Define the energy price forecasting pipeline.

    Steps, in the order they are wired below:
      1. Data preparation: runs `data_prep_script` from `code_repo` with
         `data_prep_args`.
      2. Three table exports (train / validation / test), each taking the
         data-preparation output as its first input.
      3. Model training: takes the train and validation export outputs as
         its first two inputs and submits the training job.

    Every step is given the 'user-gcp-sa' secret via `gcp.use_gcp_secret`.
    All parameters are pipeline parameters with the defaults shown in the
    signature.
    """
    # 'start' fills the component's otherwise-unused `inp` argument.
    dp_op = DataPrepOp(
        'start',
        code_repo,
        code_folder,
        data_prep_script,
        data_prep_args).apply(gcp.use_gcp_secret('user-gcp-sa'))
    # Each export consumes dp_op.output, tying it to the data-prep step.
    tr_et_op = ExpTableOp(
        dp_op.output,
        train_table,
        train_file).apply(gcp.use_gcp_secret('user-gcp-sa'))
    va_et_op = ExpTableOp(
        dp_op.output,
        valid_table,
        valid_file).apply(gcp.use_gcp_secret('user-gcp-sa'))
    te_et_op = ExpTableOp(
        dp_op.output,
        test_table,
        test_file).apply(gcp.use_gcp_secret('user-gcp-sa'))
    # Training consumes the train and validation export outputs only.
    tm_op = TrainModelOp(
        tr_et_op.output,
        va_et_op.output,
        code_repo,
        code_folder,
        project,
        bucket,
        package_folder,
        cmle_folder,
        scale_tier,
        python_module,
        region,
        runtime_version,
        cmle_args).apply(gcp.use_gcp_secret('user-gcp-sa'))
    
# Compile the pipeline function into the 'ef.tar.gz' package that the run
# cell submits.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(basic_bq_pipeline, 'ef.tar.gz')

### Run Experiment

In [7]:
import datetime

# Timestamped run name, e.g. 'exp20190110223547'.
run_timestamp = datetime.datetime.today().strftime("%Y%m%d%H%M%S")
run_name = 'exp{}'.format(run_timestamp)

# Look up the experiment by name and submit the compiled pipeline package
# with no parameter overrides.
client = kfp.Client()
experiment = client.get_experiment(
    experiment_name='Energy Price Forecasting Experiment')
params = {}
client.run_pipeline(experiment.id, run_name, 'ef.tar.gz', params)

{'created_at': datetime.datetime(2019, 1, 10, 22, 35, 47, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'id': '12ec34cb-1528-11e9-b276-42010a800121',
 'metrics': None,
 'name': 'exp20190110223547',
 'pipeline_spec': {'parameters': None,
                   'pipeline_id': None,
                   'pipeline_manifest': None,
                   'workflow_manifest': '{"apiVersion": '
                                        '"argoproj.io/v1alpha1", "kind": '
                                        '"Workflow", "metadata": '
                                        '{"generateName": '
                                        '"energy-price-forecasting-"}, "spec": '
                                        '{"arguments": {"parameters": '
                                        '[{"name": "project", "value": '
                                        '"energy-forecasting"}, {"name": '
                                        '"bucket", "value": "energyforecast"}, '
                       