# Section 1: Setup Azure ML Service environment

## Login to Azure

In [None]:
!az login

## Read global configuration variables from training_pipeline_config.json

In [None]:
# A training_pipeline_config.json file has to be created on same folder as this Jupyter Notebook
# copy and paste the following commented JSON block and replacing the "account_key" field
# with the real account_key value to connect to the Azure Blob Storage
#
# {
#     "compute_name" : "XXXXX",
#     "input_data_store_name" : "XXXXX",
#     "input_container_name" : "XXXXX",
#     "input_folder_path" : ".",
#     "account_name" : "XXXXX",
#     "account_key" : "XXXXX",
#     "train_csv_file" : "XXXXX.csv",
#     "model_name" : "sample",
#     "pipeline_name" : "Test Training Pipeline",
#     "pipeline_experiment_name" : "Test_Training_PEX",
#     "pipeline_schedule_name" : "Test_Training_Schedule",
#     "pipeline_schedule_frequency" : "Week",
#     "pipeline_schedule_interval" : 1,
#     "pipeline_schedule_time_of_day" : "12:00",
#     "pipeline_schedule_weeks_days" : ["Monday"]
# }
#
# Json fields documentation:
# - "compute_name" is the name of the AML Compute cluster to use, chosen from the ones configured in the AML Workspace. Example values are "cpucluster" or "dlcluster" (for GPU requirements)
# - "input_container_name" and "output_container_name" are the names of the input and output Blob Containers (gap-input and gap-output) from the "account_name" Blob Storage 
# - "input_data_store_name" and "output_data_store_name" are the internal AML names used to map, respectively, the gap-input and gap-output blob containers to the AML Datastore. These can be any string.
# - "input_folder_path" is the path, relative to the root folder of the gap-input container, of the folder holding the input CSV file to be used for training
# - "account_name" is the name of the Azure Blob Storage to use
# - "account_key" is the security key to access the "account_name" Azure Blob Storage
# - "train_csv_file" is the input csv file in the input folder
# - "model_name" is the name of the sample model as registered by the training pipeline
# - "pipeline_name" is the name of the Pipeline to create
# - "pipeline_experiment_name" is the name of the Experiment used to interactively run the Pipeline
# - "pipeline_schedule_name" is the name of the Schedule for the Pipeline
# - "pipeline_schedule_frequency" is the unit of time that describes how often the schedule fires. Can be "Minute", "Hour", "Day", "Week", or "Month"
# - "pipeline_schedule_interval" is the value that specifies how often the schedule fires based on the frequency, which is the number of time units to wait until the schedule fires again
# - "pipeline_schedule_time_of_day" is a string in the form hh:mm specifying the time of the schedule recurrence. For example, if you specify "15:30" then the schedule will run at 3:30pm.
# - "pipeline_schedule_weeks_days" if you specify "Week" for pipeline_schedule_frequency, you can specify one or more days, separated by commas, when you want to run the workflow: "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", and "Sunday"

import json

# Load the global pipeline settings from the JSON file that sits next to this
# notebook. The encoding is given explicitly because JSON text is UTF-8, while
# open() would otherwise fall back to the platform's locale encoding.
with open('training_pipeline_config.json', encoding='utf-8') as f:
    configuration = json.load(f)

## Setup Workspace

In [None]:
from azureml.core import Workspace, Experiment, Run, Datastore
# Workspace handle used by every later cell.
# NOTE(review): from_config() presumably reads a local workspace config file
# - confirm one is available next to this notebook.
ws = Workspace.from_config()

## Get Compute Clusters

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

def get_compute_cluster(ws, compute_name):
    """Look up an existing AML Compute cluster in the workspace by name.

    Args:
        ws: Workspace whose ``compute_targets`` mapping is searched.
        compute_name: Name of the compute target to look up.

    Returns:
        The compute target when it exists and is an ``AmlCompute`` instance.
        ``None`` otherwise; an error message is printed in every failure
        case so the caller is never handed a silent ``None``.
    """
    # Read the mapping once and reuse it for both the membership test and
    # the lookup.
    targets = ws.compute_targets
    if compute_name not in targets:
        print('Error getting Compute target...')
        return None

    compute_target = targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
        return compute_target

    # The name exists but refers to something that is not an AML Compute
    # cluster; report it instead of falling through without a message.
    print('Error getting Compute target: "' + compute_name
          + '" is not an AmlCompute cluster')
    return None

# Resolve the cluster named in the configuration. get_compute_cluster returns
# None when no usable cluster is found, so compute_target may be None here.
compute_target = get_compute_cluster(ws, configuration["compute_name"])

## Register External Blob datastores

In [None]:
# Register the input blob container as a datastore in the workspace, using the
# storage account name/key and container name taken from the configuration.
# NOTE(review): with create_if_not_exists=False the container presumably has
# to exist already - confirm against the SDK documentation.
input_data_store = Datastore.register_azure_blob_container(workspace=ws, 
                                             datastore_name=configuration["input_data_store_name"],
                                             container_name=configuration["input_container_name"],
                                             account_name=configuration["account_name"],
                                             account_key=configuration["account_key"],
                                             create_if_not_exists=False)

## Connect to Blob datastores

In [None]:
# List all datastores registered in the current workspace, printing each
# datastore's name together with its datastore_type.
datastores = ws.datastores
for name, ds in datastores.items():
    print(name, ds.datastore_type)

In [None]:
# Fetch the input datastore by its configured name. This rebinds
# input_data_store, so the cell also works when the registration cell was
# skipped.
# Alternative kept for reference (workspace default datastore):
#def_data_store = ws.get_default_datastore() 
input_data_store = Datastore(ws, configuration["input_data_store_name"])

## Create Data References to input file

In [None]:
from azureml.data.data_reference import DataReference

# Reference to the configured folder on the input datastore. It is passed to
# the training step both as an input and as the value of the "--input"
# script argument.
input_data_reference = DataReference(
    datastore=input_data_store,
    data_reference_name="input_data",
    path_on_datastore=configuration["input_folder_path"])

# Section 2: Prepare Python Scripts for Training

## Prepare Script Folder

In [None]:
import os
# Folder (under the current working directory) that receives the generated
# training script and is later used as the Estimator's source_directory.
SCRIPT_FOLDER  = os.path.join(os.getcwd(), "training_scripts")
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(SCRIPT_FOLDER, exist_ok=True)

## Sample Model Training Script

In [None]:
%%writefile $SCRIPT_FOLDER/sample_training.py

"""
This module implements the main script for the sample model training.
"""
import os
import pandas as pd
from azureml.core import Run
from sklearn.externals import joblib

# File name of the serialized model; it is written under "outputs/" and reused
# as the upload name and as the model_path when registering the model.
# NOTE(review): "samle" looks like a typo for "sample" - confirm before
# renaming, since the name is used consistently throughout this script.
LOCAL_MODEL_FILE = "samle.pkl"

def train(data_dir, train_csv_file, sparcity_threshold=None):
    """
    Main train script for the Sample Model Training.

    Builds a placeholder model (an empty dict) and serializes it to
    ``outputs/<LOCAL_MODEL_FILE>``, creating the ``outputs`` folder if needed.

    Args:
        data_dir: Folder holding the training CSV. Currently unused because
            the data-loading line is commented out.
        train_csv_file: Name of the training CSV inside ``data_dir``.
            Currently unused for the same reason.
        sparcity_threshold: Currently unused. Optional (defaults to ``None``)
            so that the two-argument call made by this script's entry point
            no longer raises ``TypeError``.

    Returns:
        None. The model is written to disk as a side effect.
    """
    # Read the Data From File
    # features_df = pd.read_csv(os.path.join(data_dir, train_csv_file))

    # train model (placeholder: the sample "model" is just an empty dict)
    model = {}

    # locally save model
    os.makedirs('outputs', exist_ok=True)
    # NOTE(review): joblib is imported from sklearn.externals, which newer
    # scikit-learn releases no longer ship - confirm the installed version.
    joblib.dump(value=model, filename="outputs/" + LOCAL_MODEL_FILE)
    print("Model saved")


if __name__ == "__main__":
    # argparse is used below but is not imported at the top of this script;
    # import it here so the entry point does not fail with NameError.
    import argparse

    RUN = Run.get_context()
    RUN.log("Training step", 1)

    # Get Parameters
    PARSER = argparse.ArgumentParser("training")
    PARSER.add_argument('--input', type=str,
                        dest='data_dir', help='data storage reference in AML',
                        required=True)
    PARSER.add_argument('--train_csv_file', type=str,
                        help='input csv file in the input folder',
                        required=True)
    PARSER.add_argument('--model_name', type=str,
                        help='model name', required=True)
    # Optional, so existing invocations that pass only the three arguments
    # above keep working; the value is forwarded to train().
    PARSER.add_argument('--sparcity_threshold', type=float, default=None,
                        help='optional threshold forwarded to train()')

    ARGS = PARSER.parse_args()

    print("In sample_training.py")

    # Train / Test / Upload Model
    # train() declares three parameters, so the third one is passed explicitly.
    train(ARGS.data_dir, ARGS.train_csv_file, ARGS.sparcity_threshold)

    RUN.log("Training step", 2)
    print("Model trained")

    # Upload model: publish the locally saved file under LOCAL_MODEL_FILE so
    # that register_model can reference it by that same name.
    print(RUN.get_file_names())
    RUN.upload_file(LOCAL_MODEL_FILE, "outputs/" + LOCAL_MODEL_FILE)
    MODEL = RUN.register_model(model_name=ARGS.model_name, model_path=LOCAL_MODEL_FILE)
    print(MODEL.name, MODEL.id, MODEL.version, sep='\t')
    RUN.log("Training step", 3)
    print("Model registered")

    RUN.complete()


# Section 3: Prepare Training Pipeline

## Create Pipeline

In [None]:
# from azureml.pipeline.steps.python_script_step import PythonScriptStep
from azureml.pipeline.core.graph import PipelineParameter
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.estimator import Estimator
from azureml.pipeline.steps import EstimatorStep

# Pipeline parameters: their default values come from the configuration file,
# and both are forwarded to the training script as command-line arguments.
train_csv_file = PipelineParameter(
  name="train_csv_file", 
  default_value=configuration["train_csv_file"])

model_name = PipelineParameter(
  name="model_name", 
  default_value=configuration["model_name"])

# Estimator that runs sample_training.py from SCRIPT_FOLDER on the selected
# compute target, with the listed conda and pip packages.
# NOTE(review): the package lists look broader than what the sample script
# imports (os, pandas, azureml.core, sklearn) - confirm they are all needed.
sample_est = Estimator(source_directory=SCRIPT_FOLDER, 
                compute_target=compute_target, 
                entry_script='sample_training.py',
                conda_packages=['numpy','pandas', 'scikit-learn', 'tensorflow', 'keras'],
                pip_packages=['azure', 'azureml-core', 'azure-storage', 'azure-storage-blob']
               )

# Single pipeline step wrapping the estimator. The script arguments match the
# options parsed by sample_training.py (--input, --train_csv_file,
# --model_name); the data reference is also declared as a step input.
sample_step = EstimatorStep(name="Estimator_Base_Model_Train", 
                         estimator=sample_est, 
                         estimator_entry_script_arguments=[
                            "--input", input_data_reference,
                            "--train_csv_file", train_csv_file,
                            "--model_name", model_name
                         ],
                         runconfig_pipeline_params=None, 
                         inputs=[input_data_reference], 
                         compute_target=compute_target)

In [None]:
from azureml.pipeline.core import Pipeline

# Assemble the training pipeline from its single step.
training_pipeline = Pipeline(workspace=ws, steps=[sample_step])

## Test Pipeline (Optional)

In [None]:
# Submit the pipeline to be run
pipeline_run1 = Experiment(ws, configuration["pipeline_experiment_name"]).submit(training_pipeline, show_output=True)
pipeline_run1.wait_for_completion()

## View Run Details

In [None]:
from azureml.widgets import RunDetails
# Show the run-details widget for the pipeline run submitted above.
RunDetails(pipeline_run1).show()

## Publish Pipeline

In [None]:
# Publish the pipeline under the configured name; published_pipeline.id is
# used later when the schedule is created.
published_pipeline = training_pipeline.publish(name=configuration["pipeline_name"])
# Bare expression: displays the published pipeline as the cell output.
published_pipeline

## Get published pipeline ID (Optional)

In [None]:
from azureml.pipeline.core import PublishedPipeline
from azureml.pipeline.core import Schedule, ScheduleRecurrence

# Print the id and name of every published pipeline returned for the workspace.
all_pub_pipelines = PublishedPipeline.get_all(ws)

print("Published pipelines found in the workspace:")
for pub_pipeline in all_pub_pipelines:
    print(pub_pipeline.id, pub_pipeline.name)
    # Overwritten on every iteration, so after the loop this holds the id of
    # the last pipeline listed.
    pub_pipeline_id = pub_pipeline.id


# Print every active schedule together with the pipeline it triggers.
all_schedules = Schedule.get_all(ws, active_only=True) 
for schedule in all_schedules:
    print("{} {} pipeline: {}".format(schedule.id, schedule.name, schedule.pipeline_id))
    

## Remove previous Pipeline and Scheduling (Warning)

In [None]:
# Look up one specific published pipeline and one specific schedule by id and
# print their names.
# NOTE(review): both ids are hard-coded and therefore tied to one particular
# workspace - replace them with ids printed by the listing cell before running.
ensemble_training_pipeline = PublishedPipeline.get(ws, "85105a90-76fc-4136-90aa-ff0e079008e5")
print(ensemble_training_pipeline.name)
ensemble_training_schedule = Schedule.get(ws, "9d9a73f8-8441-415a-944e-bd9875e11640")
print(ensemble_training_schedule.name)

# The disabling calls are left commented out on purpose; uncomment them to
# actually disable the schedule and the pipeline fetched above.
# ensemble_training_schedule.disable()
# ensemble_training_pipeline.disable()

## Schedule Pipeline

In [None]:
from azureml.pipeline.core import Schedule, ScheduleRecurrence
from azureml.pipeline.core.schedule import TimeZone

# Recurrence of the schedule, built entirely from the configuration file.
# The frequency must come from "pipeline_schedule_frequency" ("Minute", "Hour",
# "Day", "Week" or "Month"); reading "pipeline_schedule_name" here would pass
# the schedule's display name as the time unit.
recurrence = ScheduleRecurrence(frequency=configuration["pipeline_schedule_frequency"], 
                                interval=configuration["pipeline_schedule_interval"], 
                                time_of_day=configuration["pipeline_schedule_time_of_day"],
#                                 start_time="2019-05-25T02:00:00",
                                week_days=configuration["pipeline_schedule_weeks_days"],
                                time_zone=TimeZone.CentralStandardTime)

# Create the schedule for the published pipeline. Scheduled runs are grouped
# under an experiment named "<schedule name>_Run", and the same text is used
# as the schedule description.
schedule = Schedule.create(workspace=ws, 
                           name=configuration["pipeline_schedule_name"],
                           pipeline_id=published_pipeline.id, 
                           experiment_name=configuration["pipeline_schedule_name"] + '_Run',
                           recurrence=recurrence,
                           wait_for_provisioning=True,
                           description=configuration["pipeline_schedule_name"] + '_Run')

print("Created schedule with id: {}".format(schedule.id))

## Disable all Schedules and Pipelines (Warning)

In [None]:
# WARNING: this cell disables every active schedule and every published
# pipeline returned for the workspace, not only the ones created by this
# notebook.
all_schedules = Schedule.get_all(ws, active_only=True) 
for schedule in all_schedules:
    # The message states what actually happens: the schedule is disabled,
    # nothing is deleted.
    print("Disabling schedule {} (published pipeline: {})".format(schedule.id, schedule.pipeline_id))
    schedule.disable()

all_pub_pipelines = PublishedPipeline.get_all(ws)
for pub_pipeline in all_pub_pipelines:
    print(pub_pipeline.id)
    pub_pipeline.disable()