In [None]:
from azureml.core import Workspace, Datastore, Dataset
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.runconfig import RunConfiguration
from azureml.core.compute_target import ComputeTargetException
from azureml.core.script_run_config import ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies
import os
import shutil

In [None]:
# Connect to the Azure ML workspace via Workspace.from_config()
ws = Workspace.from_config()

# Experiment under which the runs submitted by this notebook are grouped
experiment_name = '<ExperimentName>'
experiment = Experiment(
    workspace=ws,
    name=experiment_name,
)

In [None]:
# Create a clean local project folder that will hold the training script.
# Pure-Python calls replace `!rm -rf` and `os.mknod`, which depend on a Unix
# shell / Unix-only API, so the cell behaves the same on every platform.
project_folder = './train-on-amlcompute'
shutil.rmtree(project_folder, ignore_errors=True)  # start from an empty folder
os.makedirs(project_folder, exist_ok=True)

# Empty placeholder script; the %%writefile cell further down overwrites it.
train_script_path = os.path.join(project_folder, 'train.py')
with open(train_script_path, 'w'):
    pass

# Provisioning a compute target

In [None]:
# Name of the CPU cluster to reuse or create
cpu_cluster_name = "<ClusterName>"

# Look the cluster up by name; if the lookup raises ComputeTargetException,
# provision a new cluster instead.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster operation to complete, showing its output
cpu_cluster.wait_for_completion(show_output=True)


# Training a model

In [None]:
%%writefile train-on-amlcompute/train.py

import numpy as np
import pandas as pd
from azureml.core import Run
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
import os

# Create an in-memory Dataset on your local machine
df = pd.read_csv("/data/prepped_train.csv")

# Getting labels and features from our dataset
labels =  df['HasDetections']
features = df.drop(columns=['HasDetections'])

xTrain, xTest, yTrain, yTest = train_test_split(features, labels, test_size=0.30)

# Get hold of the current run
run = Run.get_context()

# Train a classifier
# Create a pipeline (in case we want to add transformations later)
pipeline = Pipeline([('classify', RandomForestClassifier()),
                        ])
# Use the pipeline to fit a model to the training data
print("Training model...")
classifier = pipeline.fit(xTrain, yTrain)
print('Classifier trained!')

# Evaluate classifier
classes = ['Has detections','Does not have detections']
print('Calculating classifier metrics...')
predictions = classifier.predict(xTest)
print(metrics.classification_report(yTest, predictions, target_names=classes))

acc = metrics.accuracy_score(yTest, predictions)
print('Accuracy:' + str(acc))

# Saving the accuracy as a metric to our workspace
run.log('accuracy', np.float(acc))

print("\n Confusion Matrix:")
cm = confusion_matrix(yTest, np.round(predictions, 0))
print(cm)


os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=classifier, filename='outputs/challenge3model.pkl')

# Configuring our cluster and running it

In [None]:
from azureml.core import Run
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DataReferenceConfiguration

# Python run configuration targeting the AmlCompute cluster created earlier
run_config = RunConfiguration(framework="python")
run_config.target = cpu_cluster.name

# Conda packages the training script needs on the compute target
run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas'],
)

# Data reference: "prepped" on the datastore, "/data" on the compute,
# using download mode without overwriting
run_config.data_references = {
    'myDataStore': DataReferenceConfiguration(
        datastore_name="<YourDataStoreName>",
        path_on_compute="/data",
        path_on_datastore="prepped",
        mode="download",
        overwrite=False,
    ),
}

# Bundle the project folder, entry script and run configuration
script_config = ScriptRunConfig(
    source_directory=project_folder,
    script='train.py',
    run_config=run_config,
)

# Submit to the experiment; the trailing expression displays the run
run = experiment.submit(config=script_config)
run

In [None]:
%%time
# Wait for the submitted run to complete, showing its output on stdout.
# The %%time cell magic reports how long this cell took to execute.
run.wait_for_completion(show_output=True)

In [None]:
# Display the metrics recorded for the run (train.py logs 'accuracy')
run.get_metrics()

# Working with Estimators and Hyperdrive

In [None]:
!pip install --upgrade azureml-sdk[notebooks]

In [None]:
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import HyperDriveRunConfig
from azureml.widgets import RunDetails
from azureml.train.hyperdrive import RandomParameterSampling
from azureml.train.hyperdrive import BanditPolicy
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import normal, uniform, choice
from azureml.core.runconfig import DataReferenceConfiguration
import os


# Folder holding train.py; created here if an earlier cell has not done so
script_folder = './train-on-amlcompute'
os.makedirs(script_folder, exist_ok=True)

# Fixed arguments passed to the entry script on every child run
script_params = {
    '--data-folder': '/data',
    '--regularization': 0.8
}

# Random search space for the sweep.
# NOTE(review): the train.py written earlier in this notebook does not consume
# learning_rate / keep_probability / batch_size, so these values currently do
# not influence the trained model - confirm the intended search space.
param_sampling = RandomParameterSampling( {
        "learning_rate": normal(10, 3),
        "keep_probability": uniform(0.05, 0.1),
        "batch_size": choice(16, 32, 64, 128)
    }
)

# Data reference: "prepped" on the datastore, "/data" on the compute
dataReference = DataReferenceConfiguration(datastore_name="<YourDataStoreName>",
                                           path_on_compute="/data",
                                           path_on_datastore="prepped",
                                           mode="download",
                                           overwrite=False)

estimator = Estimator(source_directory=script_folder,
                   script_params=script_params,
                   compute_target=cpu_cluster,
                   entry_script='train.py',
                   conda_packages=['scikit-learn','pandas'])

# Attach the data reference to the estimator's run configuration. Assigning it
# to the compute target object (as before) is just an ad-hoc attribute on the
# client-side object and never reaches the submitted runs, so /data would be
# missing on the compute node.
estimator.run_config.data_references = {'myDataStore': dataReference}

early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

# Must match the metric name logged by train.py via run.log('accuracy', ...).
# (No trailing comma: a trailing comma would turn this into a 1-tuple.)
primary_metric_name = "accuracy"
primary_metric_goal = PrimaryMetricGoal.MAXIMIZE

hyperdrive_run_config = HyperDriveRunConfig(estimator=estimator,
                          hyperparameter_sampling=param_sampling,
                          policy=early_termination_policy,
                          primary_metric_name=primary_metric_name,
                          primary_metric_goal=primary_metric_goal,
                          max_total_runs=100,
                          max_concurrent_runs=4)


# Submit the sweep under its own experiment
experiment_name = '<YourNewExperimentName>'
experiment = Experiment(workspace = ws, name = experiment_name)

hyperdrive_run = experiment.submit(hyperdrive_run_config)