In [None]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the saved config, then open the
# experiment under which this notebook's runs are submitted.
ws = Workspace.from_config()
exp = Experiment(name="titanic-lgbm", workspace=ws)

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

def get_aml_cluster(ws, cluster_name, vm_size='STANDARD_D2_V2', max_nodes=4):
    """Return the compute target called `cluster_name`, creating it if the lookup fails.

    Args:
        ws: Workspace in which the compute target is looked up or created.
        cluster_name: Name of the compute target.
        vm_size: VM size passed to the provisioning configuration when a new
            cluster has to be created.
        max_nodes: Node limit passed to the provisioning configuration when a
            new cluster has to be created.

    Returns:
        The compute target that was found, or the one returned by
        ``ComputeTarget.create`` when the lookup raised ComputeTargetException.
    """
    try:
        return ComputeTarget(workspace=ws, name=cluster_name)
    except ComputeTargetException:
        # Lookup failed (presumably no target with this name exists yet), so
        # request a new cluster with the given size limits.
        provisioning = AmlCompute.provisioning_configuration(vm_size=vm_size, max_nodes=max_nodes)
        return ComputeTarget.create(ws, cluster_name, provisioning)

In [None]:
# Fetch the "cpu-cluster" training cluster (created on demand by get_aml_cluster),
# then wait for the pending operation to complete while printing its output.
aml_cluster = get_aml_cluster(ws, "cpu-cluster")
aml_cluster.wait_for_completion(show_output=True)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the raw training data and drop the passenger id column.
df = pd.read_csv('data/train.csv')
df = df.drop(columns=['PassengerId'])

# 'Embarked' is stored as letters with some missing values. Fill the gaps with an
# explicit 'Null' category by plain assignment: the chained
# `df['Embarked'].fillna(..., inplace=True)` form operates on a temporary under
# pandas Copy-on-Write and would leave the NaNs in `df` for the encoder to choke on.
df['Embarked'] = df['Embarked'].fillna('Null')

# Fit a label encoder on the train set and keep it around so the same
# letter -> integer mapping can be reused later.
embarked_encoder = LabelEncoder()
embarked_encoder.fit(df['Embarked'])

# Record anyone travelling alone (no siblings/spouses and no parents/children)
df['Alone'] = (df['SibSp'] == 0) & (df['Parch'] == 0)

# Transform 'Embarked' to its integer labels
df['Embarked'] = embarked_encoder.transform(df['Embarked'])

# Transform 'Sex' to 0 (female) / 1 (male). Mapping the whole column avoids writing
# integers into a string column through .loc; any unexpected value becomes NaN and
# makes the int8 cast fail loudly instead of slipping through.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype('int8')

# Drop features that seem unusable
df = df.drop(columns=['Name', 'Ticket', 'Cabin'])

In [None]:
import os
from azureml.core import Dataset

def df_to_dataset(ws, df, name):
    """Register `df` as a tabular dataset called `name` in the workspace's default datastore.

    Args:
        ws: Workspace whose default datastore is the registration target.
        df: pandas DataFrame to register.
        name: Name under which the dataset is registered.

    Returns:
        Whatever ``Dataset.Tabular.register_pandas_dataframe`` returns for the
        registered frame.
    """
    return Dataset.Tabular.register_pandas_dataframe(df, ws.get_default_datastore(), name)

In [None]:
# Register the cleaned frame as the 'titanic_cleaned' dataset
ds = df_to_dataset(ws, df, name='titanic_cleaned')

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
 
def get_run_config(target, packages=None):
    """Build a run configuration for `target` with a pip-based conda environment.

    Args:
        target: Compute target assigned to the configuration's ``target``.
        packages: Optional iterable of pip package names. It is copied, never
            modified, so the caller's list can safely be reused.

    Returns:
        A RunConfiguration whose conda dependencies are created from the
        requested pip packages plus 'azureml-defaults'.
    """
    # Work on a copy: the previous `packages += [...]` extended the caller's own
    # list in place (appending another 'azureml-defaults' on every call) and
    # raised TypeError for tuples.
    pip_packages = list(packages) if packages is not None else []
    if 'azureml-defaults' not in pip_packages:
        pip_packages.append('azureml-defaults')

    config = RunConfiguration()
    config.target = target
    config.environment.python.conda_dependencies = CondaDependencies.create(pip_packages=pip_packages)
    return config

In [None]:
# Pip packages requested for the remote training environment
lgbm_packages = [
    'numpy', 'pandas', 'matplotlib', 'seaborn', 'scikit-learn', 'joblib', 'lightgbm'
]

# Remote run configuration targeting the training cluster
lgbm_config = get_run_config(aml_cluster, lgbm_packages)

In [None]:
# Command-line options for the training script, kept as flag -> value pairs
hyperparameters = {
    '--boosting': 'dart',
    '--learning-rate': '0.05',
    '--drop-rate': '0.15',
}

# Flat argument list: the registered dataset as the named input 'titanic',
# followed by each flag and its value in declaration order.
script_params = ['--data', ds.as_named_input('titanic')] + [
    token for pair in hyperparameters.items() for token in pair
]

In [None]:
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Training entry point and the directory handed over as the run's source directory
script_folder = os.getcwd()
script = 'train_lightgbm.py'

# Bundle the script, its run configuration and its arguments into one config
src = ScriptRunConfig(
    source_directory=script_folder,
    script=script,
    arguments=script_params,
    run_config=lgbm_config,
)

# Submit to the experiment and show the run details widget
run = exp.submit(src)
RunDetails(run).show()

In [None]:
# Print the portal URL of the submitted run
portal_url = run.get_portal_url()
print(portal_url)
