# A machine learning decision tree approach

The iMeta algorithm is essentially a decision tree algorithm, where the variables and thresholds for the decisions at each step are manually specified based on human analysis. The simplest way to apply machine learning techniques to the problem is to use a structure similar to iMeta, i.e. a decision tree, but to use standard ML training techniques to learn the parameters, such as which thresholds to use and how many branches/leaves the tree should have for the best results. 

In [1]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this point is suppressed.
# NOTE(review): presumably done to keep the notebook output uncluttered; it also hides
# deprecation warnings from the libraries imported below.
warnings.filterwarnings('ignore')
import os
import sys
import pathlib

In [2]:
import ipywidgets
import time

In [3]:
import pandas
import numpy
import matplotlib
import matplotlib.pyplot

In [4]:
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.tree
import sklearn.metrics

In [5]:
# The repository root is taken to be the parent of the current working directory.
# Put it at the front of the module search path so that imports look there first.
root_repo_dir = pathlib.Path.cwd().parent
sys.path.insert(0, os.fspath(root_repo_dir))

In [6]:
from azureml.train.sklearn import SKLearn

In [8]:
import dataexploration.xbt_dataset
from dataexploration.xbt_dataset import XbtDataset, UNKNOWN_STR, cat_output_formatter, check_value_found
from classification.xbt_azureml import AzureDataset
from classification.imeta import imeta_classification, XBT_MAX_DEPTH

In [9]:
# Site-specific setup: the XBT_ENV_NAME environment variable names the environment
# this notebook runs in; fall back to 'azureml' when the variable is not set.
environment = os.environ.get('XBT_ENV_NAME', 'azureml')

In [10]:
# AZURE ML SPECIFIC
# Compute cluster settings: the cluster name to look up (or create), the VM size
# and the maximum number of nodes used when provisioning it.
xbt_compute_cluster_name = 'xbt-cluster'
xbt_vm_size = 'STANDARD_D2_V2'
xbt_max_nodes = 4

# Per-user working directory under which the Azure ML data directory is located.
azure_working_root = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/xbt-test1/code/Users/stephen.haddad'


In [11]:
# Root directory of the XBT data for each supported environment name.
root_data_dirs = dict(
    MO_scitools='/data/users/shaddad/xbt-data/',
    pangeo='/data/misc/xbt-data/',
    azureml=os.path.join(azure_working_root, 'xbt-data'),
)

# (first, last) pair of years for each supported environment name.
env_date_ranges = dict(
    MO_scitools=(1966, 2015),
    pangeo=(1966, 2015),
    azureml=(1966, 2015),
)

In [12]:
# Dataset-specific parameters: pick the data root and the year range that belong
# to the environment selected earlier in the notebook.
root_data_dir, year_range = (
    root_data_dirs[environment],
    env_date_ranges[environment],
)

In [13]:
# Labels identifying this experiment and the classifier it uses.
experiment_name = 'nb_azml_single_decisionTree_country'
classifier_name = 'decision_tree'
suffix = 'country'

# Classifier type together with the keyword options for constructing it.
classifier_class = sklearn.tree.DecisionTreeClassifier
classifier_opts = dict(
    max_depth=20,
    min_samples_leaf=1,
    criterion='gini',
)

In [14]:
# Names of the cross-validation metrics: the weighted variant of each base score.
cv_metric_names = [
    f'{base_metric}_weighted'
    for base_metric in ('f1', 'precision', 'recall')
]

# Names of the input features.
input_feature_names = [
    'country',
    'max_depth',
    'year',
]

In [15]:
# Directory names for the experiment outputs and for the input data.
exp_out_dir_name = 'experiment_outputs'
input_dir_name = 'csv_with_imeta'

In [16]:
# JSON experiment definition, located in the 'examples' directory of the repository.
exp_json_filename = 'xbt_param_decisionTree_country_dev.json'
exp_json_path = os.path.join(root_repo_dir, 'examples', exp_json_filename)

In [17]:
# AZURE ML SPECIFIC
from azureml.core import Workspace, Dataset

# Identifiers of the Azure ML workspace used by this notebook.
workspace_name = 'stephenHaddad_xbt_europeWest'
resource_group = 'AWSEarth'
subscription_id = '1fedcbc3-e156-45f5-a034-c89c2fc0ac61'

# Handle on the workspace; every Azure ML object created below hangs off it.
xbt_workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
# AZURE ML SPECIFIC
from azureml.core import Experiment

# Experiment object in the workspace, identified by the experiment name chosen above;
# the run is submitted through it later in the notebook.
experiment = Experiment(xbt_workspace, experiment_name)

## Prepare/access the compute
If we want to use an AzureML cluster for training, cross-validation, hyperparameter tuning, etc., we need to create an object to access (and potentially start up) a suitable compute cluster.



In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Look the cluster up by name in the workspace; if the lookup raises
# ComputeTargetException, provision a new cluster with the configured size instead.
try:
    compute_target = ComputeTarget(workspace=xbt_workspace, name=xbt_compute_cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=xbt_vm_size,
        max_nodes=xbt_max_nodes,
    )

    # Create the cluster.
    compute_target = ComputeTarget.create(
        xbt_workspace,
        xbt_compute_cluster_name,
        compute_config,
    )

    # Wait for provisioning, with a 20 minute timeout. No minimum node count is
    # given, so the scale settings of the cluster are used.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

# Print the detailed status of the cluster we ended up with.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

## Run the XBT training/evaluation script

In [None]:
# Input and output directories handed to the training script. Neither name was
# defined anywhere in the notebook, so this cell failed with a NameError; they are
# built here from the data root and the directory names configured above.
# NOTE(review): these are paths as seen from the notebook host - confirm they are
# also valid on the compute cluster (see the dataset-mount TODO further down).
xbt_input_dir = os.path.join(root_data_dir, input_dir_name)
xbt_output_dir = os.path.join(root_data_dir, exp_out_dir_name)

# Command-line arguments passed to the entry script of the run.
script_params = {
    '--input-path': xbt_input_dir,
    '--output-path': xbt_output_dir,
    '--json-experiment': exp_json_path,
}

# Pinned conda packages passed to the estimator for the run environment.
conda_packages = ['python=3.8',
                  'joblib=0.13.2',
                  'pandas=1.0.1',
                  'scikit-learn=0.22.1',
                  'iris=2.4',
                 ]



In [None]:
# Script handed to the estimator as its entry script; the estimator's source
# directory is the repository root, so the path is given relative to that.
launch_script = 'bin/run_azml_experiment'


In [None]:
# We need to upload the experiment file to a blob registered to our datastore.
# Then we need to pass in the relevant info to find the experiment definition through the tags,
# which will point to the datastore and the blob within it.
# The Azure ML experiment will need to get this file (download it perhaps?),
# or it can just be copied with the project directory and read from examples?

TODO: make this work on AzML
* Use a dataset mount to mount the dataset. This will need to happen on the compute cluster; the data does not need to be downloaded.
  * Also use the mount in the notebooks with local compute.
* Make sure the source code is copied through the `source_directory` argument.
* Ensure the correct conda packages are used, and that they are the same for the compute instance and the cluster.
  * Might need Docker longer term.
* Upload the experiment file to be used to a run, and then get that file inside the script.

In [None]:
# Scikit-learn estimator describing the run: the repository root is the source
# directory, and the entry script, its arguments, the conda packages and the
# compute target all come from the settings defined earlier in the notebook.
estimator = SKLearn(
    source_directory=str(root_repo_dir),
    entry_script=launch_script,
    script_params=script_params,
    conda_packages=conda_packages,
    compute_target=compute_target,
)

In [None]:
# Submit the estimator to the experiment; the returned run object is what the
# following cells query for output files and monitor.
xbt_single_run = experiment.submit(estimator)

In [None]:
# Download the first file listed for the run.
first_run_file_name = xbt_single_run.get_file_names()[0]
xbt_single_run.download_file(first_run_file_name)

In [None]:
# Display the name of the first file listed for the run (the one downloaded above).
xbt_single_run.get_file_names()[0]

In [None]:
# Display the entries of the current working directory.
os.listdir()

In [None]:
from azureml.widgets import RunDetails

# Show the run-details widget for the submitted run.
run_details_widget = RunDetails(xbt_single_run)
run_details_widget.show()