In [15]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.48.0 to work with project


In [16]:
from azureml.core import Dataset
from azureml.data.datapath import DataPath

default_ds = ws.get_default_datastore()

if 'churn dataset' not in ws.datasets:
    Dataset.File.upload_directory(src_dir='data',
                              target=DataPath(default_ds, 'churn-data/')
                              )

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'churn-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='telco dataset',
                                description='chrun data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Validating arguments.
Arguments validated.
Uploading file to churn-data/
Uploading an estimated of 3 files
Target already exists. Skipping upload for churn-data/.amlignore
Target already exists. Skipping upload for churn-data/.amlignore.amltmp
Target already exists. Skipping upload for churn-data/telco-csv.csv
Uploaded 0 files
Creating new dataset
Dataset registered.


In [17]:
import os

experiment_folder = 'churn_training-hyperdrive'
os.makedirs(experiment_folder, exist_ok=True)

print('Folder ready.')

Folder ready.


In [24]:
%%writefile $experiment_folder/churn_training.py
# Import libraries
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve

# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters
parser.add_argument('--learning_rate', type=float, dest='learning_rate', default=0.1, help='learning rate')
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')
#parser.add_argument('--min_samples_leaf', type=float, dest='min_samples_leaf', default=1, help='min_samples_leaf')
#parser.add_argument('--max_depth', type=int, dest='max_depth', default=1, help='max_depth')

# Add arguments to args collection
args = parser.parse_args()

# Log Hyperparameter values
run.log('learning_rate',  np.float(args.learning_rate))
run.log('n_estimators',  np.int(args.n_estimators))
#run.log('min_samples_leaf',  np.int(args.min_samples_leaf))
#run.log('max_depth',  np.int(args.max_depth))

# load the churn dataset
print("Loading Data...")
churn = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input

churn2 = churn.copy()

# drop those colume have null values
churn2 = churn2.drop(['loglong','logtoll','logequi','logcard','logwire','lninc'],axis = 1)

# separating num, categorical and target
numerical = ["tenure","age","address","income","employ","reside","longmon","tollmon","equipmon","cardmon","wiremon","longten","tollten","equipten",'cardten',"wireten"]
categorical =["marital","retire","gender","voice","pager","internet","callid","callwait","forward","confer","ebill","ed","tollfree","equip","callcard","wireless","multline","region","custcat"]
target = ['churn']

# convert categorical to num
for x in churn2.columns:
    if churn2[x].dtypes == 'object':
        churn2[x] = pd.Categorical(churn[x]).codes


# Separate features and labels
X, y = churn2[['region', 'tenure', 'age', 'marital', 'address', 'income', 'ed',
       'employ', 'retire', 'gender', 'reside', 'tollfree', 'equip', 'callcard',
       'wireless', 'longmon', 'tollmon', 'equipmon', 'cardmon', 'wiremon',
       'longten', 'tollten', 'equipten', 'cardten', 'wireten', 'multline',
       'voice', 'pager', 'internet', 'callid', 'callwait', 'forward', 'confer',
       'ebill',
       'custcat']].values, churn2['churn'].values

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a Gradient Boosting classification model with the specified hyperparameters
print('Training a classification model')
model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                   n_estimators=args.n_estimators).fit(X_train, y_train)
#model = DecisionTreeClassifier(min_samples_leaf=args.min_samples_leaf,
#                                max_depth=args.max_depth).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/churn_model.pkl')

run.complete()

Overwriting churn_training-hyperdrive/churn_training.py


In [19]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "your-churn-compute-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [20]:
%%writefile $experiment_folder/experiment_churn_env.yml
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting churn_training-hyperdrive/experiment_churn_env.yml


In [25]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
hyper_env = Environment.from_conda_specification("experiment_churn_env", experiment_folder + "/experiment_churn_env.yml")

# Get the training dataset
churn_ds = ws.datasets.get("telco dataset")

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='churn_training.py',
                                # Add non-hyperparameter arguments -in this case, the training dataset
                                arguments = ['--input-data', churn_ds.as_named_input('training_data')],
                                environment=hyper_env,
                                compute_target = pipeline_cluster)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # Hyperdrive will try 6 combinations, adding these as script arguments
        '--learning_rate': choice(0.01, 0.1, 1.0),
        '--n_estimators' : choice(10, 100)
    }
)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, # No early stopping policy
                          primary_metric_name='AUC', # Find the highest AUC metric
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=6, # Restict the experiment to 6 iterations
                          max_concurrent_runs=2) # Run up to 2 iterations in parallel

# Run the experiment
experiment = Experiment(workspace=ws, name='mslearn-churn-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8',
 'target': 'your-churn-compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2023-02-03T12:19:41.552392Z',
 'endTimeUtc': '2023-02-03T12:23:46.031536Z',
 'services': {},
 'properties': {'primary_metric_config': '{"name":"AUC","goal":"maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '6d5fe175-4d3b-419a-9667-d021a3f80e55',
  'user_agent': 'python/3.8.10 (Linux-5.15.0-1031-azure-x86_64-with-glibc2.17) msrest/0.7.1 Hyperdrive.Service/1.0.0 Hyperdrive.SDK/core.1.48.0',
  'space_size': '6',
  'score': '0.7763447971781304',
  'best_child_run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_1',
  'best_metric_status': 'Succeeded',
  'best_data_container_id': 'dcid.HD_ea630086-6c76-4ae8-9e34-c3354d595be8_1'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'configuration': None,
  'attribution': None,
  'telemetryValues': {'

KeyError: 'log_files'

In [26]:
# Print all child runs, sorted by the primary metric
for child_run in run.get_children_sorted_by_primary_metric():
    print(child_run)

# Get the best run, and its metrics and arguments
best_run = run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
script_arguments = best_run.get_details() ['runDefinition']['arguments']
print('Best Run Id: ', best_run.id)
print(' -AUC:', best_run_metrics['AUC'])
print(' -Accuracy:', best_run_metrics['Accuracy'])
print(' -Arguments:',script_arguments)

{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_1', 'hyperparameters': '{"--learning_rate": 0.01, "--n_estimators": 100}', 'best_primary_metric': 0.7763447971781304, 'status': 'Completed'}
{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_2', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 10}', 'best_primary_metric': 0.7736717372134039, 'status': 'Completed'}
{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_0', 'hyperparameters': '{"--learning_rate": 0.01, "--n_estimators": 10}', 'best_primary_metric': 0.7619598765432098, 'status': 'Completed'}
{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_3', 'hyperparameters': '{"--learning_rate": 0.1, "--n_estimators": 100}', 'best_primary_metric': 0.7595348324514991, 'status': 'Completed'}
{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3354d595be8_4', 'hyperparameters': '{"--learning_rate": 1.0, "--n_estimators": 10}', 'best_primary_metric': 0.7152226631393297, 'status': 'Completed'}
{'run_id': 'HD_ea630086-6c76-4ae8-9e34-c3