**Automated ML**

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [None]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
# Printed so the SDK version this notebook ran against is recorded in the cell output.
print("SDK version:", azureml.core.VERSION)

**Dataset**

***Overview***

TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.

TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [None]:
# Load the workspace from its saved configuration and echo its identifying
# fields, one per line, so the reader can confirm the right workspace is in use.
ws = Workspace.from_config()
print("\n".join(str(field) for field in (ws.name,
                                         ws.resource_group,
                                         ws.location,
                                         ws.subscription_id)))

**Create or Attach an AmlCompute cluster**

You will need to create a compute target for your AutoML run. In this tutorial, you get the default AmlCompute as your training compute resource.

**Udacity note:** There is no need to create a new compute target; the notebook can reuse the existing cluster.

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# NOTE: update the cluster name to match the existing cluster
# Choose a name for your CPU cluster
amlcompute_cluster_name = "automl-cluster"

# Reuse the cluster when it already exists in the workspace; otherwise
# provision a new one. ComputeTarget(...) raises ComputeTargetException when
# no compute with that name is found.
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',  # for GPU, use "STANDARD_NC6"
        # vm_priority='lowpriority',  # optional
        max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

# min_node_count=None waits for the cluster's own configured minimum. The
# provisioning config above sets no minimum, so asking for 1 node would exceed
# it and an idle cluster would only "complete" once the timeout expired.
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=10)
# For a more detailed view of current AmlCompute status, use get_status().

**Data**

**Udacity note:**

Make sure the key matches the name of the uploaded dataset, and that the description matches as well. If the name is unknown or hard to find, loop over `ws.datasets.keys()` and `print()` each key. If the dataset is not found because it was deleted, it can be recreated from the CSV link used in the cell below.

In [None]:
# Try to load the dataset from the Workspace. Otherwise, create it from the file
# NOTE: update the key to match the dataset name
key = "LV_cleaned_data"
description_text = "LV hotel reviews from trip advisor"

if key in ws.datasets:
    # Already registered in this workspace: reuse it.
    dataset = ws.datasets[key]
else:
    # Create the AML TabularDataset from the external CSV and register it in
    # the workspace so later runs can find it by name.
    LV_cleaned_data = 'https://raw.githubusercontent.com/Kbhamidipa3/udacityazure_capstone_final/main/LasVegasTripAdvisorReviews-Dataset.csv'
    dataset = Dataset.Tabular.from_delimited_files(LV_cleaned_data)
    dataset = dataset.register(workspace=ws,
                               name=key,
                               description=description_text)

# Full dataset as a pandas DataFrame for local inspection.
# The stray ScriptRunConfig(...) call that used to follow was removed: the class
# is never imported (NameError), its result was unused, and it named a
# different compute target than the one configured in this notebook.
df = dataset.to_pandas_dataframe()

**Review the Dataset Result**

You can peek the result of a TabularDataset at any range using skip(i) and take(j).to_pandas_dataframe(). Doing so evaluates only j records for all the steps in the TabularDataset, which makes it fast even against large datasets.

TabularDataset objects are composed of a list of transformation steps (optional).

In [None]:
# Preview only the first five records; the bare expression is the cell output.
dataset.take(count=5).to_pandas_dataframe()

**Split into train and test data**

In [None]:
# Split the registered TabularDataset into train and test portions. The fixed
# seed keeps the split reproducible across kernel restarts.
train_data, test_data = dataset.random_split(percentage=0.75, seed=223)

# Materialise the training split locally so it can be uploaded as a CSV.
# (The previous version called helpers on undefined names and used methods
# that TabularDataset does not provide, so this cell could not run.)
data_train_df = train_data.to_pandas_dataframe()

os.makedirs('data_train', exist_ok=True)

local_path = './data_train/data_train.csv'
# index=False: otherwise the row index is written as an extra unnamed column
# that would show up as a spurious feature when the CSV is read back.
data_train_df.to_csv(local_path, index=False)

# get the datastore to upload prepared data
datastore = ws.get_default_datastore()

# upload the local file from src_dir to the target_path in datastore;
# overwrite=True so a re-run replaces a stale copy instead of keeping it
datastore.upload(src_dir='data_train', target_path='data_train', overwrite=True)

# create a dataset referencing the cloud location
data_train = Dataset.Tabular.from_delimited_files(path=[(datastore, 'data_train/data_train.csv')])

**AutoML Configuration**

TODO: Explain why you chose the AutoML settings and configuration you used below.

In [None]:
# Local folder handed to AutoMLConfig as its project path. It was previously
# referenced without ever being defined.
project_folder = './lv-automl-classification'
os.makedirs(project_folder, exist_ok=True)

automl_settings = {
    "experiment_timeout_minutes": 20,
    # Kept at the cluster's max_nodes (4) so concurrent iterations do not
    # exceed the nodes available to run them.
    "max_concurrent_iterations": 4,
    "primary_metric": 'accuracy'
}

# Train on the training split only. Passing the full dataset would let the
# model see the rows later used as test data and inflate the test metrics.
automl_config = AutoMLConfig(compute_target=compute_target,
                             task="classification",
                             training_data=train_data,
                             label_column_name="Score",
                             path=project_folder,
                             enable_early_stopping=True,
                             featurization='auto',
                             debug_log="automl_errors.log",
                             **automl_settings
                            )

**Submit your automl run**

In [None]:
# Create (or reuse) the experiment in the workspace and submit the AutoML
# configuration to it, streaming the run output into the notebook.
experiment_name = 'lv-automl-classification'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=automl_config, show_output=True)

**Results**

***Widget for Monitoring Runs***

The widget will first report a "loading" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.

Note: The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details

In [None]:
from azureml.widgets import RunDetails

# Build the monitoring widget for the submitted AutoML run, then render it.
run_details_widget = RunDetails(run)
run_details_widget.show()

**Analyze results**

***Retrieve the Best Model***

Below we select the best pipeline from our iterations. The `get_output` method on the AutoML `run` returns the best run and the fitted model for the last invocation. Overloads on `get_output` allow you to retrieve the best run and fitted model for any logged metric or for a particular iteration.

In [None]:
# Unpack the best child run and its fitted model from the AutoML run, then
# print each on its own line.
best_run, fitted_model = run.get_output()
print(best_run, fitted_model, sep="\n")

**Tests**

Now that the model is trained, convert the held-out test split to a pandas DataFrame (this conversion happens locally) and then run the test data through the trained model to get the predicted values.

In [None]:
# convert the test data to dataframe
# TabularDataset has no pop()/attribute-style column access, so materialise the
# split once as a pandas DataFrame and separate features from the label there.
test_df = test_data.to_pandas_dataframe()
X_test_df = test_df.drop(columns=['Score'])
y_test_df = test_df['Score']

In [None]:
# call the predict functions on the model
# Scores the held-out feature frame with the fitted model returned by get_output().
y_pred = fitted_model.predict(X_test_df)
# Bare expression so the notebook displays the predictions as the cell output.
y_pred

**Calculate metrics for the prediction**

Now visualize the results as a confusion matrix to show how the truth (actual) values compare to the predicted values from the trained model that was returned.

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
import itertools

# Derive the class labels from the data (sorted union of actual and predicted
# values) instead of hard-coding a binary False/True pair, so the plot has the
# right number of rows/columns for however many Score classes are present.
class_labels = np.union1d(y_test_df.values, y_pred)
cf = confusion_matrix(y_test_df.values, y_pred, labels=class_labels)

fig, ax = plt.subplots()
heatmap = ax.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
fig.colorbar(heatmap, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

# One tick per class on both axes, labelled with the class value.
tick_marks = np.arange(len(class_labels))
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_labels)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_labels)

# plotting text value inside cells; white text on dark cells, black on light
thresh = cf.max() / 2.
for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
    ax.text(j, i, format(cf[i, j], 'd'),
            horizontalalignment='center',
            color='white' if cf[i, j] > thresh else 'black')
plt.show()

**Run the explanation**

***Download the engineered feature importance from artifact store***

You can use ExplanationClient to download the engineered feature explanations from the artifact store of the best_run. You can also use the Azure portal URL to view the dashboard visualization of the feature importance values of the engineered features.

In [None]:
# ExplanationClient was used here without ever being imported; bring it into
# scope from the same azureml.interpret package the later cells rely on.
from azureml.interpret import ExplanationClient

# Download the engineered-feature explanation stored with the best run.
client = ExplanationClient.from_run(best_run)
engineered_explanations = client.download_model_explanation(raw=False)
print(engineered_explanations.get_feature_importance_dict())
print("You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\n" + best_run.get_portal_url())

**Download the raw feature importance from artifact store**

You can use ExplanationClient to download the raw feature explanations from the artifact store of the best_run. You can also use the Azure portal URL to view the dashboard visualization of the feature importance values of the raw features.

In [None]:
# Same client as the engineered download, this time requesting the
# explanation expressed in terms of the raw (original) features.
raw_explanations = client.download_model_explanation(raw=True)
print(raw_explanations.get_feature_importance_dict())
print(f"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\n{best_run.get_portal_url()}")

**Retrieve any other AutoML model from training**

In [None]:
# Retrieve the best run and fitted model as ranked by the 'accuracy' metric.
# NOTE: this rebinds fitted_model; automl_run is the run object the later
# explanation and registration cells upload to.
automl_run, fitted_model = run.get_output(metric='accuracy')

**Setup the model explanations for AutoML models**

The fitted_model can generate the following, which will be used for getting the engineered explanations using automl_setup_model_explanations:

- Featurized data from train samples/test samples
- Engineered feature name lists
- The classes in your labeled column in classification scenarios

The automl_explainer_setup_obj contains all the structures from the above list.

In [None]:
# Build the explanation inputs from the train/test splits created earlier.
# The names previously used here (training_data, validation_data,
# label_column_name) were never defined in this notebook; the label column is
# 'Score', matching the AutoML configuration.
X_train = train_data.drop_columns(columns=['Score'])
y_train = train_data.keep_columns(columns=['Score'], validate=True)
X_test = test_data.drop_columns(columns=['Score'])

In [None]:
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations

# Bundle the fitted model with the train/test inputs into the setup object
# that the explainer cells below read their fields from.
automl_explainer_setup_obj = automl_setup_model_explanations(
    fitted_model,
    X=X_train,
    y=y_train,
    X_test=X_test,
    task='classification',
)

**Initialize the Mimic Explainer for feature importance**

For explaining the AutoML models, use the MimicWrapper from azureml-interpret package. The MimicWrapper can be initialized with fields in automl_explainer_setup_obj, your workspace and a surrogate model to explain the AutoML model (fitted_model here). The MimicWrapper also takes the automl_run object where engineered explanations will be uploaded.

In [None]:
from interpret.ext.glassbox import LGBMExplainableModel
from azureml.interpret.mimic_wrapper import MimicWrapper

# Every argument except the workspace and the run comes from the setup object
# produced by automl_setup_model_explanations; explanations are attached to
# automl_run.
explainer = MimicWrapper(
    ws,
    automl_explainer_setup_obj.automl_estimator,
    explainable_model=automl_explainer_setup_obj.surrogate_model,
    init_dataset=automl_explainer_setup_obj.X_transform,
    run=automl_run,
    features=automl_explainer_setup_obj.engineered_feature_names,
    feature_maps=[automl_explainer_setup_obj.feature_map],
    classes=automl_explainer_setup_obj.classes,
    explainer_kwargs=automl_explainer_setup_obj.surrogate_model_params,
)

**Use Mimic Explainer for computing and visualizing engineered feature importance**

The explain() method in MimicWrapper can be called with the transformed test samples to get the feature importance for the generated engineered features. You can also use the Azure portal URL to view the dashboard visualization of the feature importance values of the engineered features.

In [None]:
# Compute the engineered explanations (both local and global) on the
# transformed test samples, then print the importances and the portal link.
engineered_explanations = explainer.explain(
    ['local', 'global'],
    eval_dataset=automl_explainer_setup_obj.X_test_transform,
)
print(engineered_explanations.get_feature_importance_dict())
print(f"You can visualize the engineered explanations under the 'Explanations (preview)' tab in the AutoML run at:-\n{automl_run.get_portal_url()}")

**Use Mimic Explainer for computing and visualizing raw feature importance**

The explain() method in MimicWrapper can be called with the transformed test samples to get the feature importance for the original features in your data. You can also use the Azure portal URL to view the dashboard visualization of the feature importance values of the original/raw features.

In [None]:
# Compute the raw explanations: same explainer, but with get_raw=True and the
# raw feature names / raw test samples supplied alongside the transformed ones.
raw_explanations = explainer.explain(
    ['local', 'global'],
    get_raw=True,
    raw_feature_names=automl_explainer_setup_obj.raw_feature_names,
    eval_dataset=automl_explainer_setup_obj.X_test_transform,
    raw_eval_dataset=automl_explainer_setup_obj.X_test_raw,
)
print(raw_explanations.get_feature_importance_dict())
print(f"You can visualize the raw explanations under the 'Explanations (preview)' tab in the AutoML run at:-\n{automl_run.get_portal_url()}")

**Initialize the scoring Explainer, save and upload it for later use in scoring explanation**

In [None]:
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer
import joblib

# Initialize the ScoringExplainer
scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])

# Pickle scoring explainer locally to './scoring_explainer.pkl'
scoring_explainer_file_name = 'scoring_explainer.pkl'
# Path of the explainer inside the run's artifact store; used for both the
# upload and the registration so the two always agree.
scoring_explainer_artifact_path = 'outputs/scoring_explainer.pkl'
with open(scoring_explainer_file_name, 'wb') as stream:
    joblib.dump(scoring_explainer, stream)

# Register trained automl model present in the 'outputs' folder in the artifacts
original_model = automl_run.register_model(model_name='automl_model',
                                           model_path='outputs/model.pkl')

# Upload the scoring explainer to the automl run, then register it from the
# same artifact path. Previously it was registered from 'scoring_explainer.pkl',
# which is the local file name, not the path it was uploaded under.
automl_run.upload_file(scoring_explainer_artifact_path, scoring_explainer_file_name)
scoring_explainer_model = automl_run.register_model(model_name='scoring_explainer',
                                                    model_path=scoring_explainer_artifact_path)

**Create the conda dependencies for setting up the service**

In [None]:
from azureml.core.conda_dependencies import CondaDependencies

# pip-installed Azure ML packages for the scoring environment
azureml_pip_packages = ['azureml-interpret', 'azureml-train-automl', 'azureml-defaults']

# Conda specification for the service: conda packages plus the pip list above.
myenv = CondaDependencies.create(
    conda_packages=['scikit-learn', 'pandas', 'numpy', 'py-xgboost<=0.80'],
    pip_packages=azureml_pip_packages,
    pin_sdk_version=True,
)

# Write the specification to myenv.yml ...
with open("myenv.yml", "w") as f:
    f.write(myenv.serialize_to_string())

# ... and read the file back so the cell output shows exactly what was saved.
with open("myenv.yml", "r") as f:
    print(f.read())

**Deploy the service**

In [None]:
from azureml.core.webservice import Webservice
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model, InferenceConfig
from azureml.core.environment import Environment

# ACI deployment settings. The description now names this notebook's dataset;
# it previously said "Bank marketing", left over from a different sample.
aciconfig = AciWebservice.deploy_configuration(cpu_cores=1,
                                               memory_gb=1,
                                               tags={"data": "LV_cleaned_data",
                                                     "method": "local_explanation"},
                                               description='Get local explanations for LV hotel review test data')

# Scoring environment built from the conda file written in the previous cell.
# NOTE: this rebinds myenv from the CondaDependencies object to an Environment.
myenv = Environment.from_conda_specification(name="myenv", file_path="myenv.yml")
inference_config = InferenceConfig(entry_script="score_local_explain.py", environment=myenv)

# Use configs and models generated above
service = Model.deploy(ws,
                       'model-scoring',
                       [scoring_explainer_model, original_model],
                       inference_config,
                       aciconfig)
service.wait_for_deployment(show_output=True)

**View the service logs**

In [None]:
# Print the logs rather than leaving them as a bare expression: the notebook
# would otherwise show the string's repr, with newlines escaped as "\n".
print(service.get_logs())

**Inference with test data**

In [None]:
if service.state == 'Healthy':
    # Serialize the first row of the test data into json.
    # X_test is a TabularDataset (built with drop_columns), which cannot be
    # sliced with [:1]; take one record and convert it to pandas first.
    X_test_json = X_test.take(1).to_pandas_dataframe().to_json(orient='records')
    print(X_test_json)
    # Call the service to get the predictions and the engineered explanations
    output = service.run(X_test_json)
    # Print the predicted value
    print(output['predictions'])
    # Print the engineered feature importances for the predicted value
    print(output['engineered_local_importance_values'])
else:
    # Make it visible when the request was skipped instead of silently doing nothing.
    print(f"Service is not healthy (state: {service.state}); skipping inference.")

**Delete the service.**

In [None]:
# Delete the web service deployed by Model.deploy earlier in this notebook.
service.delete()

**Delete the cluster at the end of the run**

In [None]:
# The cluster handle in this notebook is compute_target (set in the compute
# setup cell); cpu_cluster was never defined and raised a NameError.
compute_target.delete()