# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [1]:
from azureml.core import Dataset, Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

## Dataset

### Overview

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'jobchangeautoml'

experiment=Experiment(ws, experiment_name)

In [3]:
dataset = Dataset.get_by_name(ws, 'traindata')
#dataset = dataset.to_pandas_dataframe().drop('enrollee_id',axis=1)
dataset

{
  "source": [
    "('workspaceblobstore', 'UI/01-31-2021_101329_UTC/aug_train.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "87499c16-f708-47cf-9c5b-83d2f2202e74",
    "name": "traindata",
    "version": 1,
    "workspace": "Workspace.create(name='quick-starts-ws-136708', subscription_id='a24a24d5-8d87-4c8a-99b6-91ed2d2df51f', resource_group='aml-quickstarts-136708')"
  }
}

In [4]:
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D12_V2',
                                                           max_nodes=5)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [5]:
# TODO: Put your automl settings here
automl_settings = {
    "experiment_timeout_minutes": 15,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 5
}
# TODO: Put your automl config here
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="target", 
                             enable_early_stopping= True,
                             featurization= 'auto',
                             **automl_settings
                            )

In [6]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output = True)


Running on remote.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster
Parent Run ID: AutoML_4e968e5c-5cbd-4b5a-8325-df62fb150e34

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expe

## Run Details


In [7]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
jobchangeautoml,AutoML_4e968e5c-5cbd-4b5a-8325-df62fb150e34,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [9]:
remote_run.wait_for_completion()


{'runId': 'AutoML_4e968e5c-5cbd-4b5a-8325-df62fb150e34',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-01-31T10:20:43.853768Z',
 'endTimeUtc': '2021-01-31T10:47:19.04084Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"beef11ba-a4fb-4369-a12f-dc43cfc7c044\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"UI/01-31-2021_101329_UTC/aug_train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-136708\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"a24a24d5-8d87-4c8a-99b6-91ed2d2df51f\\

## Best Model

In [10]:
best_run, fitted_model = remote_run.get_output()
best_run_metrics = best_run.get_metrics()

In [11]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
jobchangeautoml,AutoML_4e968e5c-5cbd-4b5a-8325-df62fb150e34_41,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [12]:
fitted_model

PipelineWithYTransformations(Pipeline={'memory': None,
                                       'steps': [('datatransformer',
                                                  DataTransformer(enable_dnn=None,
                                                                  enable_feature_sweeping=None,
                                                                  feature_sweeping_config=None,
                                                                  feature_sweeping_timeout=None,
                                                                  featurization_config=None,
                                                                  force_text_dnn=None,
                                                                  is_cross_validation=None,
                                                                  is_onnx_compatible=None,
                                                                  logger=None,
                                                              

In [13]:
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print(fitted_model._final_estimator)
print(best_run.get_tags())

Best Run Id:  AutoML_4e968e5c-5cbd-4b5a-8325-df62fb150e34_41

 Accuracy: 0.7991443007788893
PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('28',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=0.7922222222222222,
                                                                               importance_type='split',
                                                                         

In [14]:
import os
import joblib

os.makedirs('./outputs', exist_ok=True)

joblib.dump(fitted_model, filename='outputs/automl.joblib')

model_name = best_run.properties['model_name']
model_name

'AutoML4e968e5c541'

In [15]:
env = best_run.get_environment()

script_file = 'score.py'

best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.


In [32]:
from azure.automl.core.shared import constants
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'myenv.yml')

ModuleNotFoundError: No module named 'azure.automl'

In [16]:
#Register the fitted model
model = remote_run.register_model(model_name = model_name, #AutoMLac80c775a36
                                  description = 'AutoML model')

In [17]:
inference_config = InferenceConfig(entry_script = script_file, environment = env)

aci_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)

aci_service_name = 'automl-job-change'
print(aci_service_name)

automl-job-change


In [29]:
service = Model.deploy(ws, aci_service_name, [model], inference_config, aci_config)
service.wait_for_deployment(True)
print("State: " + service.state)
print("Scoring URI: " + service.scoring_uri)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.............................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
State: Healthy
Scoring URI: http://835fd430-24bf-452d-91fe-930a84454f98.southcentralus.azurecontainer.io/score


In [31]:
service.update(enable_app_insights=True)

In [30]:
%run endpoint.py

{"result": [0.0, 0.0]}


TODO: In the cell below, send a request to the web service you deployed to test it.

In [20]:
dataset.to_pandas_dataframe().head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,city_40,0.78,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,city_21,0.62,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,city_115,0.79,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,city_162,0.77,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [21]:
data_sample = dataset.to_pandas_dataframe().sample(3)
y_true = data_sample.pop('target')
sample_json = json.dumps({'data':data_sample.to_dict(orient='records')})
print(sample_json)

{"data": [{"city": "city_75", "city_development_index": 0.9390000000000001, "gender": null, "relevent_experience": "No relevent experience", "enrolled_university": "no_enrollment", "education_level": null, "major_discipline": null, "experience": "1", "company_size": null, "company_type": null, "last_new_job": "1", "training_hours": 75}, {"city": "city_97", "city_development_index": 0.925, "gender": "Male", "relevent_experience": "Has relevent experience", "enrolled_university": "no_enrollment", "education_level": "Masters", "major_discipline": "STEM", "experience": "8", "company_size": "10000+", "company_type": "Pvt Ltd", "last_new_job": "2", "training_hours": 124}, {"city": "city_103", "city_development_index": 0.92, "gender": "Male", "relevent_experience": "Has relevent experience", "enrolled_university": "no_enrollment", "education_level": "Phd", "major_discipline": "STEM", "experience": ">20", "company_size": "100-500", "company_type": "Funded Startup", "last_new_job": "3", "traini

In [22]:
output = service.run(sample_json)
print('Prediction: ', output)
print('True Values: ', y_true.values)

Prediction:  {"result": [0.0, 0.0, 0.0]}
True Values:  [0. 0. 0.]


In [23]:
service.get_logs()



TODO: In the cell below, print the logs of the web service and delete the service

In [24]:
service.delete()