## Loading Data

In [45]:
import pandas as pd

# Read the movie data and discard the 'id' column in a single expression,
# so `df` is only ever bound to the already-trimmed frame.
df = pd.read_csv('traintest.csv').drop(columns='id')

# Preview the first row as a quick check that the load worked
df.head(1)

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,overview,release_date,release_month,release_quarter,release_year,runtime,tagline,title,Is Christmas Movie,keywords
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Two employees at a gift shop can barely stand ...,1940-01-11,1.0,1.0,1940.0,99.0,"Quiet, Unassuming, Perfect",The Shop Around the Corner,1,"holiday,budapest,hungary,secret love,love,gift..."


## Get a Reference to Azure
In order to perform Machine Learning, we need a Machine Learning Workspace. We'll get it from a config file.

In [46]:
from azureml.core import (
    Dataset,
    Experiment,
    Model,
    Workspace,
)

# Build the workspace handle from the settings stored in config.json
ws = Workspace.from_config()

# Show which workspace we are connected to
ws.name

'2022-data-science-talks'

## Registering a DataSet on Azure

In [47]:
# Look up the default datastore attached to this ML workspace
datastore = ws.get_default_datastore()

# Display its name to confirm which datastore was returned
datastore.name

'workspaceblobstore'

In [48]:

# Upload the DataFrame to the datastore and register it as a named tabular dataset
ds = Dataset.Tabular.register_pandas_dataframe(
    dataframe=df,
    target=datastore,
    name='ChristmasMovies',
    description='Movies broken down by Christmas movies and non-Christmas movies',
)

# Show the registered dataset's name
ds.name

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/47b416d6-6fd4-4431-9141-a70e3f29c9c0/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


'ChristmasMovies'

## Create a Compute Resource
We'll need some compute resources to run the experiment

In [49]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name and maximum size of the cluster the experiment will run on
cluster_name = "Low-End-Cluster"
max_nodes = 4

# Reuse the cluster when it already exists; otherwise provision a new one
try:
    # Raises ComputeTargetException when no compute with this name exists
    cpu_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Provisioning cluster...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2DS_V4",
        vm_priority='lowpriority',
        min_nodes=0,
        max_nodes=max_nodes,
    )
    cpu_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Using existing compute: ' + cluster_name)

# Block until the cluster reports that it is ready
cpu_cluster.wait_for_completion(show_output=True)

Provisioning cluster...
InProgress.
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Create the Machine Learning Experiment
This will hold the runs of our experiment so we can track progress over time.

In [50]:
from azureml.core.experiment import Experiment

# Name under which the runs are grouped in the workspace
experiment_name = 'DieHard-AutoML'

# Get a handle to the experiment with that name in our workspace
experiment = Experiment(workspace=ws, name=experiment_name)

# Show the experiment's name as confirmation
experiment.name

'DieHard-AutoML'

## Submit the Experiment
Submitting the experiment asks Azure to run it on the compute cluster; the notebook then waits for the run to complete.

In [76]:
from azureml.train.automl import AutoMLConfig

# Collect every experiment setting in one dictionary, then hand it to AutoMLConfig
automl_settings = {
    # The machine learning task we're trying to accomplish
    'task': 'classification',
    # How we judge one model as better than another. AUC tends to be fairly balanced
    'primary_metric': 'AUC_weighted',
    # Our dataset of movies
    'training_data': ds,
    # The value we want to predict for future values
    'label_column_name': 'Is Christmas Movie',
    # Enable Deep Learning
    'enable_dnn': True,
    # The compute resource to use
    'compute_target': cpu_cluster,
    # Don't want more concurrent iterations than CPU nodes
    'max_concurrent_iterations': max_nodes,
    # The maximum number of minutes per individual run
    'iteration_timeout_minutes': 5,
    # Exclude XGBoost classifiers from the model search.
    # NOTE(review): presumably blocked because of the XGBClassifier
    # 'enable_categorical' prediction errors seen later in this notebook - confirm.
    'blocked_models': ['XGBoostClassifier'],
}

# Set up the experiment
automl_config = AutoMLConfig(**automl_settings)

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [77]:
from azureml.widgets import RunDetails

# Hand the AutoML configuration to Azure and keep a handle to the new run
run = experiment.submit(automl_config)

# Show the progress widget for the run
progress_widget = RunDetails(run)
progress_widget.show()

# Block until the run finishes; show_output=False keeps the log stream out of the cell
run.wait_for_completion(show_output=False)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
DieHard-AutoML,AutoML_0837072e-c1c4-48e6-b762-03932913f864,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Using default datastore for uploads
INFO:interpret_community.common.explanation_utils:Usin

{'runId': 'AutoML_0837072e-c1c4-48e6-b762-03932913f864',
 'target': 'Low-End-Cluster',
 'status': 'Completed',
 'startTimeUtc': '2022-08-30T22:34:35.008689Z',
 'endTimeUtc': '2022-08-30T23:11:41.185993Z',
 'services': {},
   'message': 'No scores improved over last 10 iterations, so experiment stopped early. This early stopping behavior can be disabled by setting enable_early_stopping = False in AutoMLConfig for notebook/python SDK runs.'}],
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'Low-End-Cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"ba12795b-3071-4b29-9f1e-252e0ce3211e\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azur

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

## Working with the Resulting Model
Now that we have a number of trained models, let's grab the best-performing one and save it locally for later deployment.

In [73]:
# get_output() hands back the best run together with its fitted model
best_auto_run, automl_model = run.get_output()

# Render the details widget for the best run
best_run_widget = RunDetails(best_auto_run)
best_run_widget.show()

Package:azureml-automl-runtime, training version:1.44.0, current version:1.40.0
Package:azureml-core, training version:1.44.0, current version:1.40.0
Package:azureml-dataprep, training version:4.2.2, current version:3.0.2
Package:azureml-dataprep-rslex, training version:2.8.1, current version:2.4.2
Package:azureml-dataset-runtime, training version:1.44.0, current version:1.40.0
Package:azureml-defaults, training version:1.44.0, current version:1.40.0
Package:azureml-inference-server-http, training version:0.7.4, current version:0.4.13
Package:azureml-interpret, training version:1.44.0, current version:1.40.0
Package:azureml-mlflow, training version:1.44.0, current version:1.40.0
Package:azureml-pipeline-core, training version:1.44.0, current version:1.40.0
Package:azureml-telemetry, training version:1.44.0, current version:1.40.0
Package:azureml-train-automl-client, training version:1.44.0, current version:1.40.0
Package:azureml-train-automl-runtime, training version:1.44.0, current ve

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [59]:
# Download the best run's files into the local 'automl-output' folder
best_auto_run.download_files(
    output_directory='automl-output',
)

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [56]:
# Register the best run's pickled model in the Azure ML workspace
model = best_auto_run.register_model(
    description='Predict whether or not a movie is a Christmas movie',
    model_path='outputs/model.pkl',
    model_name='ChristmasMovie-AutoML',
)

## Get Results

In [74]:
import pandas as pd

# Load the Die Hard record and strip the id and label columns, so only the
# columns the model is asked to score remain.
df_dieHard = pd.read_csv('DieHard.csv').drop(columns=['id', 'Is Christmas Movie'])

# Preview the record we are about to score
df_dieHard.head()

Unnamed: 0,Is Action,Is Adventure,Is Animation,Is Comedy,Is Crime,Is Documentary,Is Drama,Is Family,Is Fantasy,Is Foreign,...,adult,overview,release_date,release_month,release_quarter,release_year,runtime,tagline,title,keywords
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,"NYPD cop, John McClane's plan to reconcile wit...",1988-07-15,7.0,3.0,1988.0,131.0,40 Stories. Twelve Terrorists. One Cop.,Die Hard,"helicopter,journalist,based on novel,terrorist..."


In [60]:
import joblib

# Location of the model file inside the downloaded run output
model_file = './automl-output/outputs/model.pkl'

# Deserialize the model.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
final_model = joblib.load(model_file)

# Display the loaded pipeline
final_model

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=True, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='c:\\Dev\\DieHardSolver\\AutoML')),
                ('pr...
                 PreFittedSoftVotingClassifier(classification_labels=array([0, 1], dtype=int64), estimators=[('6', Pipeline(memory=None, steps=[('sparsenormalizer', Normalizer(copy=True, norm='l2')), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced', criterion='gini', max_depth=None, max_features='sqrt', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=0.01, min_samples_split=0.01, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=True, random_state=None,

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [66]:
%pip install -Iv xgboost==1.3.3

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


Using pip 20.2.3 from c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip (python 3.8)
Non-user install because site-packages writeable
Created temporary directory: C:\Users\Admin\AppData\Local\Temp\pip-ephem-wheel-cache-klsv6whi
Created temporary directory: C:\Users\Admin\AppData\Local\Temp\pip-req-tracker-rdiiu9gi
Initialized build tracking at C:\Users\Admin\AppData\Local\Temp\pip-req-tracker-rdiiu9gi
Created build tracker: C:\Users\Admin\AppData\Local\Temp\pip-req-tracker-rdiiu9gi
Entered build tracker: C:\Users\Admin\AppData\Local\Temp\pip-req-tracker-rdiiu9gi
Created temporary directory: C:\Users\Admin\AppData\Local\Temp\pip-install-n92r4x1r
1 location(s) to search for versions of xgboost:
* https://pypi.org/simple/xgboost/
Fetching project page and analyzing links: https://pypi.org/simple/xgboost/
Getting page https://pypi.org/simple/xgboost/
Found index url https://pypi.org/simple
Looking up "https://pypi.org/simple/xgboost/" in the cache
Request header h

ERROR: Could not install packages due to an EnvironmentError.
Consider using the `--user` option or check the permissions.
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip\_internal\commands\install.py", line 397, in run
    installed = install_given_reqs(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip\_internal\req\__init__.py", line 82, in install_given_reqs
    requirement.install(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip\_internal\req\req_install.py", line 814, in install
    install_wheel(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip\_internal\operations\install\wheel.py", line 852, in install_wheel
    _install_wheel(
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python38\lib\site-packages\pip\_internal\operations\install\wheel.py", line 689, in _install_wheel
    file.save()
  File "c:\

INFO:interpret_community.common.explanation_utils:Using default datastore for uploads


In [75]:
# Ask the AutoML model returned by the run to classify the Die Hard record
prediction = automl_model.predict(df_dieHard)

# Report the predicted label for the first row
print(f'Is Christmas Movie Prediction: {prediction[0]!s}')

PredictionException: PredictionException:
	Message: 'XGBClassifier' object has no attribute 'enable_categorical'
	InnerException: AttributeError: 'XGBClassifier' object has no attribute 'enable_categorical'
	ErrorResponse 
{
    "error": {
        "code": "SystemError",
        "message": "Encountered an internal AutoML error. Error Message/Code: PredictionException. Additional Info: PredictionException:\n\tMessage: 'XGBClassifier' object has no attribute 'enable_categorical'\n\tInnerException: None\n\tErrorResponse \n{\n    \"error\": {\n        \"message\": \"'XGBClassifier' object has no attribute 'enable_categorical'\",\n        \"target\": \"Xgboost\",\n        \"reference_code\": \"Xgboost\"\n    }\n}",
        "details_uri": "https://aka.ms/automltroubleshoot",
        "target": "Xgboost",
        "inner_error": {
            "code": "ClientError",
            "inner_error": {
                "code": "AutoMLInternal"
            }
        },
        "reference_code": "Xgboost"
    }
}

In [62]:
# Take the class probabilities for the first row of the Die Hard frame
probability = final_model.predict_proba(df_dieHard)[0]

# Report both entries as percentages with two decimals
print(f'Non-Christmas Movie Prediction: {probability[0] * 100:.2f}%')
print(f'Christmas Movie Prediction: {probability[1] * 100:.2f}%')


PredictionException: PredictionException:
	Message: 'XGBClassifier' object has no attribute 'enable_categorical'
	InnerException: AttributeError: 'XGBClassifier' object has no attribute 'enable_categorical'
	ErrorResponse 
{
    "error": {
        "code": "SystemError",
        "message": "Encountered an internal AutoML error. Error Message/Code: PredictionException. Additional Info: PredictionException:\n\tMessage: 'XGBClassifier' object has no attribute 'enable_categorical'\n\tInnerException: None\n\tErrorResponse \n{\n    \"error\": {\n        \"message\": \"'XGBClassifier' object has no attribute 'enable_categorical'\",\n        \"target\": \"Xgboost\",\n        \"reference_code\": \"Xgboost\"\n    }\n}",
        "details_uri": "https://aka.ms/automltroubleshoot",
        "target": "Xgboost",
        "inner_error": {
            "code": "ClientError",
            "inner_error": {
                "code": "AutoMLInternal"
            }
        },
        "reference_code": "Xgboost"
    }
}