The purpose of this notebook is to tune the hyperparameters associated with our candidate models to arrive at an optimum configuration.  It should be run on a cluster leveraging Databricks ML 7.1+ and **GPU-based** nodes.

###Load & Transform Data

To get started, we'll re-load our data, applying transformations to features to address issues related to missing data, categorical values & feature standardization.  This step is a repeat of work introduced and explained in the last notebook:

In [0]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import average_precision_score
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

from hyperopt import hp, fmin, tpe, SparkTrials, STATUS_OK, space_eval
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

import numpy as np

import time

In [0]:
# retrieve training & testing data

# activity features taken from the *_act_features tables (alias b in the query below)
activity_features = [
  'days_total',
  'days_with_session',
  'ratio_days_with_session_to_days',
  'days_after_exp',
  'days_after_exp_with_session',
  'ratio_days_after_exp_with_session_to_days_after_exp',
  'sessions_total',
  'ratio_sessions_total_to_days_total',
  'ratio_sessions_total_to_days_with_session',
  'sessions_total_after_exp',
  'ratio_sessions_total_after_exp_to_days_after_exp',
  'ratio_sessions_total_after_exp_to_days_after_exp_with_session',
  'seconds_total',
  'ratio_seconds_total_to_days_total',
  'ratio_seconds_total_to_days_with_session',
  'seconds_total_after_exp',
  'ratio_seconds_total_after_exp_to_days_after_exp',
  'ratio_seconds_total_after_exp_to_days_after_exp_with_session',
  'number_uniq',
  'ratio_number_uniq_to_days_total',
  'ratio_number_uniq_to_days_with_session',
  'number_uniq_after_exp',
  'ratio_number_uniq_after_exp_to_days_after_exp',
  'ratio_number_uniq_after_exp_to_days_after_exp_with_session',
  'number_total',
  'ratio_number_total_to_days_total',
  'ratio_number_total_to_days_with_session',
  'number_total_after_exp',
  'ratio_number_total_after_exp_to_days_after_exp',
  'ratio_number_total_after_exp_to_days_after_exp_with_session'
  ]

def load_dataset(dataset):
  '''
  Assemble the labeled feature set for one dataset and return it as a pandas DataFrame.

  dataset: table-name prefix, either 'train' or 'test'.

  The query inner-joins kkbox.<dataset>_trans_features (all columns),
  kkbox.<dataset>_act_features (the columns listed in activity_features) and
  kkbox.<dataset> (the is_churn label) on msno.
  '''
  # one "b.<column>" entry per line, laid out like the rest of the SELECT list
  activity_columns = ',\n    '.join('b.' + name for name in activity_features)
  query = '''
  SELECT
    a.*,
    {activity_columns},
    c.is_churn
  FROM kkbox.{dataset}_trans_features a
  INNER JOIN kkbox.{dataset}_act_features b
    ON a.msno=b.msno
  INNER JOIN kkbox.{dataset} c
    ON a.msno=c.msno
  '''.format(activity_columns=activity_columns, dataset=dataset)
  return spark.sql(query).toPandas()

train = load_dataset('train')
test = load_dataset('test')

# split into features and labels (msno is the join key and is_churn the label, so neither is a feature)
X_train_raw = train.drop(['msno','is_churn'], axis=1)
y_train = train['is_churn']

X_test_raw = test.drop(['msno','is_churn'], axis=1)
y_test = test['is_churn']

In [0]:
# preview the raw (untransformed) training features
display(X_train_raw)

start_year,start_month,subscription_age,renewals,total_list_price,total_amount_paid,total_discount,days_since_last_account_action,last_plan_list_price,last_actual_amount_paid,last_discount,last_payment_plan_days,last_payment_method,last_is_cancel,last_is_auto_renew,last_change_in_list_price,last_change_in_discount,last_change_in_payment_plan_days,last_change_in_payment_method_id,last_change_in_cancellation,last_change_in_auto_renew,last_days_change_in_membership_expire_date,days_until_expiration,total_subscription_count,city,bd,gender,registered_via,days_total,days_with_session,ratio_days_with_session_to_days,days_after_exp,days_after_exp_with_session,ratio_days_after_exp_with_session_to_days_after_exp,sessions_total,ratio_sessions_total_to_days_total,ratio_sessions_total_to_days_with_session,sessions_total_after_exp,ratio_sessions_total_after_exp_to_days_after_exp,ratio_sessions_total_after_exp_to_days_after_exp_with_session,seconds_total,ratio_seconds_total_to_days_total,ratio_seconds_total_to_days_with_session,seconds_total_after_exp,ratio_seconds_total_after_exp_to_days_after_exp,ratio_seconds_total_after_exp_to_days_after_exp_with_session,number_uniq,ratio_number_uniq_to_days_total,ratio_number_uniq_to_days_with_session,number_uniq_after_exp,ratio_number_uniq_after_exp_to_days_after_exp,ratio_number_uniq_after_exp_to_days_after_exp_with_session,number_total,ratio_number_total_to_days_total,ratio_number_total_to_days_with_session,number_total_after_exp,ratio_number_total_after_exp_to_days_after_exp,ratio_number_total_after_exp_to_days_after_exp_with_session
2015,1,731,24,3347,3476,-129,19,149,149,0,30,41,0,1,0,0,0,0,0,0,31,-12,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,4,274,9,891,891,0,9,99,99,0,30,41,0,1,0,0,0,0,0,0,31,-22,1,,,,,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,1,731,24,3576,3576,0,16,149,149,0,30,41,0,1,0,0,0,0,0,0,31,-15,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,8,153,5,495,495,0,20,99,99,0,30,41,0,1,0,0,0,0,0,0,31,-11,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,1,730,23,2558,2587,-29,28,99,99,0,30,41,0,1,0,0,0,0,0,0,31,-3,1,5.0,47.0,1.0,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,3,677,23,3128,3108,20,18,149,149,0,30,41,0,1,0,0,0,0,0,0,31,-13,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,2,335,11,1089,1089,0,28,99,99,0,30,41,0,1,0,0,0,0,0,0,31,-3,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,5,245,8,1192,1192,0,15,149,149,0,30,40,0,1,0,0,0,0,0,0,31,-15,1,,,,,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2017,1,0,0,0,0,0,27,99,99,0,30,41,0,1,0,0,0,0,-1,1,0,-4,2,1.0,,,7.0,27,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,7,184,6,894,894,0,28,149,149,0,30,40,0,1,0,0,0,0,0,0,31,-23,1,22.0,33.0,0.0,4.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [0]:
# list the column names of the raw test features
X_test_raw.columns

In [0]:
# preview the raw (untransformed) test features
display(X_test_raw)

start_year,start_month,subscription_age,renewals,total_list_price,total_amount_paid,total_discount,days_since_last_account_action,last_plan_list_price,last_actual_amount_paid,last_discount,last_payment_plan_days,last_payment_method,last_is_cancel,last_is_auto_renew,last_change_in_list_price,last_change_in_discount,last_change_in_payment_plan_days,last_change_in_payment_method_id,last_change_in_cancellation,last_change_in_auto_renew,last_days_change_in_membership_expire_date,days_until_expiration,total_subscription_count,city,bd,gender,registered_via,days_total,days_with_session,ratio_days_with_session_to_days,days_after_exp,days_after_exp_with_session,ratio_days_after_exp_with_session_to_days_after_exp,sessions_total,ratio_sessions_total_to_days_total,ratio_sessions_total_to_days_with_session,sessions_total_after_exp,ratio_sessions_total_after_exp_to_days_after_exp,ratio_sessions_total_after_exp_to_days_after_exp_with_session,seconds_total,ratio_seconds_total_to_days_total,ratio_seconds_total_to_days_with_session,seconds_total_after_exp,ratio_seconds_total_after_exp_to_days_after_exp,ratio_seconds_total_after_exp_to_days_after_exp_with_session,number_uniq,ratio_number_uniq_to_days_total,ratio_number_uniq_to_days_with_session,number_uniq_after_exp,ratio_number_uniq_after_exp_to_days_after_exp,ratio_number_uniq_after_exp_to_days_after_exp_with_session,number_total,ratio_number_total_to_days_total,ratio_number_total_to_days_with_session,number_total_after_exp,ratio_number_total_after_exp_to_days_after_exp,ratio_number_total_after_exp_to_days_after_exp_with_session
2015,1,762,25,3496,3625,-129,16,149,149,0,30,41,0,1,0,0,0,0,0,0,28,-12,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,4,305,10,990,990,0,6,99,99,0,30,41,0,1,0,0,0,0,0,0,28,-22,1,,,,,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2017,2,0,0,0,0,0,6,100,100,0,30,41,0,1,0,0,0,0,-1,1,0,-21,1,1.0,,,7.0,6,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,1,762,25,3725,3725,0,13,149,149,0,30,41,0,1,0,0,0,0,0,0,28,-15,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,8,184,6,594,594,0,17,99,99,0,30,41,0,1,0,0,0,0,0,0,28,-11,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,1,761,24,2657,2686,-29,25,99,99,0,30,41,0,1,0,0,0,0,0,0,28,-3,1,5.0,47.0,1.0,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2015,3,708,24,3277,3257,20,15,149,149,0,30,41,0,1,0,0,0,0,0,0,28,-13,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,2,366,12,1188,1188,0,25,99,99,0,30,41,0,1,0,0,0,0,0,0,28,-3,1,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2016,5,276,9,1341,1341,0,12,149,149,0,30,40,0,1,0,0,0,0,0,0,28,-15,1,,,,,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
2017,1,31,1,99,99,0,24,99,99,0,30,41,0,1,0,0,0,0,0,0,28,-4,2,1.0,,,7.0,30,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0


In [0]:
# columns whose missing entries are filled with the column's most frequent value;
# the first four are treated as categoricals by the encoding step below
imputed_columns = ['last_payment_method', 'city', 'gender', 'registered_via', 'bd']
mode_imputer = SimpleImputer(strategy='most_frequent')

# step 1: replace missing values, passing every other column through unchanged
impute = ColumnTransformer(
  transformers=[('missing values', mode_imputer, imputed_columns)],
  remainder='passthrough'
  )

# step 2: one-hot encode the categoricals and standardize everything else
# features 0 through 3 should be the first four features imputed in the previous step
categorical_positions = slice(0,4)
categorical_encoder = OneHotEncoder(categories='auto', drop='first')
encode_scale = ColumnTransformer(
  transformers=[('ohe categoricals', categorical_encoder, categorical_positions)],
  remainder=StandardScaler()
  )

# chain both steps into a single transformation pipeline
transform = Pipeline(steps=[
  ('impute', impute),
  ('encode_scale', encode_scale)
  ])

# fit on the training features only, then apply the fitted transformations to the test features
X_train = transform.fit_transform(X_train_raw)
X_test = transform.transform(X_test_raw)

The first part of the model evaluation function retrieves from memory replicated copies of our training and testing feature and label sets.  Our intent is to leverage SparkTrials in combination with hyperopt to parallelize the training of models across a Spark cluster, allowing us to perform multiple, simultaneous model training evaluation runs and reduce the overall time required to navigate the search space.  By replicating our datasets to the worker nodes of the cluster, a task performed in the next cell, copies of the data needed for training and evaluation can be efficiently made available to the function with minimal networking overhead:

**NOTE** See the Distributed Hyperopt [best practices documentation](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/hyperopt-best-practices.html#handle-datasets-of-different-orders-of-magnitude-notebook) for more options for data distribution.

In [0]:
# broadcast the transformed feature sets and their labels, in the order train/test features then train/test labels
X_train_broadcast, X_test_broadcast, y_train_broadcast, y_test_broadcast = (
  sc.broadcast(dataset) for dataset in (X_train, X_test, y_train, y_test)
  )

The hyperparameter values delivered to the function by hyperopt are derived from a search space defined in the next cell.  Each hyperparameter in the search space is defined using an item in a dictionary, the name of which identifies the hyperparameter and the value of which defines a range of potential values for that parameter.  When defined using *hp.choice*, a parameter is selected from a predefined list of values.  When defined using *hp.loguniform*, values are drawn from a continuous range such that the logarithm of the value is uniformly distributed.  When defined using *hp.quniform*, values are drawn from a continuous range but rounded to multiples of the step size given by the third argument in the range definition.  Hyperparameter search spaces in hyperopt may be defined in many other ways as indicated by the library's [online documentation](https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions):

In [0]:
# define minimum positive class scale factor (as shown in previous notebook):
# the ratio of the second class's balanced weight to the first class's balanced weight
class_labels = np.unique(y_train)
weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
scale = weights[1]/weights[0]

def _loguniform(name, low, high):
  '''Return an hp.loguniform parameter called name whose bounds low and high are given on the natural (un-logged) scale.'''
  return hp.loguniform(name, np.log(low), np.log(high))

# define hyperopt search space
search_space = {
    # depth of trees (preference is for shallow trees or even stumps (max_depth=1))
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    # learning rate for XGBoost
    'learning_rate': _loguniform('learning_rate', 0.01, 0.40),
    # minimum loss reduction required to make a further partition on a leaf node
    'gamma': hp.quniform('gamma', 0.0, 1.0, 0.001),
    # minimum number of instances per node
    'min_child_weight': hp.quniform('min_child_weight', 1, 20, 1),
    # random selection of rows for training
    'subsample': _loguniform('subsample', 0.1, 1.0),
    # proportion of columns to use per tree
    'colsample_bytree': _loguniform('colsample_bytree', 0.1, 1.0),
    # proportion of columns to use per level
    'colsample_bylevel': _loguniform('colsample_bylevel', 0.1, 1.0),
    # proportion of columns to use per node
    'colsample_bynode': _loguniform('colsample_bynode', 0.1, 1.0),
    # weight to assign positive label to manage imbalance
    'scale_pos_weight': _loguniform('scale_pos_weight', scale, scale * 10),
    }

With our tuning exercise over, let's go ahead and release the replicated copies of our features and labels datasets.  This will take pressure off our cluster resources as we move forward:

Now we can examine the hyperparameter values arrived at by hyperopt:

In [0]:
# define list to hold run ids for later retrieval;
# each final training run appends a (run_name, run_id) tuple to it
run_ids = []

### Train MLPClassifier Model

Using the same techniques as shown in the last step (but omitted here for brevity), we've identified an optimal set of parameters for the training of the HistGradientBoostingClassifier model.  We can now perform a final training run for this model:

Having done the same for our neural network, we will train it now:

In [0]:
# optimal param settings
# hidden_layer_1, hidden_layer_2 and hidden_layer_cutoff are not MLPClassifier arguments;
# they are folded into a single hidden_layer_sizes value below
params = {
  'activation': 'logistic',
  'hidden_layer_1': 100.0,
  'hidden_layer_2': 35.0,
  'hidden_layer_cutoff': 15,
  'learning_rate': 'adaptive',
  'learning_rate_init': 0.3424456484117518,
  'solver': 'sgd'
   }

# train model based on these params
with mlflow.start_run(run_name='MLP Final Model') as run:
  
  # capture run info for later use
  run_id = run.info.run_id
  run_name = run.data.tags['mlflow.runName']
  run_ids += [(run_name, run_id)]
  
  # log the settings as tuned, before they are reshaped for the estimator
  mlflow.log_params(params)
  
  # hidden layer definitions: the second layer is only used when its size exceeds the cutoff
  hidden_layer_1 = int(params.pop('hidden_layer_1'))
  hidden_layer_2 = int(params.pop('hidden_layer_2'))
  hidden_layer_cutoff = int(params.pop('hidden_layer_cutoff'))
  if hidden_layer_2 > hidden_layer_cutoff:
    hidden_layer_sizes = (hidden_layer_1, hidden_layer_2)
  else:
    # trailing comma makes this a one-element tuple; (hidden_layer_1) alone is just an int
    hidden_layer_sizes = (hidden_layer_1,)
  params['hidden_layer_sizes']=hidden_layer_sizes
  
  # train
  model = MLPClassifier(max_iter=10, **params)
  model.fit(X_train, y_train)
  mlflow.sklearn.log_model(model, 'model')
  
  # predict
  y_prob = model.predict_proba(X_test)
  
  # score using the probabilities in column 1
  model_ap = average_precision_score(y_test, y_prob[:,1])
  mlflow.log_metric('avg precision', model_ap)
  
  print('Model logged under run_id "{0}" with AP Score of {1:.5f}'.format(run_id, model_ap))

In [0]:
# for each final training run, retrieve its model from mlflow as a (run_name, model) pair
# the comprehension keeps its loop names local, so the notebook-level run_id string is not overwritten by a tuple
models = [
  (logged_run_name, mlflow.sklearn.load_model('runs:/{0}/model'.format(logged_run_id)))
  for logged_run_name, logged_run_id in run_ids
  ]

models

In [0]:
# show the trained estimator and its settings
print(model)

###Step 5: Persist Model Pipeline

We now have an optimized model, trained and persisted.  However, it is built to expect pre-transformed data.  If we consider how we will use this model to make predictions for the business, it would be helpful to combine our data transformation steps, addressed at the top of this notebook, with our model so that untransformed feature data can be passed directly to it.  To tackle this, we'll take our ColumnTransformers defined earlier and our MLPClassifier model trained in the last step and combine them into a unified model pipeline:

In [0]:
# assemble pipeline: the two transformation steps defined earlier followed by the trained model
pipeline_steps = [
  ('impute', impute),
  ('encode_scale', encode_scale),
  ('MLPClassifier', model),
  ]
model_pipeline = Pipeline(steps=pipeline_steps)

In [0]:
# confirm the object type of the assembled pipeline
type(model_pipeline)

When defining a pipeline, we'd typically call the *fit()* method on it to train each of the steps, but each step has already been trained at various points in this notebook so that we can move directly into predictions.  To verify our pipeline works as expected, let's pass it our **raw** test data and calculate our evaluation metric to verify it is the same as observed in the last step:

In [0]:
# predict class probabilities directly from the raw (untransformed) test features
y_prob = model_pipeline.predict_proba(X_test_raw)

# score using the probabilities in column 1
second_class_prob = y_prob[:,1]
model_ap = average_precision_score(y_test, second_class_prob)

print('AP score: {0:.5f}'.format(model_ap))

Everything looks good.  We're just about ready to save this model for later reuse, but there's one last challenge we need to overcome.

We will be saving this model via mlflow and possibly registering it in Spark as a pandas UDF.  The default deployment of such a function in mlflow maps the pandas UDF to the model's *predict()* method.  As you may remember in our last notebook, the *predict()* method returns a class prediction of 0 or 1 based on a 50% probability threshold.  If we want to ensure our model returns the actual positive class probability when registered with any of the mlflow serving mechanisms, we'll need to write a custom wrapper that overrides the *predict()* method:

In [0]:
# shamelessly stolen from https://docs.databricks.com/_static/notebooks/mlflow/mlflow-end-to-end-example-aws.html

class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
  '''
  pyfunc model whose predict() returns column 1 of the wrapped model's
  predict_proba() output rather than the wrapped model's class predictions.
  '''
  
  def __init__(self, model):
    # model: any object exposing predict_proba()
    self.model = model
    
  def predict(self, context, model_input):
    # context is accepted as part of the predict signature but is not used here
    class_probabilities = self.model.predict_proba(model_input)
    return class_probabilities[:,1]

Now we can persist our model, making sure we include our custom wrapper in the call:

In [0]:
with mlflow.start_run(run_name='Final Pipeline Model') as run:
  
  run_id = run.info.run_id
  
  # record the score with this model
  mlflow.log_metric('avg precision', model_ap)
  
  # persist the complete pipeline (transformations + model) using the sklearn flavor
  # NOTE(review): SklearnModelWrapper is not applied here, so serving this artifact goes through
  # the pipeline's predict() (class labels) rather than positive-class probabilities - confirm this is intended
  mlflow.sklearn.log_model(model_pipeline, 'model')
  
# model_ap holds an average precision score, so report it as such
print('Model logged under run_id "{0}" with AP Score of {1:.5f}'.format(run_id, model_ap))

In [0]:
# display the id of the run under which the final pipeline model was logged
run_id

In [0]:
# confirm the object type of the pipeline that was logged
type(model_pipeline)

In [0]:
# reload the pipeline logged under run_id from the run's 'model' artifact
logged_model_uri = "runs:/"+run_id+"/model"
modelLoaded = mlflow.sklearn.load_model(logged_model_uri)

In [0]:
# display the reloaded pipeline
modelLoaded

###Register the model in Azure Machine Learning

In [0]:
import mlflow.azureml
from azureml.core import Workspace
from azureml.core.webservice import AciWebservice, Webservice

# Load or create an Azure ML Workspace
# replace the placeholder values below with your own settings
workspace_name = "NAME OF YOUR AML WORKSPACE"
resource_group = "YOUR RESOURCE GROUP"
location = "YOUR REGION"

# the subscription id is read from a Databricks secret scope instead of being hard-coded
subscription_id =  dbutils.secrets.get(scope="YOUR SECRET",key="YOUR SUBSCRIPTION ID")

workspace_settings = dict(
  name=workspace_name,
  subscription_id=subscription_id,
  resource_group=resource_group,
  location=location,
  create_resource_group=True,
  exist_ok=True
  )
azure_workspace = Workspace.create(**workspace_settings)

In [0]:
# Azure Container Instances deployment configuration, created with no arguments
webservice_deployment_config = AciWebservice.deploy_configuration()

In [0]:
# deploy the pipeline logged under run_id as an Azure ML web service
# the second returned value is bound to azure_model so that it does not overwrite `model`,
# the trained sklearn estimator that earlier cells use when they are re-run
(webservice, azure_model) = mlflow.azureml.deploy(model_uri="runs:/"+run_id+"/model",
                                                  workspace=azure_workspace,
                                                  model_name='YOUR MODEL NAME',
                                                  service_name='YOUR SERVICE NAME',
                                                  deployment_config=webservice_deployment_config)

In [0]:
# show the endpoint to which scoring requests are posted
print("scoring uri is {0}".format(webservice.scoring_uri))

In [0]:
import requests
import json

# feature names, listed in the same order as the columns of the raw feature sets
feature_columns = [
   "start_year", "start_month", "subscription_age", "renewals",
   "total_list_price", "total_amount_paid", "total_discount",
   "days_since_last_account_action", "last_plan_list_price",
   "last_actual_amount_paid", "last_discount", "last_payment_plan_days",
   "last_payment_method", "last_is_cancel", "last_is_auto_renew",
   "last_change_in_list_price", "last_change_in_discount",
   "last_change_in_payment_plan_days", "last_change_in_payment_method_id",
   "last_change_in_cancellation", "last_change_in_auto_renew",
   "last_days_change_in_membership_expire_date", "days_until_expiration",
   "total_subscription_count", "city", "bd", "gender", "registered_via",
   "days_total", "days_with_session", "ratio_days_with_session_to_days",
   "days_after_exp", "days_after_exp_with_session",
   "ratio_days_after_exp_with_session_to_days_after_exp", "sessions_total",
   "ratio_sessions_total_to_days_total",
   "ratio_sessions_total_to_days_with_session", "sessions_total_after_exp",
   "ratio_sessions_total_after_exp_to_days_after_exp",
   "ratio_sessions_total_after_exp_to_days_after_exp_with_session",
   "seconds_total", "ratio_seconds_total_to_days_total",
   "ratio_seconds_total_to_days_with_session", "seconds_total_after_exp",
   "ratio_seconds_total_after_exp_to_days_after_exp",
   "ratio_seconds_total_after_exp_to_days_after_exp_with_session",
   "number_uniq", "ratio_number_uniq_to_days_total",
   "ratio_number_uniq_to_days_with_session", "number_uniq_after_exp",
   "ratio_number_uniq_after_exp_to_days_after_exp",
   "ratio_number_uniq_after_exp_to_days_after_exp_with_session",
   "number_total", "ratio_number_total_to_days_total",
   "ratio_number_total_to_days_with_session", "number_total_after_exp",
   "ratio_number_total_after_exp_to_days_after_exp",
   "ratio_number_total_after_exp_to_days_after_exp_with_session"
]

# a single example record, one value per entry in feature_columns
feature_rows = [
   [2015,1,762,25,3496,3625,-129, 16, 149,149,0,30,41,0,1,0,0,0,0,0,0,28,-12,1,1,47,1,7,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
]

# `sample_input` is a JSON-serialized pandas DataFrame with the `split` orientation
sample_input = {
    "columns": feature_columns,
    "data": feature_rows
}

# post the record to the deployed web service
request_body = json.dumps(sample_input)
request_headers = {"Content-type": "application/json"}
response = requests.post(
              url=webservice.scoring_uri,
              data=request_body,
              headers=request_headers)

# show the decoded response body, then the response object itself
response_json = json.loads(response.text)
print(response_json)
print(response)

In [0]:
from azureml.core import Webservice

# look the deployed service up by its service name (the value passed as service_name at deployment),
# not by the workspace name, and print its logs
service = Webservice(azure_workspace, 'YOUR SERVICE NAME')
print(service.get_logs())