
# Model Training

In [0]:
import pandas as pd
import warnings

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder

from databricks.automl_runtime.sklearn import OneHotEncoder
from databricks.automl_runtime.sklearn.column_selector import ColumnSelector
from databricks.feature_engineering import FeatureEngineeringClient
from databricks.feature_store import FeatureFunction, FeatureLookup

import lightgbm
from lightgbm import LGBMClassifier

import mlflow
from mlflow import pyfunc
from mlflow.models import Model, infer_signature, ModelSignature
from mlflow.pyfunc import PyFuncModel
from mlflow.types.utils import _infer_schema
from mlflow.exceptions import MlflowException

from hyperopt import hp, tpe, fmin, STATUS_OK, Trials

from shap import KernelExplainer, summary_plot

In [0]:
# Use the "databricks-uc" URI for the MLflow model registry.
mlflow.set_registry_uri("databricks-uc")

# Look up the user running this notebook; the experiment is stored under that
# user's workspace folder.
current_user = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .userName()
    .get()
)

# Experiment folder and experiment name, joined below into the full path.
xp_path = f"/Users/{current_user}/dbdemos_mlops"
xp_name = "dbdemos_mlops_churn_demo_experiment"

mlflow.set_experiment(xp_path + "/" + xp_name)

In [0]:
%%sql

-- Make `main` the current catalog.
use catalog main;

-- Make `dbdemos_mlops` the current schema within that catalog.
use schema dbdemos_mlops;

## Create Feature Specifications

In [0]:
# Label column name and the table that holds the labels.
label_col = "churn"
labels_df = spark.table("advanced_churn_label_table")

# Feature specification handed to the Feature Engineering client.
features = [
    # Lookup in the feature table, keyed on customer_id with transaction_ts
    # as the timestamp lookup key.
    FeatureLookup(
        table_name="advanced_churn_feature_table",
        lookup_key=["customer_id"],
        timestamp_lookup_key="transaction_ts",
    ),
    # Feature produced by the `avg_price_increase` UDF; each UDF argument is
    # bound to the column named on the right-hand side.
    FeatureFunction(
        udf_name="avg_price_increase",
        input_bindings=dict(
            monthly_charges_in="monthly_charges",
            tenure_in="tenure",
            total_charges_in="total_charges",
        ),
        output_name="avg_price_increase",
    ),
]

In [0]:
fe = FeatureEngineeringClient()

# Build the training set from the labels and the declared feature specs.
# `label_col` is reused here (instead of repeating the literal "churn") so the
# label name is defined in exactly one place for this and all later cells.
training_set_specs = fe.create_training_set(
    df=labels_df,
    label=label_col,
    feature_lookups=features,
    exclude_columns=["customer_id", "transaction_ts", "split"],
)

# Materialise the training set as a pandas DataFrame for scikit-learn.
df_loaded = training_set_specs.load_df().toPandas()

# Columns the model pipeline is allowed to consume; anything else in
# `df_loaded` is dropped by the column selector.
supported_cols = [
    "online_backup",
    "internet_service",
    "payment_method",
    "multiple_lines",
    "paperless_billing",
    "partner",
    "tech_support",
    "tenure",
    "contract",
    "avg_price_increase",
    "phone_service",
    "streaming_movies",
    "dependents",
    "senior_citizen",
    "num_optional_services",
    "device_protection",
    "monthly_charges",
    "total_charges",
    "streaming_tv",
    "gender",
    "online_security",
]

col_selector = ColumnSelector(supported_cols)

## Preprocessors

In [0]:
# --- Boolean features -------------------------------------------------------
# Cast to object, pass through the (empty) imputer stage, then one-hot encode
# while dropping the first level of each column.
bool_imputers = []  # empty: no imputer is configured for the boolean columns
bool_pipeline = Pipeline(steps=[
    ("cast_type", FunctionTransformer(lambda frame: frame.astype(object))),
    ("imputers", ColumnTransformer(bool_imputers, remainder="passthrough")),
    ("onehot", SklearnOneHotEncoder(handle_unknown="ignore", drop="first")),
])

# --- Numerical features -----------------------------------------------------
# Coerce to numeric (unparseable values become NaN), impute with the
# SimpleImputer default strategy, then standardise.
num_imputers = [
    (
        "impute_mean",
        SimpleImputer(),
        ["avg_price_increase", "monthly_charges", "num_optional_services", "tenure", "total_charges"],
    ),
]
numerical_pipeline = Pipeline(steps=[
    ("converter", FunctionTransformer(lambda frame: frame.apply(pd.to_numeric, errors="coerce"))),
    ("imputers", ColumnTransformer(num_imputers)),
    ("standardizer", StandardScaler()),
])

# --- Categorical features ---------------------------------------------------
# Pass through the (empty) imputer stage, then one-hot encode with the
# AutoML-runtime encoder using handle_unknown="indicator".
one_hot_imputers = []  # empty: no imputer is configured for the categorical columns
one_hot_pipeline = Pipeline(steps=[
    ("imputers", ColumnTransformer(one_hot_imputers, remainder="passthrough")),
    ("one_hot_encoder", OneHotEncoder(handle_unknown="indicator")),
])

# --- Column routing ---------------------------------------------------------
# Each entry is (name, pipeline, columns the pipeline is applied to).
bool_transformers = [
    (
        "boolean",
        bool_pipeline,
        ["gender", "phone_service", "dependents", "senior_citizen", "paperless_billing", "partner"],
    ),
]
numerical_transformers = [
    (
        "numerical",
        numerical_pipeline,
        ["monthly_charges", "total_charges", "avg_price_increase", "tenure", "num_optional_services"],
    ),
]
categorical_one_hot_transformers = [
    (
        "onehot",
        one_hot_pipeline,
        [
            "contract",
            "device_protection",
            "internet_service",
            "multiple_lines",
            "online_backup",
            "online_security",
            "payment_method",
            "streaming_movies",
            "streaming_tv",
            "tech_support",
        ],
    ),
]

# Combine the three branches; sparse_threshold=0 is passed so the combined
# output is returned dense, and any column not routed above is passed through.
preprocessor = ColumnTransformer(
    transformers=[
        *bool_transformers,
        *numerical_transformers,
        *categorical_one_hot_transformers,
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

### Train - Validation - Test Split

In [0]:
# First split: keep 60% of the rows for training and set 40% aside,
# stratified on the label so class proportions are preserved.
X_train, X_eval, y_train, y_eval = train_test_split(
    df_loaded.drop(columns=[label_col]),
    df_loaded[label_col],
    stratify=df_loaded[label_col],
    test_size=0.4,
    random_state=42,
)

# Second split: divide the held-out 40% evenly into validation and test sets,
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_eval,
    y_eval,
    stratify=y_eval,
    test_size=0.5,
    random_state=42,
)

### Train the Classification Model

In [0]:
# Disable sklearn autologging while the preprocessing-only pipeline is fitted.
mlflow.sklearn.autolog(disable=True)

# Fit column selection + preprocessing on the training data only, then use it
# to transform the validation features. The transformed validation set is what
# the classifier receives as its early-stopping eval_set inside `objective`.
pipeline_val = Pipeline(steps=[
    ("column_selector", col_selector),
    ("preprocessor", preprocessor),
]).fit(X_train, y_train)

X_val_processed = pipeline_val.transform(X_val)


def _evaluate_split(pyfunc_model, X, y, metric_prefix):
  """Evaluate the model on one data split with `mlflow.evaluate`.

  Args:
    pyfunc_model: PyFunc-wrapped model handed to `mlflow.evaluate`.
    X: pandas DataFrame with the features of the split.
    y: labels of the split; attached to `X` as the `label_col` column.
    metric_prefix: value passed as `metric_prefix` in the evaluator config
      (e.g. "val_").

  Returns:
    The `metrics` mapping of the evaluation result.
  """
  eval_result = mlflow.evaluate(
      model=pyfunc_model,
      data=X.assign(**{str(label_col): y}),
      targets=label_col,
      model_type="classifier",
      evaluator_config={
          "log_model_explainability": False,
          "metric_prefix": metric_prefix,
          "pos_label": "Yes",
      },
  )
  return eval_result.metrics


def _strip_metric_prefix(metrics, metric_prefix):
  """Return a copy of `metrics` with a leading `metric_prefix` removed from each key."""
  return {
      (key[len(metric_prefix):] if key.startswith(metric_prefix) else key): value
      for key, value in metrics.items()
  }


def objective(params):
  """Hyperopt objective: train, log and evaluate one LightGBM pipeline.

  Args:
    params: keyword arguments forwarded to `LGBMClassifier`.

  Returns:
    A dict with:
      - "loss": negative validation F1 score (hyperopt minimises the loss),
      - "status": `STATUS_OK`,
      - "val_metrics" / "test_metrics": metrics with their prefix removed,
      - "model": the fitted sklearn `Pipeline`,
      - "run": the MLflow run the trial was logged to.
  """
  with mlflow.start_run(run_name="mlops_best_run") as mlflow_run:
    lgbmc_classifier = LGBMClassifier(**params)

    model = Pipeline([
        ("column_selector", col_selector),
        ("preprocessor", preprocessor),
        ("classifier", lgbmc_classifier),
    ])

    # Autolog the fit, but not the model itself: the model is logged
    # explicitly below through the Feature Engineering client.
    mlflow.sklearn.autolog(
        log_input_examples=True,
        log_models=False,
        silent=True)

    # The eval_set goes straight to the classifier step, so it must already be
    # preprocessed; early stopping uses a patience of 5 rounds.
    model.fit(
        X_train,
        y_train,
        classifier__callbacks=[lightgbm.early_stopping(5), lightgbm.log_evaluation(0)],
        classifier__eval_set=[(X_val_processed, y_val)],
    )

    # Infer the output schema from the training labels; fall back to no schema
    # (with a warning) when inference fails.
    try:
      output_schema = _infer_schema(y_train)
    except Exception as e:
      warnings.warn(f"Could not infer model output schema: {e}")
      output_schema = None

    # use the Feature Engineering client to log the model
    # this logs the feature specifications along with the model,
    # allowing it to be used at inference time to retrieve features
    fe.log_model(
        model=model,
        artifact_path="model",
        flavor=mlflow.sklearn,
        training_set=training_set_specs,
        output_schema=output_schema,
        extra_pip_requirements=["databricks-feature-lookup>=1.5.0"]
    )

    # Wrap the in-memory pipeline as a PyFunc model so it can be evaluated
    # without being reloaded from the logged artifact.
    mlflow_model = Model()
    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.sklearn")
    pyfunc_model = PyFuncModel(model_meta=mlflow_model, model_impl=model)

    # log metrics for the training, validation and test set
    # (the training metrics are only logged, not returned)
    _evaluate_split(pyfunc_model, X_train, y_train, "training_")
    lgbmc_val_metrics = _evaluate_split(pyfunc_model, X_val, y_val, "val_")
    lgbmc_test_metrics = _evaluate_split(pyfunc_model, X_test, y_test, "test_")

    loss = -lgbmc_val_metrics["val_f1_score"]

    # truncate metric key names so they can be displayed together;
    # only the leading prefix is removed
    lgbmc_val_metrics = _strip_metric_prefix(lgbmc_val_metrics, "val_")
    lgbmc_test_metrics = _strip_metric_prefix(lgbmc_test_metrics, "test_")

    return {
      "loss": loss,
      "status": STATUS_OK,
      "val_metrics": lgbmc_val_metrics,
      "test_metrics": lgbmc_test_metrics,
      "model": model,
      "run": mlflow_run,
    }

### Configure the hyperparameter search space

In [0]:
# define the hyperparameter search space
space = {
  "colsample_bytree": 0.4120544919020157,
  "lambda_l1": 2.6616074270114995,
  "lambda_l2": 514.9224373768443,
  "learning_rate": 0.0778497372371143,
  "max_bin": 229,
  "max_depth": 9,
  "min_child_samples": 66,
  "n_estimators": 250,
  "num_leaves": 100,
  "path_smooth": 61.06596877554017,
  "subsample": 0.6965257092078714,
  "random_state": 42,
}

In [0]:
# Run the optimisation for a single evaluation; `trials` records the result
# dict returned by `objective`.
trials = Trials()
fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1,
    trials=trials,
)

# Pull the fitted pipeline and its MLflow run out of the best trial.
best_result = trials.best_trial["result"]
model, mlflow_run = best_result["model"], best_result["run"]

# Show validation and test metrics side by side, one row per split.
display(
    pd.DataFrame(
        data=[best_result["val_metrics"], best_result["test_metrics"]],
        index=["validation", "test"],
    )
)

# Render the fitted pipeline as a diagram (last expression of the cell).
set_config(display="diagram")
model

### Feature Importances

In [0]:
# Toggle for the SHAP analysis below.
shap_enabled = True
if shap_enabled:
  # Disable autologging while the explainer runs.
  mlflow.autolog(disable=True)
  mlflow.sklearn.autolog(disable=True)

  def predict(x):
    """Rebuild a DataFrame with the training column names and return class probabilities."""
    return model.predict_proba(pd.DataFrame(x, columns=X_train.columns))

  # Most frequent value of every training column, used to fill missing values
  # in the sampled rows.
  mode = X_train.mode().iloc[0]

  # At most 100 training rows as explainer background data, and at most 100
  # validation rows to explain.
  train_sample = X_train.sample(n=min(100, len(X_train)), random_state=42).fillna(mode)
  example = X_val.sample(n=min(100, len(X_val)), random_state=42).fillna(mode)

  explainer = KernelExplainer(predict, train_sample, link="logit")
  shap_values = explainer.shap_values(example, l1_reg=False, nsamples=100)
  summary_plot(shap_values, example, class_names=model.classes_)

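### Confusion matrix, ROC, and Precision-Recall curves for validation

The plots are not rendered in this notebook; follow the link below to open the MLflow run page, where the evaluation artifacts of the run are stored.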

In [0]:
# Render an HTML link to the "model" artifact path of the best trial's MLflow run
# (built from the run's experiment id and run id).
displayHTML(f"<a href=#mlflow/experiments/{mlflow_run.info.experiment_id}/runs/{ mlflow_run.info.run_id }/artifactPath/model> Link to model run page </a>")