# Model Training (and tracking)
We are going to train three models and track each of the experiment runs using MLflow!

### Import required libraries

In [1]:
# Install imblearn for SMOTE using pip
%pip install imblearn
import mlflow
import pandas as pd
from collections import Counter
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

StatementMeta(, , , Finished, )

InvalidHttpRequestToLivy: [TooManyRequestsForCapacity] This spark job can't be run because you have hit a spark compute or API rate limit. To run this spark job, cancel an active Spark job through the Monitoring hub, choose a larger capacity SKU, or try again later. HTTP status code: 430 {Learn more} HTTP status code: 430.

### Read the Data & Create Experiment
We will start by reading the data we have 'wrangled' from the delta table, and creating an MLflow experiment

In [2]:
# Notebook-wide constants: the random seed reused by the later split/resampling
# cells, and the name of the MLflow experiment.
SEED = 12345
EXPERIMENT_NAME = "bank-churn-experiment"

# Load the wrangled delta table through Spark and collect it into a pandas DataFrame.
df_clean = (
    spark.read
    .format("delta")
    .load("Tables/df_clean")
    .toPandas()
)

# Selecting the experiment is best-effort: a failure is printed and the cell
# carries on to enable autologging regardless.
try:
    mlflow.set_experiment(EXPERIMENT_NAME)
except Exception as exc:
    print(exc)

# exclusive=False is passed through to MLflow's autolog configuration unchanged.
mlflow.autolog(exclusive=False)

print("Complete")

StatementMeta(, 1b1d96a6-3a82-4376-9f35-d0531b1e07a6, 10, Finished, Available)

2024/02/02 15:14:56 INFO mlflow.tracking.fluent: Experiment with name 'bank-churn-experiment' does not exist. Creating a new experiment.
2024/02/02 15:14:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2024/02/02 15:14:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Complete


### Prepare training data
We will do the standard train/test/validation split for this example. Then we will save the test dataframe for 'later use'...

In [5]:
# Separate the feature columns from the "Exited" target column.
X = df_clean.drop(columns="Exited")
y = df_clean["Exited"]

# 60% / 20% / 20% split into training, validation and test sets.
# Step 1: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SEED
)
# Step 2: take 25% of the remaining 80% (i.e. 20% overall) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=SEED
)

# Persist the test features as a delta table. Only X_test is written here;
# the matching labels (y_test) are not part of the saved table.
table_name = "df_test"
df_test = spark.createDataFrame(X_test)  # pandas -> PySpark DataFrame
(
    df_test.write
    .format("delta")
    .mode("overwrite")
    .save(f"Tables/{table_name}")
)
print(f"Spark test DataFrame saved to delta table: {table_name}")

StatementMeta(, 1b1d96a6-3a82-4376-9f35-d0531b1e07a6, 13, Finished, Available)

Spark test DataFrame saved to delta table: df_test


### Synthesise Minority Class
Only around 20% of the records are for customers who have left, so we will synthesise a few more minority-class samples with SMOTE!

In [6]:
# Oversample using SMOTE, fitted on the training split only
# (the validation and test splits are not passed in).
sm = SMOTE(random_state=SEED)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Resampled features and target side by side in a single DataFrame.
new_train = pd.concat((X_res, y_res), axis="columns")

StatementMeta(, 1b1d96a6-3a82-4376-9f35-d0531b1e07a6, 14, Finished, Available)

2024/02/02 15:22:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '07eb30aa-5423-4f89-a40b-5244f6e5477a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


### Training some Models
Now we will train three models: two Random Forests with different hyperparameters, and a LightGBM classifier.

In [8]:
# Train three classifiers on the SMOTE-balanced training data (X_res, y_res) and
# evaluate each of them on the validation split (X_val, y_val). Each model is
# fitted inside its own MLflow run, and autologging is re-configured before every
# run so that the fitted model is registered under a matching name.
#
# Notes that apply to all three runs:
# * Labels are passed as `y_res.to_numpy()`: this hands the estimator a 1-D array
#   without calling `Series.ravel()`, which is deprecated since pandas 2.2.
# * ROC AUC is computed on the validation split, like the classification report
#   and confusion matrix. Scoring the resampled training data instead would only
#   measure how well the model fits rows it has already seen.

# --- Random Forest 1: shallow trees (max_depth=4, max_features=4) ---
mlflow.sklearn.autolog(registered_model_name='rfc1_sm') # Register the trained model with autologging
rfc1_sm = RandomForestClassifier(max_depth=4, max_features=4, min_samples_split=3, random_state=1) # Pass hyperparameters
with mlflow.start_run(run_name="rfc1_sm") as run:
    rfc1_sm_run_id = run.info.run_id # Capture run_id for model prediction later
    print("run_id: {}; status: {}".format(rfc1_sm_run_id, run.info.status))
    rfc1_sm.fit(X_res, y_res.to_numpy()) # Balanced (SMOTE-resampled) training data
    # Return value is discarded.
    # NOTE(review): presumably called so autologging records the validation score - confirm.
    rfc1_sm.score(X_val, y_val)
    y_pred = rfc1_sm.predict(X_val)
    cr_rfc1_sm = classification_report(y_val, y_pred)
    cm_rfc1_sm = confusion_matrix(y_val, y_pred)
    roc_auc_rfc1_sm = roc_auc_score(y_val, rfc1_sm.predict_proba(X_val)[:, 1])

# --- Random Forest 2: deeper trees (max_depth=8, max_features=6) ---
mlflow.sklearn.autolog(registered_model_name='rfc2_sm') # Register the trained model with autologging
rfc2_sm = RandomForestClassifier(max_depth=8, max_features=6, min_samples_split=3, random_state=1) # Pass hyperparameters
with mlflow.start_run(run_name="rfc2_sm") as run:
    rfc2_sm_run_id = run.info.run_id # Capture run_id for model prediction later
    print("run_id: {}; status: {}".format(rfc2_sm_run_id, run.info.status))
    rfc2_sm.fit(X_res, y_res.to_numpy()) # Balanced (SMOTE-resampled) training data
    # Return value is discarded.
    # NOTE(review): presumably called so autologging records the validation score - confirm.
    rfc2_sm.score(X_val, y_val)
    y_pred = rfc2_sm.predict(X_val)
    cr_rfc2_sm = classification_report(y_val, y_pred)
    cm_rfc2_sm = confusion_matrix(y_val, y_pred)
    roc_auc_rfc2_sm = roc_auc_score(y_val, rfc2_sm.predict_proba(X_val)[:, 1])

# --- LightGBM ---
mlflow.lightgbm.autolog(registered_model_name='lgbm_sm') # Register the trained model with autologging
# NOTE(review): LightGBM documents `eval_metric` as an argument of `fit()`;
# confirm that passing it to the constructor has the intended effect.
lgbm_sm_model = LGBMClassifier(learning_rate = 0.07, 
                        max_delta_step = 2, 
                        n_estimators = 100,
                        max_depth = 10, 
                        eval_metric = "logloss", 
                        objective='binary', 
                        random_state=42)

with mlflow.start_run(run_name="lgbm_sm") as run:
    lgbm1_sm_run_id = run.info.run_id # Capture run_id for model prediction later
    lgbm_sm_model.fit(X_res, y_res.to_numpy()) # Balanced (SMOTE-resampled) training data
    y_pred = lgbm_sm_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    cr_lgbm_sm = classification_report(y_val, y_pred)
    cm_lgbm_sm = confusion_matrix(y_val, y_pred)
    roc_auc_lgbm_sm = roc_auc_score(y_val, lgbm_sm_model.predict_proba(X_val)[:, 1])

StatementMeta(, 1b1d96a6-3a82-4376-9f35-d0531b1e07a6, 16, Finished, Available)

Registered model 'rfc1_sm' already exists. Creating a new version of this model...
2024/02/02 15:26:53 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: rfc1_sm, version 2
Created version '2' of model 'rfc1_sm'.


run_id: 63229277-2910-4336-aad1-338183f91fa7; status: RUNNING


Registered model 'rfc2_sm' already exists. Creating a new version of this model...
2024/02/02 15:27:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: rfc2_sm, version 2
Created version '2' of model 'rfc2_sm'.


run_id: ffa48a75-18fa-4638-8d6f-3e7a94d62103; status: RUNNING


Registered model 'lgbm_sm' already exists. Creating a new version of this model...
2024/02/02 15:27:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: lgbm_sm, version 2
Created version '2' of model 'lgbm_sm'.


[LightGBM] [Info] Number of positive: 4804, number of negative: 4804
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2125
[LightGBM] [Info] Number of data points in the train set: 9608, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
