# Tracking experiments on Databricks MLflow

## Install Dependencies

In [2]:
%pip install -q mlflow databricks-sdk

Note: you may need to restart the kernel to use updated packages.


## Set Up Authentication for Databricks CE (Community Edition)

### Use the API mlflow.login()

* Databricks Host: Use ```https://community.cloud.databricks.com/```
* Username: The email address you use to sign in to Databricks CE, e.g. ```datanerd07@gmail.com```
* Password: Your Databricks CE password: ```*****```

In [34]:
import mlflow

# Interactive login: when no valid Databricks credentials are found, this
# prompts for the host, username and password listed above.
mlflow.login()

2024/05/24 12:00:34 INFO mlflow.utils.credentials: No valid Databricks credentials found, please enter your credentials...
2024/05/24 12:01:02 INFO mlflow.utils.credentials: Successfully connected to MLflow hosted tracking server! Host: https://community.cloud.databricks.com.


In [35]:
# Point MLflow tracking at Databricks so runs are recorded there
# (presumably using the workspace authenticated by mlflow.login() — confirm).
mlflow.set_tracking_uri("databricks")

## Checking the Databricks connection with a test experiment

In [36]:
# Smoke test: select a throwaway experiment to confirm the connection works.
mlflow.set_experiment("/check-databricks-connection")

# Log two dummy metrics in a single run; the values themselves are meaningless.
with mlflow.start_run():
    mlflow.log_metric("foo", 1)
    mlflow.log_metric("bar", 2)

## Tracking Victor's ML model experiments in Databricks MLflow

- RandomForestClassifier
- KNeighborsClassifier
- GradientBoostingClassifier
- DecisionTreeClassifier
- AdaBoostClassifier

In [37]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.base import BaseEstimator
from typing import Dict, Any

def load_data(filename: str) -> pd.DataFrame:
    """
    Read a CSV file from the ``data`` folder of the current working directory.

    Parameters:
    filename (str): Name of the CSV file located in ``<cwd>/data``.

    Returns:
    pd.DataFrame: The contents of the CSV file.
    """
    # The path is resolved against the working directory, not this file's location.
    csv_path = os.path.join(os.getcwd(), 'data', filename)
    frame = pd.read_csv(csv_path)
    return frame

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the last two columns and the CLIENTNUM column from the dataframe.

    Parameters:
    df (pd.DataFrame): The raw input dataframe; it is not modified.

    Returns:
    pd.DataFrame: A new dataframe without the two trailing columns and
    without CLIENTNUM.
    """
    # Everything except the two trailing columns is kept.
    kept_columns = df.columns[:-2]
    trimmed = df[kept_columns]
    return trimmed.drop(columns=['CLIENTNUM'])

def save_cleaned_data(df: pd.DataFrame) -> None:
    """
    Save the cleaned data to ``cleaned_data.csv`` inside a ``data`` folder.

    When run from a script, the file is written to ``<script dir>/../data``.
    Inside a notebook or REPL, where ``__file__`` does not exist, it is
    written to ``<cwd>/data`` instead. The folder is created if it is missing.

    Parameters:
    df (pd.DataFrame): The cleaned dataframe to save.

    Returns:
    None
    """
    try:
        # Script execution: keep the original layout relative to this file.
        data_dir = os.path.join(os.path.dirname(__file__), '..', 'data')
    except NameError:
        # __file__ is undefined in notebooks; fall back to the folder that
        # load_data reads from.
        data_dir = os.path.join(os.getcwd(), 'data')

    os.makedirs(data_dir, exist_ok=True)
    df.to_csv(os.path.join(data_dir, 'cleaned_data.csv'), index=False)

def get_model(X_train: pd.DataFrame, model: BaseEstimator) -> ImbPipeline:
    """
    Assemble an unfitted pipeline: preprocessing, SMOTE oversampling, classifier.

    Parameters:
    X_train (pd.DataFrame): The training features; only the column dtypes are
        inspected here, nothing is fitted.
    model (BaseEstimator): The estimator placed at the end of the pipeline.

    Returns:
    ImbPipeline: Pipeline with the steps 'preprocessor', 'smote' and 'classifier'.
    """
    categorical_dtypes = ['object', 'category']

    # Columns are split by dtype: object/category on one side, everything else
    # on the other, so together the two groups cover every column.
    cat_features = X_train.select_dtypes(include=categorical_dtypes).columns
    num_features = X_train.select_dtypes(exclude=categorical_dtypes).columns

    # Non-categorical columns: mean imputation followed by standardisation.
    numeric_steps = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

    # Categorical columns: one-hot encoding; categories unseen during fit are ignored.
    categorical_steps = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    column_transformer = ColumnTransformer(
        transformers=[
            ('cat', categorical_steps, cat_features),
            ('num', numeric_steps, num_features),
        ],
        remainder='passthrough',
    )

    steps = [
        ('preprocessor', column_transformer),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ]
    return ImbPipeline(steps)

def train_model(model_pipe: ImbPipeline, X_train: pd.DataFrame, y_train: pd.Series) -> ImbPipeline:
    """
    Fit the given pipeline on the training data.

    Parameters:
    model_pipe (ImbPipeline): The pipeline containing preprocessing and model.
    X_train (pd.DataFrame): The training features.
    y_train (pd.Series): The training labels.

    Returns:
    ImbPipeline: The same pipeline object that was passed in, after fit()
    has been called on it.
    """
    fitted_pipe = model_pipe
    fitted_pipe.fit(X_train, y_train)
    return fitted_pipe

def evaluate_model(trained_model: ImbPipeline, X_test: pd.DataFrame, y_test: pd.Series) -> Dict[str, Any]:
    """
    Score a fitted pipeline on the test data.

    Parameters:
    trained_model (ImbPipeline): The fitted model pipeline.
    X_test (pd.DataFrame): The test features.
    y_test (pd.Series): The test labels.

    Returns:
    Dict[str, Any]: Keys 'accuracy', 'roc_auc', 'classification_report'
    (the text report) and 'y_pred' (the predicted labels).
    """
    predictions = trained_model.predict(X_test)
    # Only the second probability column is used for the ROC AUC.
    second_class_scores = trained_model.predict_proba(X_test)[:, 1]

    results: Dict[str, Any] = {}
    # Accuracy is the share of predictions that equal the true label.
    results['accuracy'] = (predictions == y_test).mean()
    results['roc_auc'] = roc_auc_score(y_test, second_class_scores)
    results['classification_report'] = classification_report(y_test, predictions)
    results['y_pred'] = predictions
    return results

In [39]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from sklearn.model_selection import cross_validate, train_test_split
from mlflow.models.signature import infer_signature
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from typing import Tuple

import matplotlib.pyplot as plt

# Every run started after this call is recorded under this experiment.
mlflow.set_experiment("/Victor Models Experiment")

# Load and preprocess data
# load_data reads the file from <cwd>/data; preprocess_data drops the last two
# columns and CLIENTNUM.
filename = 'BankChurners.csv'
df = load_data(filename)
df = preprocess_data(df)

def split_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Separate features from the 'Attrition_Flag' target and split 80/20.

    Parameters:
    df (pd.DataFrame): The preprocessed dataframe, including 'Attrition_Flag'.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: The result of
    train_test_split with a fixed seed, unpacked by the caller as
    X_train, X_test, y_train, y_test.
    """
    target_column = 'Attrition_Flag'
    features = df.drop(columns=target_column)
    target = df[target_column]
    return train_test_split(features, target, test_size=0.2, random_state=42)

def log_model_and_metrics(trained_model: Any, X_train: pd.DataFrame, y_train: pd.Series, evaluation_results: Dict[str, Any], model_type: str) -> None:
    """
    Log the trained model, its metrics and the classification report to MLflow.

    Must be called while an MLflow run is active. The classification report is
    first written to ``metrics/classification_report.txt`` (the folder is
    created if needed) and then uploaded as an artifact.

    Parameters:
    trained_model (Any): The trained machine learning model.
    X_train (pd.DataFrame): The features used for training the model; used to
        infer the input schema of the model signature.
    y_train (pd.Series): The target variable used for training the model; used
        to infer the output schema of the model signature.
    evaluation_results (Dict[str, Any]): Must contain the keys 'accuracy',
        'roc_auc' and 'classification_report'.
    model_type (str): The type of the model; stored as a signature parameter.

    Returns:
    None
    """
    # Log the model together with a signature inferred from the training data
    model_signature = infer_signature(X_train, y_train, params={'model_type': model_type})
    print(model_signature.to_dict())
    mlflow.sklearn.log_model(
        sk_model=trained_model,
        artifact_path="trained_model",
        conda_env=None,
        signature=model_signature,
        # NOTE: pass registered_model_name="..." here to also register the model.
    )

    # Log metrics
    mlflow.log_metric('accuracy', evaluation_results['accuracy'])
    mlflow.log_metric('roc_auc', evaluation_results['roc_auc'])

    # The report is staged on disk first; make sure the folder exists so a
    # fresh checkout does not fail with FileNotFoundError.
    report_path = "metrics/classification_report.txt"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(evaluation_results['classification_report'])
    mlflow.log_artifact(report_path, artifact_path="metrics")

def save_and_log_plots(trained_model: Any, X_test: pd.DataFrame, y_test: pd.Series) -> None:
    """
    Save the ROC and Precision-Recall curves as PNG files and log them to MLflow.

    Must be called while an MLflow run is active. The images are written to
    the local ``metrics`` folder (created if needed) and uploaded under the
    ``metric_graphs`` artifact path.

    Parameters:
    trained_model (Any): The trained machine learning model.
    X_test (pd.DataFrame): The features used for testing the model.
    y_test (pd.Series): The target variable used for testing the model.

    Returns:
    None
    """
    # Ensure the staging folder exists before savefig writes into it.
    os.makedirs("metrics", exist_ok=True)

    # ROC curve; the returned display object is not needed afterwards.
    RocCurveDisplay.from_estimator(trained_model, X_test, y_test)
    plt.title("ROC Curve")
    plt.savefig("metrics/roc_curve.png")
    plt.close()  # close the figure so the next plot starts clean
    mlflow.log_artifact("metrics/roc_curve.png", artifact_path="metric_graphs")

    # Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(trained_model, X_test, y_test)
    plt.title("Precision-Recall Curve")
    plt.legend()
    plt.savefig("metrics/precision_recall_curve.png")
    plt.close()
    mlflow.log_artifact("metrics/precision_recall_curve.png", artifact_path="metric_graphs")

def cross_validate_model(trained_model: Any, X_train: pd.DataFrame, y_train: pd.Series) -> None:
    """
    Run 5-fold cross-validation on accuracy and print the averaged results.

    The averages are only printed; nothing is returned or logged to MLflow.

    Parameters:
    trained_model (Any): The estimator handed to cross_validate.
    X_train (pd.DataFrame): The features used for cross-validation.
    y_train (pd.Series): The target variable used for cross-validation.

    Returns:
    None
    """
    cv_results = cross_validate(trained_model, X_train, y_train, cv=5, scoring='accuracy', return_train_score=True)

    # (printed label, key in the cross_validate result), in output order
    summary_rows = (
        ("Mean Test Accuracy:", 'test_score'),
        ("Mean Train Accuracy:", 'train_score'),
        ("Mean Fit Time:", 'fit_time'),
        ("Mean Score Time:", 'score_time'),
    )
    for label, result_key in summary_rows:
        print(label, round(cv_results[result_key].mean(), 3))

# -- Split the data into train and test sets
# These module-level names are read directly by mlflow_experiment_run, which
# does not take the data as parameters.
X_train, X_test, y_train, y_test = split_data(df)

def mlflow_experiment_run(run_name: str, model_name: str, ml_model: Any, model_type: str) -> None:
    """
    Run an MLflow experiment.

    Inside one MLflow run this function builds the pipeline around
    ``ml_model``, trains it, logs train/test scores, the model, the evaluation
    metrics and the curve plots, and finally prints cross-validation results.

    The data is not passed in: the module-level ``X_train``, ``X_test``,
    ``y_train`` and ``y_test`` are used.

    Parameters:
    run_name (str): The name of the MLflow run.
    model_name (str): The name of the model; printed as a banner.
    ml_model (Any): The machine learning model to be trained.
    model_type (str): The type of the model; stored in the model signature.

    Returns:
    None
    """
    with mlflow.start_run(run_name=run_name):
        print('--------------------')
        print(model_name)
        print('--------------------')
        # -- Get the model
        model = get_model(X_train, ml_model)

        # -- Train the model
        trained_model = train_model(model, X_train, y_train)
        train_score = round(trained_model.score(X_train, y_train), 3)

        mlflow.log_metric('train_score', train_score)
        print("train_score:", train_score)

        # -- Evaluate on the held-out test set and log model + metrics
        evaluation_results = evaluate_model(trained_model, X_test, y_test)
        log_model_and_metrics(trained_model, X_train, y_train, evaluation_results, model_type)

        # Logged with the same precision as train_score so the two can be
        # compared side by side in the MLflow UI.
        test_score = round(trained_model.score(X_test, y_test), 3)
        mlflow.log_metric('test_score', test_score)
        print("test_score:", test_score)

        print("accuracy:", round(evaluation_results['accuracy'], 3))
        print("roc_auc:", round(evaluation_results['roc_auc'], 3))
        print("classification_report:", evaluation_results['classification_report'])

        # -- Plots and cross-validation summary
        save_and_log_plots(trained_model, X_test, y_test)
        cross_validate_model(trained_model, X_train, y_train)

# One MLflow run per classifier, each with default hyperparameters.
# Columns: run name, console banner, estimator, model_type stored in the signature.
experiments = [
    ('RandomForestClassifier', 'RANDOMFORESTCLASSIFIER', RandomForestClassifier(), 'RandomForest'),
    ('KNeighborsClassifier', 'KNEIGHBORSCLASSIFIER', KNeighborsClassifier(), 'KNeighbors'),
    ('GradientBoostClassifier', 'GRADIENTBOOSTCLASSIFIER', GradientBoostingClassifier(), 'GradientBoost'),
    ('DecisionTreeClassifier', 'DECISIONTREECLASSIFIER', DecisionTreeClassifier(), 'DecisionTree'),
    ('AdaBoostClassifier', 'ADABOOSTCLASSIFIER', AdaBoostClassifier(), 'AdaBoost'),
]

for run_name, model_name, ml_model, model_type in experiments:
    mlflow_experiment_run(run_name, model_name, ml_model, model_type)

--------------------
RANDOMFORESTCLASSIFIER
--------------------
train_score: 1.0




{'inputs': '[{"type": "long", "name": "Customer_Age", "required": true}, {"type": "string", "name": "Gender", "required": true}, {"type": "long", "name": "Dependent_count", "required": true}, {"type": "string", "name": "Education_Level", "required": true}, {"type": "string", "name": "Marital_Status", "required": true}, {"type": "string", "name": "Income_Category", "required": true}, {"type": "string", "name": "Card_Category", "required": true}, {"type": "long", "name": "Months_on_book", "required": true}, {"type": "long", "name": "Total_Relationship_Count", "required": true}, {"type": "long", "name": "Months_Inactive_12_mon", "required": true}, {"type": "long", "name": "Contacts_Count_12_mon", "required": true}, {"type": "double", "name": "Credit_Limit", "required": true}, {"type": "long", "name": "Total_Revolving_Bal", "required": true}, {"type": "double", "name": "Avg_Open_To_Buy", "required": true}, {"type": "double", "name": "Total_Amt_Chng_Q4_Q1", "required": true}, {"type": "long



{'inputs': '[{"type": "long", "name": "Customer_Age", "required": true}, {"type": "string", "name": "Gender", "required": true}, {"type": "long", "name": "Dependent_count", "required": true}, {"type": "string", "name": "Education_Level", "required": true}, {"type": "string", "name": "Marital_Status", "required": true}, {"type": "string", "name": "Income_Category", "required": true}, {"type": "string", "name": "Card_Category", "required": true}, {"type": "long", "name": "Months_on_book", "required": true}, {"type": "long", "name": "Total_Relationship_Count", "required": true}, {"type": "long", "name": "Months_Inactive_12_mon", "required": true}, {"type": "long", "name": "Contacts_Count_12_mon", "required": true}, {"type": "double", "name": "Credit_Limit", "required": true}, {"type": "long", "name": "Total_Revolving_Bal", "required": true}, {"type": "double", "name": "Avg_Open_To_Buy", "required": true}, {"type": "double", "name": "Total_Amt_Chng_Q4_Q1", "required": true}, {"type": "long



{'inputs': '[{"type": "long", "name": "Customer_Age", "required": true}, {"type": "string", "name": "Gender", "required": true}, {"type": "long", "name": "Dependent_count", "required": true}, {"type": "string", "name": "Education_Level", "required": true}, {"type": "string", "name": "Marital_Status", "required": true}, {"type": "string", "name": "Income_Category", "required": true}, {"type": "string", "name": "Card_Category", "required": true}, {"type": "long", "name": "Months_on_book", "required": true}, {"type": "long", "name": "Total_Relationship_Count", "required": true}, {"type": "long", "name": "Months_Inactive_12_mon", "required": true}, {"type": "long", "name": "Contacts_Count_12_mon", "required": true}, {"type": "double", "name": "Credit_Limit", "required": true}, {"type": "long", "name": "Total_Revolving_Bal", "required": true}, {"type": "double", "name": "Avg_Open_To_Buy", "required": true}, {"type": "double", "name": "Total_Amt_Chng_Q4_Q1", "required": true}, {"type": "long



{'inputs': '[{"type": "long", "name": "Customer_Age", "required": true}, {"type": "string", "name": "Gender", "required": true}, {"type": "long", "name": "Dependent_count", "required": true}, {"type": "string", "name": "Education_Level", "required": true}, {"type": "string", "name": "Marital_Status", "required": true}, {"type": "string", "name": "Income_Category", "required": true}, {"type": "string", "name": "Card_Category", "required": true}, {"type": "long", "name": "Months_on_book", "required": true}, {"type": "long", "name": "Total_Relationship_Count", "required": true}, {"type": "long", "name": "Months_Inactive_12_mon", "required": true}, {"type": "long", "name": "Contacts_Count_12_mon", "required": true}, {"type": "double", "name": "Credit_Limit", "required": true}, {"type": "long", "name": "Total_Revolving_Bal", "required": true}, {"type": "double", "name": "Avg_Open_To_Buy", "required": true}, {"type": "double", "name": "Total_Amt_Chng_Q4_Q1", "required": true}, {"type": "long



train_score: 0.948




{'inputs': '[{"type": "long", "name": "Customer_Age", "required": true}, {"type": "string", "name": "Gender", "required": true}, {"type": "long", "name": "Dependent_count", "required": true}, {"type": "string", "name": "Education_Level", "required": true}, {"type": "string", "name": "Marital_Status", "required": true}, {"type": "string", "name": "Income_Category", "required": true}, {"type": "string", "name": "Card_Category", "required": true}, {"type": "long", "name": "Months_on_book", "required": true}, {"type": "long", "name": "Total_Relationship_Count", "required": true}, {"type": "long", "name": "Months_Inactive_12_mon", "required": true}, {"type": "long", "name": "Contacts_Count_12_mon", "required": true}, {"type": "double", "name": "Credit_Limit", "required": true}, {"type": "long", "name": "Total_Revolving_Bal", "required": true}, {"type": "double", "name": "Avg_Open_To_Buy", "required": true}, {"type": "double", "name": "Total_Amt_Chng_Q4_Q1", "required": true}, {"type": "long



Mean Test Accuracy: 0.943
Mean Train Accuracy: 0.949
Mean Fit Time: 1.09
Mean Score Time: 0.014
