In [None]:
import os
import mlflow
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product

# Sklearn modules
from sklearn.model_selection import train_test_split, ParameterGrid, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline


# Set GIT_PYTHON_REFRESH to 'quiet' for environments where git is not installed
# (e.g. inside a Docker container).
# NOTE(review): presumably this silences GitPython's "git executable not found"
# complaint triggered through MLflow — confirm.
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'

# Ignore every warning, of any category, for the rest of the session.
# This is a blanket filter: warnings emitted by the libraries used below are hidden too.
warnings.filterwarnings('ignore')

# Helper functions

### Metrics related code

In [None]:
def plot_cv_metrics(cv_metrics: dict) -> plt.Figure:
    """
    Plots cross-validation metrics, one subplot per metric, two subplots per row.

    Parameters:
        cv_metrics (dict): Mapping of metric name to the sequence of values recorded for it
                           (for example, the dictionary returned by `cross_validate`).

    Returns:
        fig (plt.Figure): The generated matplotlib figure. The figure is closed before it is
                          returned, so it is not displayed automatically by pyplot.
    """
    # Set plot style to 'fivethirtyeight' only for the duration of this function
    with plt.style.context(style='fivethirtyeight'):
        # Two subplots per row; keep at least one row so an empty mapping still yields a valid figure
        rows_needed = max(1, int(np.ceil(len(cv_metrics) / 2)))

        # squeeze=False keeps the axes array two-dimensional even when there is a single row;
        # without it a one-row grid is returned as a 1-D array and 2-D indexing fails
        fig, ax = plt.subplots(rows_needed, 2, figsize=(15, rows_needed * 3), squeeze=False)

        # Row-major flattening gives the same placement as ax[index // 2, index % 2]
        axes = ax.ravel()

        # Draw each metric on its own subplot, using the position in the sequence as the x value
        for axis, (metric, y_values) in zip(axes, cv_metrics.items()):
            x_values = np.arange(len(y_values))
            axis.plot(x_values, y_values)
            axis.set_title(metric)

        # Hide subplots that received no metric (odd number of metrics or an empty mapping)
        for axis in axes[len(cv_metrics):]:
            axis.set_visible(False)

        # Adjust subplot layout for better spacing
        fig.tight_layout()

        # Close the figure to release pyplot's reference to it; the object itself stays usable
        plt.close(fig)

    # Return the generated figure
    return fig


def compute_metrics(y_true: pd.Series, y_pred: pd.Series, metrics: list, decimals: int = 3, prefix: str = '') -> dict:
    """
    Evaluates every requested metric on a pair of true and predicted targets.

    Parameters:
        y_true (pd.Series): Series of true target values.
        y_pred (pd.Series): Series of predicted target values.
        metrics (list): List of dictionaries describing the metrics.
                        Each dictionary must provide the 'name' and 'function' keys;
                        the function is called as function(y_true, y_pred).
        decimals (int): Number of decimal places each metric value is rounded to. Default is 3.
        prefix (str): Text prepended to every metric name in the result. Default is an empty string.

    Returns:
        dict: Maps '<prefix><name>' to the rounded metric value, in the order of `metrics`.
    """
    results = {}

    for spec in metrics:
        # Build the reported name first, then evaluate and round the metric
        key = f"{prefix}{spec['name']}"
        score = spec['function'](y_true, y_pred)
        results[key] = round(score, decimals)

    return results


def compute_aggregated_metrics(cv_metrics: dict, decimals: int = 3) -> dict:
    """
    Summarises each cross-validation metric with its mean, standard deviation and median.

    Parameters:
        cv_metrics (dict): Maps a metric name to the sequence of values collected for it.
        decimals (int): Number of decimal places the statistics are rounded to. Default is 3.

    Returns:
        dict: For every metric, the keys '<name>_mean', '<name>_std' and '<name>_median'
              with the rounded statistics. The standard deviation is computed with np.std
              using its default settings.
    """
    # Suffix of the result key paired with the numpy reduction that produces it
    aggregators = (('mean', np.mean), ('std', np.std), ('median', np.median))

    return {
        f'{metric_name}_{suffix}': round(reduce_values(values), decimals)
        for metric_name, values in cv_metrics.items()
        for suffix, reduce_values in aggregators
    }


def make_scorers_dict(metrics: list[dict]) -> dict:
    """
    Builds a name-to-scorer mapping from a list of metric descriptions.

    Parameters:
        metrics (list): List of dictionaries describing the metrics.
                        Each dictionary must provide the 'name' and 'scorer' keys.

    Returns:
        dict: Maps each metric name to its scorer. If a name occurs more than once,
              the last occurrence wins.
    """
    scorers = {}

    for entry in metrics:
        scorers[entry['name']] = entry['scorer']

    return scorers


def root_mean_squared_error(y_true: pd.Series, y_pred: pd.Series) -> float:
    """
    Computes the root mean squared error (RMSE) of a set of predictions.

    Parameters:
        y_true (pd.Series): Series of true target values.
        y_pred (pd.Series): Series of predicted target values.

    Returns:
        float: Square root of sklearn's mean squared error between y_true and y_pred.
    """
    # RMSE is the square root of the mean squared error reported by sklearn
    return np.sqrt(mean_squared_error(y_true, y_pred))



### Search space code

In [None]:
def parse_search_space(search_space: dict[str, list[dict]]) -> list:
    """
    Expands a search space into every pipeline configuration it describes.

    Parameters:
        search_space (dict): Maps a pipeline step name to a list of candidate dictionaries.
                             Each candidate has an 'object' key (a class or other callable that
                             builds the step, or None) and an optional 'params' key holding a
                             parameter grid understood by ParameterGrid.

    Returns:
        list: One entry per combination of candidates. Each entry is a tuple of
              (step_name, step_instance) pairs ordered like the keys of search_space.
              Every entry holds its own freshly constructed instances, so fitting the steps
              of one configuration never modifies the steps of another.
    """
    candidates_per_step = {}

    # Collect, for every step, the recipes (constructor, keyword arguments) of its candidates.
    # Construction is deferred so that each combination can receive separate instances.
    for step, step_objects in search_space.items():
        candidates = []

        for step_object in step_objects:
            obj = step_object.get('object')
            params = step_object.get('params')

            if not obj:
                # No constructor given: the value itself (e.g. None) is used as the step;
                # kwargs=None marks it as "do not call"
                candidates.append((obj, None))
            elif params:
                # One recipe per point of the parameter grid
                candidates.extend((obj, dict(p)) for p in ParameterGrid(params))
            else:
                # No parameters: construct with the defaults
                candidates.append((obj, {}))

        candidates_per_step[step] = candidates

    step_names = list(candidates_per_step)

    def _build(candidate):
        """Turns a (constructor, kwargs) recipe into a new step instance."""
        obj, kwargs = candidate
        return obj if kwargs is None else obj(**kwargs)

    # Cartesian product over the steps, instantiating new objects for every combination
    return [
        tuple((name, _build(candidate)) for name, candidate in zip(step_names, combination))
        for combination in product(*candidates_per_step.values())
    ]

# Modeling 

## Search space

### Description

The `search_space` dictionary represents a space of possible configurations for a machine learning pipeline, typically used in hyperparameter tuning or model selection processes. It consists of two main components: scalers and models.

#### 1. Scalers:
- The `'scaler'` key contains a list of dictionaries, each representing a scaler to be used in the pipeline.
- Each dictionary contains an `'object'` key, which refers to the scaler class to be used. If `'object'` is `None`, it indicates no scaling will be applied.
- Example scalers included are `StandardScaler` and `MinMaxScaler`.

#### 2. Models:
- The `'model'` key contains a list of dictionaries, each representing a machine learning model along with its hyperparameters.
- Each dictionary contains an `'object'` key, referring to the model class to be used, and a `'params'` key, which holds a dictionary of hyperparameters and their corresponding values to be explored.
- Example models included are `RandomForestRegressor` and `ElasticNet`.
- For `RandomForestRegressor`, hyperparameters such as `'n_estimators'` and `'max_depth'` are specified with a range of values to be explored.
- For `ElasticNet`, hyperparameters such as `'alpha'` and `'l1_ratio'` are specified with arrays of values to be explored.

Overall, `search_space` encapsulates a range of possible configurations for a machine learning pipeline, including different scalers and models with various hyperparameter settings.


In [None]:
# Candidate components for each pipeline step. Every entry names the class to instantiate
# under 'object' and, optionally, a hyperparameter grid under 'params'.
# In total this describes 3 scalers x (6 random forests + 4 elastic nets) = 30 pipelines.
search_space = {
    'scaler': [
        {
            'object': None  # no scaling
        },
        {
            'object': StandardScaler
        },
        {
            'object': MinMaxScaler
        }
    ],
    'model': [
        {
            'object': RandomForestRegressor,
            'params': {
                'n_estimators': [10, 100, 1000],
                'max_depth': [10, None]
            }
        },
        {
            'object': ElasticNet,
            'params': {
                'alpha': np.arange(0, 1, 0.5),  # evaluates to [0.0, 0.5]
                'l1_ratio': np.arange(0, 1, 0.5)  # evaluates to [0.0, 0.5]
            }
        }
    ]
}

## Metrics

The `metrics` list contains dictionaries, each representing a metric used to evaluate model performance. Each dictionary consists of the following keys:

- `'name'`: the name under which the metric is reported.
- `'function'`: the function that computes the metric from the true and predicted values.
- `'scorer'`: a scorer created from that function using `make_scorer`, used during cross-validation.

The `greater_is_better` parameter specifies whether higher values of the metric indicate better performance.


In [None]:
# Metrics used for evaluation. 'function' is called directly as function(y_true, y_pred);
# 'scorer' wraps the same function for use with cross-validation.
# The error metrics use greater_is_better=False, which makes the scorer return the
# negated value, so their cross-validation scores are negative numbers.
metrics = [
    {
        'name': 'mean_absolute_error',
        'function': mean_absolute_error,
        'scorer': make_scorer(mean_absolute_error, greater_is_better=False)
    },
    {
        'name': 'mean_squared_error',
        'function': mean_squared_error,
        'scorer': make_scorer(mean_squared_error, greater_is_better=False)
    },
    {
        'name': 'root_mean_squared_error',
        'function': root_mean_squared_error,
        'scorer': make_scorer(root_mean_squared_error, greater_is_better=False)
    },
    {
        'name': 'r2_score',
        'function': r2_score,
        'scorer': make_scorer(r2_score, greater_is_better=True)
    },
]

## Configuration
These configurations specify settings for tracking experiments, naming the experiment, defining the model name, and specifying the path to store the model artifact.

In [None]:
# Address of the MLflow tracking server
TRACKING_URI = "http://tracking_server:5000"
# Name of the MLflow experiment that collects the runs of this notebook
EXPERIMENT_NAME = "regression-diabetes"
# Name of the registered model in the MLflow model registry
MODEL_NAME = "diabetes_model"
# Artifact path, relative to the run's artifact root, under which the model is logged
MODEL_ARTIFACT_PATH = 'model'

In [None]:
# Point MLflow at the tracking server configured in TRACKING_URI
mlflow.set_tracking_uri(TRACKING_URI)

# Make EXPERIMENT_NAME the active experiment for the runs started below
mlflow.set_experiment(EXPERIMENT_NAME)

## Load data

In [None]:
# Path of the CSV dataset, relative to the notebook's working directory
filename = './data/diabetes.csv'

# Name of the column that holds the target variable
target_variable = 'target'

In [None]:
# Load the CSV file into a DataFrame
df = pd.read_csv(filename)

## Split data

In [None]:
# Hold out 10% of the rows for the final evaluation; the fixed seed keeps the split reproducible
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=42)

# Separate the feature columns (X) from the target column (y) of the training set
X_train, y_train = df_train.drop(columns=target_variable), df_train[target_variable]

# Separate the feature columns (X) from the target column (y) of the holdout set
X_holdout, y_holdout = df_holdout.drop(columns=target_variable), df_holdout[target_variable]

In [None]:
# Wrap the training and holdout DataFrames as MLflow datasets, recording the source file
# and the target column, so they can be attached to each run
train_dataset = mlflow.data.from_pandas(df_train, source=filename, targets=target_variable)
holdout_dataset = mlflow.data.from_pandas(df_holdout, source=filename, targets=target_variable)

## Grid search

In [None]:
# Maximum number of pipeline configurations to run.
# Only the first LIMIT entries of the parsed search space are explored; raise this value
# to cover more of the search space.
LIMIT = 2

# Iterate over the first LIMIT pipeline configurations from the search space
for pipeline_steps in parse_search_space(search_space)[:LIMIT]:
    # Create a pipeline using the current configuration
    pipeline = Pipeline(pipeline_steps)
    
    # Tags identifying the estimator used in the 'model' step of this run
    tags = {
        'estimator_name': type(pipeline['model']).__name__,  # Name of the estimator
        'estimator_class': str(type(pipeline['model']))      # Class of the estimator
    }
    
    # Start an MLflow run with the defined tags; everything logged below belongs to this run
    with mlflow.start_run(tags=tags) as run:
        # Fit the pipeline on the full training set; this fitted pipeline is the one that is
        # logged as a model and evaluated on the holdout set further down
        pipeline.fit(X_train, y_train)
        
        # Log the fitted pipeline as an MLflow model artifact. The signature is inferred from
        # the training features and the pipeline's predictions on them.
        mlflow.sklearn.log_model(
            sk_model=pipeline, 
            artifact_path=MODEL_ARTIFACT_PATH, 
            signature=mlflow.models.infer_signature(
                model_input=X_train, 
                model_output=pipeline.predict(X_train)
            )
        )

        # Log pipeline parameters
        pipeline_params = pipeline.get_params()
        mlflow.log_params(pipeline_params)

        # Evaluate the pipeline with 5-fold cross-validation on the training data,
        # collecting train and test scores for every scorer defined in `metrics`
        cv_metrics = cross_validate(
            estimator=pipeline, 
            X=X_train, 
            y=y_train, 
            cv=5,
            return_train_score=True, 
            scoring=make_scorers_dict(metrics)
        )
        # Log mean, standard deviation and median of every entry returned by cross_validate
        cv_metrics_aggregated = compute_aggregated_metrics(cv_metrics)
        mlflow.log_metrics(cv_metrics_aggregated)

        # Plot cross-validation metrics and log the figure
        cv_fig = plot_cv_metrics(cv_metrics)
        mlflow.log_figure(cv_fig, "graphs/cross_validation_metrics.png")

        # Evaluate the pipeline on the holdout data; the 'holdout_' prefix produces metric
        # names such as 'holdout_mean_absolute_error'
        holdout_metrics = compute_metrics(
            y_true=y_holdout, 
            y_pred=pipeline.predict(X_holdout), 
            metrics=metrics, 
            prefix='holdout_'
        )
        mlflow.log_metrics(holdout_metrics)

        # Log SHAP explanations for the holdout predictions
        mlflow.shap.log_explanation(pipeline.predict, X_holdout)

        # Log datasets used for training and holdout
        mlflow.log_input(train_dataset, context="training")
        mlflow.log_input(holdout_dataset, context="holdout")

# Register model

In [None]:
from mlflow import MlflowClient

The client allows interaction with the MLflow tracking server to query runs, metrics, parameters, artifacts, etc.

In [None]:
# Create an MLflow client; no URI is passed here, so it relies on MLflow's current configuration
client = MlflowClient()

## Create registered model

In [None]:
# Attempt to create the registered model in MLflow. Any exception raised by the call
# (for example because a model with this name already exists) is caught and printed,
# so the notebook keeps running.
# NOTE(review): `except Exception` also hides unrelated failures, e.g. connection errors.
try:
    client.create_registered_model(
        name=MODEL_NAME,                                
        tags={'experiment': EXPERIMENT_NAME},           
        description='Model for diabetes prediction'     
    )
except Exception as e:
    print(e)

## Get the best model from the experiment

In [None]:
# Retrieve the experiment from MLflow by its name and display it
experiment = client.get_experiment_by_name(name=EXPERIMENT_NAME)
experiment

In [None]:
# Search for runs within the specified experiment using MLflow's search_runs function.
# `experiment_ids` is documented as a list of experiment IDs, so the single ID is wrapped in a list.
df_runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
df_runs

In [None]:
# Sort the DataFrame of runs by the 'holdout_mean_absolute_error' metric in ascending order,
# select the top row (i.e., the run with the lowest holdout MAE),
# reset the index to start from 0, and drop the original index.
# With pandas' default ordering, runs without this metric (NaN) are placed last.
df_best_run = df_runs.sort_values(by='metrics.holdout_mean_absolute_error').head(1).reset_index(drop=True)
df_best_run

In [None]:
# Retrieve the run ID of the best run from the DataFrame
run_id = df_best_run.loc[0, 'run_id']

# Get the artifact URI of the best run from the DataFrame
artifact_uri = df_best_run.loc[0, 'artifact_uri']

# Construct the source path for the model artifact: the run's artifact URI followed by
# MODEL_ARTIFACT_PATH, the artifact path the model was logged under
model_source = f"{artifact_uri}/{MODEL_ARTIFACT_PATH}"

# Show the values that will be used to create the model version
print("Run ID:", run_id)
print("Artifact URI:", artifact_uri)
print("Model Source:", model_source)

## Create model version

In [None]:
# Create a new version of the registered model from the best run's model artifact
mv = client.create_model_version(
    name=MODEL_NAME,        # Name of the registered model
    source=model_source,    # Source path of the model artifact
    run_id=run_id           # ID of the MLflow run associated with the model version
)
mv

## Set alias

In [None]:
# Alias to assign to the newly created model version
ALIAS = "staging"

In [None]:
# Point the ALIAS alias of the registered model at the model version created above
client.set_registered_model_alias(
    name=MODEL_NAME,          # Name of the registered model
    alias=ALIAS,              # Alias name for the model version
    version=mv.version        # Version number of the model version
)

# Load registered model

In [None]:
# Load the model version that the ALIAS alias points to.
# The URI is built from the ALIAS constant (instead of a hard-coded "staging") so that it
# always refers to the alias that was assigned to the model version earlier in the notebook.
model_staging = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@{ALIAS}")
model_staging

In [None]:
# Predict the target variable for the holdout features using the model loaded from the registry
y_pred_holdout = model_staging.predict(X_holdout)
y_pred_holdout

In [None]:
# Retrieve the metadata of the loaded model as a dictionary and display it
model_metadata = model_staging.metadata.to_dict()
model_metadata