In [1]:
# %pip install -Uqqq "mlflow>=3.0" xgboost optuna uv

In [2]:
from typing import Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype
# from statsmodels.graphics.mosaicplot import mosaic

import xgboost as xgb

import mlflow
from mlflow.models import infer_signature


In [3]:
def create_regression_data(
    n_samples: int,
    n_features: int,
    seed: int = 42,
    noise_level: float = 0.3,
    nonlinear: bool = True,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Generates synthetic regression data with interesting correlations for MLflow and XGBoost demonstrations.

    Base features are drawn uniformly from [-5, 5]. The target is a random linear
    combination of the first ``n_features // 2`` features (the rest are irrelevant
    by construction), optionally enriched with nonlinear terms, plus Gaussian noise
    whose standard deviation is ``noise_level`` times the standard deviation of the
    noise-free target.

    Besides the ``feature_{i}`` columns, the returned DataFrame always contains
    ``feature_cyclical`` and ``feature_with_outliers``, and additionally
    ``feature_correlated`` when ``n_features >= 7``. None of these extra columns
    enter the target computation directly.

    Args:
        n_samples (int): Number of samples (rows) to generate.
        n_features (int): Number of base feature columns (``feature_0`` ...).
        seed (int, optional): Random seed for reproducibility. Defaults to 42.
        noise_level (float, optional): Noise standard deviation, expressed as a
            fraction of the target's standard deviation. Defaults to 0.3.
        nonlinear (bool, optional): Whether to add nonlinear terms to the target.
            Each term is only added when the feature it needs exists. Defaults to True.

    Returns:
        Tuple[pd.DataFrame, pd.Series]:
            - pd.DataFrame: DataFrame containing the synthetic features.
            - pd.Series: Series named ``target`` containing the regression target.

    Example:
        >>> df, target = create_regression_data(n_samples=1000, n_features=10)
    """
    # NOTE: the order of the rng draws below is part of the contract; changing it
    # changes the data produced for a given seed.
    rng = np.random.RandomState(seed)

    # Base continuous features and their column names
    X = rng.uniform(-5, 5, size=(n_samples, n_features))
    columns = [f"feature_{i}" for i in range(n_features)]
    df = pd.DataFrame(X, columns=columns)

    # Linear part of the target: only the first half of the features is informative
    n_informative = n_features // 2
    weights = rng.uniform(-2, 2, size=n_informative)
    target = np.dot(X[:, :n_informative], weights)

    if nonlinear:
        # Square term for the first feature (guarded like every other term so that
        # a feature-less request does not index a missing column)
        if n_features >= 1:
            target += 0.5 * X[:, 0] ** 2

        # Interaction between the second and third features
        if n_features >= 3:
            target += 1.5 * X[:, 1] * X[:, 2]

        # Sine transformation of the fourth feature
        if n_features >= 4:
            target += 2 * np.sin(X[:, 3])

        # Exponential of the fifth feature, scaled down
        if n_features >= 5:
            target += 0.1 * np.exp(X[:, 4] / 2)

        # Threshold (step) effect for the sixth feature
        if n_features >= 6:
            target += 3 * (X[:, 5] > 1.5).astype(float)

    # Gaussian noise scaled relative to the spread of the noise-free target
    noise = rng.normal(0, noise_level * target.std(), size=n_samples)
    target += noise

    # Extra feature correlated with feature_0 (not used in the target calculation)
    if n_features >= 7:
        df['feature_correlated'] = df['feature_0'] * 0.8 + rng.normal(0, 0.2, size=n_samples)

    # Cyclical feature: two full sine periods across the row order
    df['feature_cyclical'] = np.sin(np.linspace(0, 4 * np.pi, n_samples))

    # Standard-normal feature with ~1% of rows replaced by values in [10, 15]
    df['feature_with_outliers'] = rng.normal(0, 1, size=n_samples)
    outlier_idx = rng.choice(n_samples, size=n_samples // 100, replace=False)
    df.loc[outlier_idx, 'feature_with_outliers'] = rng.uniform(10, 15, size=len(outlier_idx))

    return df, pd.Series(target, name='target')

In [4]:
# 2. Exploratory data analysis (EDA) visualizations
# Before training your model, it’s essential to examine your data. Visualizations help you validate that the data is as expected, 
# spot unexpected anomalies, and drive feature selection. As you move forward with model development, these visualizations 
# serve as a record of your work that can help with troubleshooting, reproducibility, and collaboration.
# You can use MLflow to log visualizations, making your experimentation fully reproducible.

In [5]:
def plot_feature_distributions(X: pd.DataFrame, y: pd.Series, n_cols: int = 3) -> plt.Figure:
    """
    Draws one histogram with a KDE overlay per column of ``X`` on a shared grid.

    Args:
        X (pd.DataFrame): DataFrame whose columns are plotted, one panel each.
        y (pd.Series): Target variable. Accepted for a uniform plotting signature;
            it is not used by this plot.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure holding the histograms.
    """
    column_names = X.columns
    total = len(column_names)
    # Ceiling division: enough rows to give every feature its own panel
    n_rows = -(-total // n_cols)

    # squeeze=False always yields a 2-D array, so a single flatten handles every grid shape
    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    panels = grid.ravel()

    for panel, name in zip(panels, column_names):
        sns.histplot(X[name], ax=panel, kde=True, color='skyblue')
        panel.set_title(f'Distribution of {name}')

    # Panels beyond the last feature stay in the grid but are hidden
    for spare in panels[total:]:
        spare.set_visible(False)

    fig.tight_layout()
    fig.suptitle('Feature Distributions', y=1.02, fontsize=16)
    # Closing stops the notebook from rendering the figure inline; the object stays usable
    plt.close(fig)
    return fig

def plot_correlation_heatmap(X: pd.DataFrame, y: pd.Series) -> plt.Figure:
    """
    Renders an annotated heatmap of pairwise correlations between features and target.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Target variable; it is added as a ``target`` column before
            the correlations are computed.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure holding the heatmap.
    """
    # Correlate on a copy that carries the target alongside the features
    correlations = X.assign(target=y).corr()

    fig, ax = plt.subplots(figsize=(12, 10))

    # Diverging palette centred on zero so positive and negative correlations contrast
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        correlations,
        ax=ax,
        cmap=palette,
        center=0,
        annot=True,
        fmt='.2f',
        square=True,
        linewidths=0.5,
    )
    ax.set_title('Feature Correlation Heatmap', fontsize=16)

    # Closing stops the notebook from rendering the figure inline; the object stays usable
    plt.close(fig)
    return fig

def plot_feature_target_relationships(X: pd.DataFrame, y: pd.Series, n_cols: int = 3) -> plt.Figure:
    """
    Plots every feature against the target as a scatter plot with a fitted regression line.

    Args:
        X (pd.DataFrame): DataFrame containing features, one panel per column.
        y (pd.Series): Series containing the target variable (plotted on the y axis).
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure holding the scatter grid.
    """
    column_names = list(X.columns)
    count = len(column_names)
    # Number of grid rows: full rows plus one more for any remainder
    whole_rows, leftover = divmod(count, n_cols)
    n_rows = whole_rows + (1 if leftover else 0)

    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    flat_axes = grid.reshape(-1)

    for position, column in enumerate(column_names):
        panel = flat_axes[position]
        # Blue, semi-transparent points with the linear fit drawn in red
        sns.regplot(
            x=X[column],
            y=y,
            ax=panel,
            scatter_kws={'alpha': 0.5, 'color': 'blue'},
            line_kws={'color': 'red'},
        )
        panel.set_title(f'{column} vs Target')

    # Hide grid cells that have no feature assigned
    for unused in flat_axes[count:]:
        unused.set_visible(False)

    fig.tight_layout()
    fig.suptitle('Feature vs Target Relationships', y=1.02, fontsize=16)
    # Closing stops the notebook from rendering the figure inline; the object stays usable
    plt.close(fig)
    return fig

def plot_pairwise_relationships(X: pd.DataFrame, y: pd.Series, features: list[str]) -> plt.Figure:
    """
    Creates a pairplot showing relationships between selected features and the target.

    Names in ``features`` that are not columns of ``X`` are silently ignored. When
    none of them is valid, a placeholder figure carrying an explanatory message is
    returned instead of a pairplot.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Series containing the target variable; added to the plot
            as a ``target`` column.
        features (List[str]): List of feature names to include in the plot.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure containing the pairplot,
        or the placeholder figure when no requested feature exists in ``X``.
    """
    # Keep only the requested names that actually exist in the DataFrame
    valid_features = [f for f in features if f in X.columns]

    if not valid_features:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "No valid features provided", ha='center', va='center')
        # Close here as well, so this branch behaves like the normal return path
        # (no stray inline rendering, no figure left open in pyplot's registry)
        plt.close(fig)
        return fig

    # Combine selected features and target
    data = X[valid_features].copy()
    data['target'] = y

    # Lower-triangle pairplot with KDEs on the diagonal
    pairgrid = sns.pairplot(data, diag_kind="kde",
                            plot_kws={"alpha": 0.6, "s": 50},
                            corner=True)

    # `figure` is the preferred accessor; `fig` is documented as deprecated
    fig = pairgrid.figure
    fig.suptitle("Pairwise Feature Relationships", y=1.02, fontsize=16)
    plt.close(fig)
    return fig

def plot_boxplots(X: pd.DataFrame, y: pd.Series, n_cols: int = 3) -> plt.Figure:
    """
    Draws, for every feature, box plots of its values grouped by target tercile.

    The target is split into three equal-frequency bins labelled Low, Medium and
    High; each panel shows one feature's distribution within each bin.

    Args:
        X (pd.DataFrame): DataFrame containing features, one panel per column.
        y (pd.Series): Series containing the target variable used for binning.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure holding the box plots.
    """
    column_names = X.columns
    total = len(column_names)
    n_rows = (total - 1 + n_cols) // n_cols

    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    panels = grid.flatten()

    # Equal-frequency target bins serve as the grouping variable on the x axis
    target_band = pd.qcut(y, 3, labels=['Low', 'Medium', 'High'])

    for panel, name in zip(panels, column_names):
        sns.boxplot(x=target_band, y=X[name], ax=panel)
        panel.set_title(f'Distribution of {name} by Target Range')
        panel.set_xlabel('Target Range')

    # Grid cells without a feature are hidden rather than left empty
    for extra in panels[total:]:
        extra.set_visible(False)

    fig.tight_layout()
    fig.suptitle('Feature Distributions by Target Range', y=1.02, fontsize=16)
    # Closing stops the notebook from rendering the figure inline; the object stays usable
    plt.close(fig)
    return fig

def plot_outliers(X: pd.DataFrame, n_cols: int = 3) -> plt.Figure:
    """
    Draws one horizontal box plot per feature so that outlying values stand out.

    Args:
        X (pd.DataFrame): DataFrame containing features, one panel per column.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The (already closed) matplotlib Figure holding the box plots.
    """
    column_names = list(X.columns)
    total = len(column_names)
    # Ceiling division: enough rows to give every feature its own panel
    n_rows = -(-total // n_cols)

    fig, grid = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    panels = grid.ravel()

    for idx in range(total):
        name, panel = column_names[idx], panels[idx]
        sns.boxplot(x=X[name], ax=panel, color='skyblue')
        panel.set_title(f'Outlier Detection for {name}')
        panel.set_xlabel(name)

    # Hide whatever panels remain after the last feature
    for idx in range(total, len(panels)):
        panels[idx].set_visible(False)

    fig.tight_layout()
    fig.suptitle('Outlier Detection for Features', y=1.02, fontsize=16)
    # Closing stops the notebook from rendering the figure inline; the object stays usable
    plt.close(fig)
    return fig

In [6]:
# 3. Standard modeling workflow
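# The code in the next cell does the following:

# Uses the function you created, create_regression_data, to create a dataset.
# Uses the visualization functions you created to create EDA plots.
# Configures an XGBoost regressor, splits the data into training and test sets, and trains the model.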

In [7]:
# Build the dataset, create the EDA figures, and train an XGBoost regressor.
# Every name assigned here is notebook-global and is read again by the MLflow
# logging cell, so statement order and variable names matter.

# Create the regression dataset
n_samples = 1000
n_features = 10
X, y = create_regression_data(n_samples=n_samples, n_features=n_features, nonlinear=True)

# Create EDA plots (computed on the full dataset, before the train/test split)
dist_plot = plot_feature_distributions(X, y)
corr_plot = plot_correlation_heatmap(X, y)
scatter_plot = plot_feature_target_relationships(X, y)
# Rank features by absolute correlation with the target and pairplot the top four
corr_with_target = X.corrwith(y).abs().sort_values(ascending=False)
top_features = corr_with_target.head(4).index.tolist()
pairwise_plot = plot_pairwise_relationships(X, y, top_features)
outlier_plot = plot_outliers(X)

# Configure the XGBoost model
reg = xgb.XGBRegressor(
    tree_method="hist",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
)

# Create train/test split to properly evaluate the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7722)

# Train the model with evaluation.
# eval_set order matters: the logging cell reads "validation_0" as the training
# RMSE and "validation_1" as the test RMSE.
reg.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False
)

# Predictions on the held-out split (not used by any plot in this cell)
y_pred = reg.predict(X_test)
# NOTE(review): despite its name, `residual_plot` holds the feature box plots grouped
# by target range (built from the full X, y), not a residual plot. The name is kept
# because the MLflow logging cell logs it as "feature_boxplots_by_target.png".
residual_plot = plot_boxplots(X, y)



In [8]:
# 4. Log the model using MLflow
# When you log a model using MLflow on Databricks, important artifacts and metadata are captured. This ensures that
# your model is not only reproducible but also ready for deployment with all necessary dependencies and clear API contracts.

In [9]:
# Log the trained model, its hyperparameters, metrics and EDA figures to MLflow,
# then evaluate the logged model on the held-out split.

# mlflow.evaluate takes features and ground truth in a single table, so attach the
# test targets to a copy of the test features under the "label" column.
evaluation_data = X_test.copy()
evaluation_data["label"] = y_test

# Everything logged inside this block is attached to one MLflow run
with mlflow.start_run() as run:

    # Per-iteration RMSE recorded during fit(). The keys follow the order of the
    # eval_set passed to fit(): validation_0 = training split, validation_1 = test split.
    evals_result = reg.evals_result()
    final_train_rmse = evals_result["validation_0"]["rmse"][-1]
    final_test_rmse = evals_result["validation_1"]["rmse"][-1]

    # Hyperparameters actually set on the model (unset / None entries are dropped).
    # NOTE: despite its name, `feature_map` holds model parameters, not features;
    # the name is kept so any later cell that refers to it keeps working.
    feature_map = {key: value for key, value in reg.get_xgb_params().items() if value is not None}

    # Input/output schema of the model, inferred from the full feature table and
    # the model's predictions on it
    signature = infer_signature(X, reg.predict(X))

    mlflow.log_params(feature_map)

    # MLflow 3 identifies the logged model through `name`; `artifact_path` is
    # deprecated there, so `name` is used in both branches.
    if hasattr(reg, '_estimator_type'):
        # scikit-learn style estimator: log with the sklearn flavor
        model_info = mlflow.sklearn.log_model(
            sk_model=reg,
            name="model",
            input_example=X.iloc[[0]],       # single-row example of valid input
            signature=signature,
            registered_model_name="xgboost_regression_model"
        )
    else:
        # Otherwise log the underlying Booster with the native xgboost flavor
        model_info = mlflow.xgboost.log_model(
            xgb_model=reg.get_booster(),
            name="model",
            input_example=X.iloc[[0]],       # single-row example of valid input
            signature=signature,
            registered_model_name="xgboost_regression_model"
        )

    # Final-iteration RMSE on both splits
    mlflow.log_metric("train_rmse", final_train_rmse)
    mlflow.log_metric("test_rmse", final_test_rmse)

    # EDA figures created in the modeling cell
    mlflow.log_figure(dist_plot, "feature_distributions.png")
    mlflow.log_figure(corr_plot, "correlation_heatmap.png")
    mlflow.log_figure(scatter_plot, "feature_target_relationships.png")
    mlflow.log_figure(pairwise_plot, "pairwise_relationships.png")
    mlflow.log_figure(outlier_plot, "outlier_detection.png")
    # `residual_plot` holds the per-feature box plots by target range (see artifact name)
    mlflow.log_figure(residual_plot, "feature_boxplots_by_target.png")

    # Gain-based feature importance chart, drawn on an explicit Axes instead of
    # relying on pyplot's "current axes" state
    fig, ax = plt.subplots(figsize=(10, 8))
    xgb.plot_importance(reg, ax=ax, importance_type='gain')
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.close(fig)                               # avoid inline rendering; fig stays usable
    mlflow.log_figure(fig, "feature_importance.png")

    # Evaluate the logged model (loaded back through its URI) on the held-out data.
    # NOTE(review): the default evaluator presumably records its metrics on the
    # active run; `result` is kept for inspection in later cells.
    result = mlflow.evaluate(
        model=model_info.model_uri,
        data=evaluation_data,
        targets="label",
        model_type="regressor",
        evaluators=["default"]
    )

    # Summary for the reader
    print(f"Model saved at: {model_info.model_uri}")
    print(f"Practice score (Train RMSE): {final_train_rmse:.4f}")
    print(f"Test score (Test RMSE): {final_test_rmse:.4f}")

2025/12/19 01:50:33 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/19 01:50:33 INFO mlflow.store.db.utils: Updating database tables
2025/12/19 01:50:33 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/19 01:50:33 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/19 01:50:33 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/19 01:50:34 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/19 01:50:34 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/19 01:50:34 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/19 01:50:34 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/19 01:50:34 INFO alembic.runtime.migration: Running 

Model saved at: models:/m-cbae450bdf2445f484e8766b551e6095
Practice score (Train RMSE): 0.9295
Test score (Test RMSE): 7.1552
