In [28]:
import mlflow.pyfunc
from typing import Any, Dict, Union
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

1. Pipeline
    * preprocessing that needs to be trained (fitted on the training data)
    * train model
    * model explanation & visualization
2. Algorithm agnostic
    * the pipeline should be able to encompass multiple algorithms
    * some components of the pipeline (e.g., explanation) may need algorithm-specific logic built in so that the pipeline exposes a uniform API

Steps

1. The minimalist algorithm agnostic pipeline
    * static preprocessing
    * accept any model
    * demo different algorithms
2. Pipeline with custom preprocessor
    * custom preprocessor 1 for numeric features
    * demo the pipeline
    * custom preprocessor 2 that can handle categorical features too
    * demo using the same pipeline but calling the advanced preprocessor
3. Add explainer


# The Minimalist Algorithm Agnostic Pipeline

* minimal preprocessing
* algorithm agnostic: accepts any scikit-learn-compatible model

In [25]:
class ML_PIPELINE(mlflow.pyfunc.PythonModel):
    """
    MLflow Python model that wraps any scikit-learn compatible classifier
    (e.g., XGBoost, LightGBM, RandomForest) behind one fit/predict API.

    The same static preprocessing (dropping the first column) is applied both
    when training and when predicting, so callers always pass raw input frames.

    Attributes:
        model (BaseEstimator or None): The wrapped estimator. It must expose
            ``fit`` and ``predict_proba``. ``None`` until one is supplied.
        config (Any or None): Optional configuration; stored but not read by
            any method of this class.
    """

    def __init__(self, model: BaseEstimator = None, config: Any = None):
        """
        Store the estimator and the optional configuration.

        Parameters:
            model (BaseEstimator, optional): A scikit-learn compatible model
                (e.g., LightGBM, XGBoost). May be assigned later via ``self.model``.
            config (Any, optional): Optional configuration for the model.
        """
        self.model = model
        self.config = config

    def _require_model(self) -> BaseEstimator:
        """Return the wrapped estimator, failing loudly if none was configured."""
        if self.model is None:
            # Without this guard the caller only sees
            # "'NoneType' object has no attribute 'fit'", which hides the real cause.
            raise ValueError(
                "ML_PIPELINE has no model: pass a scikit-learn compatible "
                "estimator as `model=` before calling fit() or predict()."
            )
        return self.model

    def preprocess_input(self, model_input: pd.DataFrame) -> pd.DataFrame:
        """
        Preprocess the input data by dropping the first column.

        The input frame is never modified; a new frame is returned.

        Parameters:
            model_input (pd.DataFrame): The input DataFrame to preprocess.

        Returns:
            pd.DataFrame: A new DataFrame without the first column.

        Raises:
            IndexError: If ``model_input`` has no columns.
        """
        # Non-inplace drop already returns a new frame, so no defensive copy of
        # the (possibly large) input is needed beforehand.
        return model_input.drop(columns=model_input.columns[0])

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "ML_PIPELINE":
        """
        Train the model using the preprocessed training data.

        Parameters:
            X_train (pd.DataFrame): The training input data (features).
            y_train (pd.Series): The target values for the training data.

        Returns:
            ML_PIPELINE: ``self``, so calls can be chained as with scikit-learn
            estimators.

        Raises:
            ValueError: If no model has been configured.
        """
        estimator = self._require_model()
        estimator.fit(self.preprocess_input(X_train), y_train)
        return self

    def predict(self, context: Any, model_input: pd.DataFrame) -> Any:
        """
        Return class probabilities from the wrapped model.

        Parameters:
            context (Any): Context object supplied by MLflow at prediction time;
                not used by this implementation (``None`` is fine for direct calls).
            model_input (pd.DataFrame): The raw input data to predict on. The same
                preprocessing as in ``fit`` is applied before predicting.

        Returns:
            Any: The result of the wrapped model's ``predict_proba``.

        Raises:
            ValueError: If no model has been configured.
        """
        estimator = self._require_model()
        return estimator.predict_proba(self.preprocess_input(model_input))


In [29]:
# Hyperparameters shared by the boosted-tree classifier below.
params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)

# verbose=-1 is passed alongside the shared hyperparameters.
# The pipeline is algorithm agnostic: another scikit-learn compatible
# classifier (e.g. xgb.XGBClassifier()) can be substituted for `model` here.
model = lgb.LGBMClassifier(verbose=-1, **params)

ml_pipeline = ML_PIPELINE(model=model)

In [30]:

# Create a synthetic dataset
N_SAMPLES = 100
RANDOM_SEED = 42

# Seeded generator: without it Feature3 (and therefore the fitted model and its
# predictions) would change on every Restart & Run All.
rng = np.random.default_rng(RANDOM_SEED)

data = pd.DataFrame({
    'Feature1': [i + 5 for i in range(N_SAMPLES)],  # Feature1 = i + 5
    'Feature2': [2 * i - 10 for i in range(N_SAMPLES)],  # Feature2 = 2 * i - 10
    'Feature3': rng.uniform(low=0.0, high=1.0, size=N_SAMPLES),  # Uniform noise in [0, 1)
    'Target': [1 if (i + 5) + 2 * i - 10 > 30 else 0 for i in range(N_SAMPLES)]  # 1 when Feature1 + Feature2 > 30
})

X = data.drop('Target', axis=1)
y = data['Target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# Note: the pipeline's preprocessing drops the first column (Feature1), so the
# model is trained on Feature2 and Feature3 only.
ml_pipeline.fit(X_train, y_train)
y_pred = ml_pipeline.predict(context = None, model_input = X_test)

# Sanity check: one prediction row per test sample.
assert len(y_pred) == len(X_test)