> ### **MLFLow-laboratory**
> 
> 
> #### **a. Choose a model**
> All scikit-learn models are available with one single import at `project_root/lib/models/sklearn.py`
> ```py
> # Import a model with:
> from lib.models.sklearn import RandomForestClassifier
> rfc = RandomForestClassifier()*
>
> # Or import all classes in a dict at once with:
> from lib.models.sklearn import SKLEARN_CLASSIFIERS
> # ... and instantiate them with:
> rfc = SKLEARN_CLASSIFIERS['RandomForestClassisier'](bootstrap=True)
> svc = SKLEARN_CLASSIFIERS['SVC']()
> ```
> #### **b. Select hyperparameters**
> Automatic hyperparameter tuning is performed with Optuna library,
> More information is available at [https://optuna.readthedocs.io/](https://optuna.readthedocs.io/en/stable/tutorial/index.html).  
> 
> **WIP** An hyperparameter bank is being built, see at `project_root/lib/hp/...`
> The goal is to store their name, a range if they are numerical or a list of choices.  
> Here's an example:
> ```py
> # Hyperparameters for RandomForestClassifier
> RFC_SPACE = {
>     'classifier__n_estimators': (20, 200),
>     'classifier__max_depth': (10, 100),
>     'classifier__min_samples_split': (2, 20),
>     'classifier__min_samples_leaf': (1, 2),
>     'classifier__max_features': ['sqrt', 'log2', None],
>     'classifier__criterion': ['gini', 'entropy'],
> }
> ```
> Hyperparameter naming convention is `XXXC_SPACE` (ex `KNNC_SPACE`) for classification, / XXXR_SPACE (ex `RFR_SPACE`) for regression.  
> 
>  Load hyperparameters:
> ```bash
> # Enable custom magics
> %load_ext custom_magics
> # load a set of hyperparameters
> %load_variables ../lib/hp/sklearn.py RFC_SPACE
> ```
> #### **a. Retrieve scripts**
> 
> Paste and execute one of these commands into a code cell to retrieve a mlflow script:
> 
> **Classification**  
> - Binary: `%load ../scripts/binary_classification.py`
> 
> **Regression**  
> - #TODO



#### **Base config**

In [1]:
DATASET_PATH = '../../2_DATASET_COLLECTION/churn.csv'


In [2]:
# CONFIG
import sys
sys.path.append('..')

#### **Load the model**

In [3]:
# LOAD THE MODEL
from lib.models.sklearn import SKLEARN_CLSSIFIERS
clf = SKLEARN_CLSSIFIERS['GradientBoostingClassifier']()

#### **Load hyperparameters**

In [4]:
%load_ext custom_magics

The custom_magics module is not an IPython extension.


In [5]:
#%load_variables lib/hp/sklearn.py GBC_SPACE
GBC_SPACE = {   'classifier__ccp_alpha': (0, 1),
    'classifier__criterion': ['friedman_mse', 'squared_error'],
    'classifier__learning_rate': (0.001, 1, 'log'),
    'classifier__loss': ['log_loss', 'exponential'],
    'classifier__max_depth': (2, 10),
    'classifier__max_features': [None, 'sqrt', 'log2'],
    'classifier__max_leaf_nodes': (2, 50),
    'classifier__min_impurity_decrease': (0, 1),
    'classifier__min_samples_leaf': (1, 20),
    'classifier__min_samples_split': (2, 20),
    'classifier__n_estimators': (50, 500),
    'classifier__subsample': (0.5, 1)}


#### **Load the script**

In [7]:
# %load ../scripts/binary_classification.py
# file: binary_classification.py

# FIND REGULAR IMPORTS IN laboratory/config.py
from laboratory.config import *
from laboratory.mlflow import get_or_create_experiment, set_mlflow_tracking_uri_from_env

handle_warnings()
env_vars = get_environment()
set_mlflow_tracking_uri_from_env(env_vars)

# Custom modules
import laboratory.dataset as dataset
import laboratory.sklearn as sklearn
import laboratory.tuning as tuning
from laboratory.mlflow import get_run_name
from laboratory.artifacts import log_confusion_matrix, log_roc_curve
from lib.models.sklearn import RandomForestClassifier

#################### SETUP ####################

DATASET_PATH = DATASET_PATH # Placeholder 
DF = pd.read_csv(DATASET_PATH)
TARGET_NAME = 'Exited'
FEATURES_TO_DROP = ['CustomerId', 'Surname']
DATASET_SPLIT_PARAMS = {'test_size': 0.2, 'stratify': DF[TARGET_NAME], 'random_state': 42}

CLASSIFIER = clf
SPACE = GBC_SPACE 
SAVE_MODEL = False

OPTUNA_STUDY_TRIALS = 30
OPTUNA_METRIC_TO_MAXIMIZE = 'test_f1_score'


EXPERIMENT_NAME = DATASET_PATH.split('/')[-1]
RUN_NAME = get_run_name(run_name=None)


#################### MAIN ####################

if __name__ == '__main__':
    df = DF.copy()
    df = df.drop(columns=FEATURES_TO_DROP)    
    
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=TARGET_NAME), df[TARGET_NAME], **DATASET_SPLIT_PARAMS
    )
    
    num_features = X_train.select_dtypes([np.number]).columns.tolist()
    cat_features = X_train.columns.difference(num_features).tolist()

    PREPROCESSING_PIPELINE = ColumnTransformer(
        transformers=[
            ('numerical', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), num_features),
            ('categorical', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder())
            ]), cat_features)
        ]
    )

    BINARY_CLASSIFICATION_PIPELINE = Pipeline(steps=[
        ('preprocessing', PREPROCESSING_PIPELINE),
        ('classifier', CLASSIFIER)
    ])

    experiment_id = get_or_create_experiment(EXPERIMENT_NAME)
    
    with mlflow.start_run(run_name=RUN_NAME):
        # Create an Optuna study
        study = optuna.create_study(direction='maximize')

        # Optimize the objective function
        study.optimize(
            partial(
                tuning.objective_function,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                pipeline=BINARY_CLASSIFICATION_PIPELINE,
                param_space=SPACE,
                metric_to_maximize=OPTUNA_METRIC_TO_MAXIMIZE
            ),
            n_trials=OPTUNA_STUDY_TRIALS,
            callbacks=[tuning.champion_callback]
        )

        best_params = study.best_params
        print("BEST PARAMS FROM main(): ", best_params)

        BINARY_CLASSIFICATION_PIPELINE.set_params(**best_params)
        BINARY_CLASSIFICATION_PIPELINE.fit(X_train, y_train)
        y_pred = BINARY_CLASSIFICATION_PIPELINE.predict(X_test)
        y_pred_proba = BINARY_CLASSIFICATION_PIPELINE.predict_proba(X_test)

        metrics = tuning.get_classification_metrics(y_test, y_pred, y_pred_proba, prefix='best_model_test')
        log_confusion_matrix(y_test, y_pred)
        log_roc_curve(y_test, y_pred_proba)

        mlflow.log_params(best_params)
        mlflow.log_metrics(metrics)
        if SAVE_MODEL:
            mlflow.sklearn.log_model(sklearn, "best_model")

[I 2024-03-26 17:39:08,871] A new study created in memory with name: no-name-246f5b7c-c5e6-4e48-b585-f784af88649a


/home/mln/GIT/mlflow_laboratory/notebooks
['custom_magics.py', 'template copy.ipynb', 'backup', '__pycache__', 'test_run..ipynb', 'template.ipynb']
MINIO_ENDPOINT_URL: set
MINIO_ACCESS_KEY: set
MINIO_SECRET_KEY: set
MLFLOW_TRACKING_URI: http://localhost:5000
mlflow tracking URI has been set to  http://localhost:5000


[I 2024-03-26 17:39:09,324] Trial 0 finished with value: 0.0 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'squared_error', 'classifier__learning_rate': 0.159869835741585, 'classifier__loss': 'exponential', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 24, 'classifier__min_impurity_decrease': 1, 'classifier__min_samples_leaf': 13, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 342, 'classifier__subsample': 0.9721967735944547}. Best is trial 0 with value: 0.0.
[I 2024-03-26 17:39:10,116] Trial 1 finished with value: 0.0 and parameters: {'classifier__ccp_alpha': 1, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.9362020396921018, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': 'log2', 'classifier__max_leaf_nodes': 43, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 9, 'classifier__min_samples_split': 18, 'classif

Initial trial 9 achieved value: 0.6063348416289592


[I 2024-03-26 17:39:26,941] Trial 11 finished with value: 0.6021180030257186 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.1298584152446308, 'classifier__loss': 'exponential', 'classifier__max_depth': 10, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 16, 'classifier__min_impurity_decrease': 1, 'classifier__min_samples_leaf': 17, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 476, 'classifier__subsample': 0.9890805823575998}. Best is trial 9 with value: 0.6063348416289592.
[I 2024-03-26 17:39:27,575] Trial 12 finished with value: 0.60790273556231 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.13555982115321877, 'classifier__loss': 'log_loss', 'classifier__max_depth': 7, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 13, 'classifier__min_impurity_decrease': 1, 'classifier__min_samples_leaf': 18

Trial 12 achieved value: 0.60790273556231 with  0.2579% improvement


[I 2024-03-26 17:39:28,125] Trial 13 finished with value: 0.593167701863354 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.18802493917584895, 'classifier__loss': 'log_loss', 'classifier__max_depth': 7, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 11, 'classifier__min_impurity_decrease': 1, 'classifier__min_samples_leaf': 16, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 439, 'classifier__subsample': 0.814844491301611}. Best is trial 12 with value: 0.60790273556231.
[I 2024-03-26 17:39:28,836] Trial 14 finished with value: 0.30612244897959184 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.014149359865314376, 'classifier__loss': 'log_loss', 'classifier__max_depth': 7, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 2, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 20, '

Trial 20 achieved value: 0.6079295154185022 with  0.0044% improvement


[I 2024-03-26 17:39:34,754] Trial 21 finished with value: 0.5915080527086384 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.3454955943053342, 'classifier__loss': 'log_loss', 'classifier__max_depth': 5, 'classifier__max_features': 'log2', 'classifier__max_leaf_nodes': 3, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 18, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 440, 'classifier__subsample': 0.6738023084309211}. Best is trial 20 with value: 0.6079295154185022.
[I 2024-03-26 17:39:35,791] Trial 22 finished with value: 0.6 and parameters: {'classifier__ccp_alpha': 0, 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.10248145604714531, 'classifier__loss': 'log_loss', 'classifier__max_depth': 6, 'classifier__max_features': 'log2', 'classifier__max_leaf_nodes': 7, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 19, 'classifier__min

Trial 29 achieved value: 0.6099706744868035 with  0.3346% improvement
BEST PARAMS FROM main():  {'classifier__ccp_alpha': 0, 'classifier__criterion': 'squared_error', 'classifier__learning_rate': 0.15954636778473621, 'classifier__loss': 'exponential', 'classifier__max_depth': 7, 'classifier__max_features': 'sqrt', 'classifier__max_leaf_nodes': 10, 'classifier__min_impurity_decrease': 0, 'classifier__min_samples_leaf': 13, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 374, 'classifier__subsample': 0.943749154226001}
