> ### **MLflow-laboratory**
> 
> 
> #### **a. Choose a model**
> All scikit-learn models can be imported from a single module, `project_root/lib/models/sklearn.py`:
> ```py
> # Import a model with:
> from lib.models.sklearn import RandomForestClassifier
> rfc = RandomForestClassifier()
>
> # Or import all classes in a dict at once with:
> from lib.models.sklearn import SKLEARN_CLASSIFIERS
> # ... and instantiate them with:
> rfc = SKLEARN_CLASSIFIERS['RandomForestClassifier'](bootstrap=True)
> svc = SKLEARN_CLASSIFIERS['SVC']()
> ```
> #### **b. Select hyperparameters**
> Automatic hyperparameter tuning is performed with the Optuna library.
> More information is available at [https://optuna.readthedocs.io/](https://optuna.readthedocs.io/en/stable/tutorial/index.html).  
> 
> **WIP** A hyperparameter bank is being built; see `project_root/lib/hp/...`.
> The goal is to store, for each hyperparameter, its name and either a range (if it is numerical) or a list of choices.  
> Here's an example:
> ```py
> # Hyperparameters for RandomForestClassifier
> RFC_SPACE = {
>     'classifier__n_estimators': (20, 200),
>     'classifier__max_depth': (10, 100),
>     'classifier__min_samples_split': (2, 20),
>     'classifier__min_samples_leaf': (1, 2),
>     'classifier__max_features': ['sqrt', 'log2', None],
>     'classifier__criterion': ['gini', 'entropy'],
> }
> ```
> The naming convention for hyperparameter spaces is `XXXC_SPACE` (e.g. `KNNC_SPACE`) for classification and `XXXR_SPACE` (e.g. `RFR_SPACE`) for regression.  
> 
>  Load hyperparameters:
> ```bash
> # Enable custom magics
> %load_ext custom_magics
> # load a set of hyperparameters
> %load_variables ../lib/hp/sklearn.py RFC_SPACE
> ```
> #### **c. Retrieve scripts**
> 
> Paste and execute one of these commands in a code cell to retrieve an MLflow script:
> 
> **Classification**  
> - Binary: `%load ../scripts/binary_classification.py`
> 
> **Regression**  
> - #TODO



#### **Base config**

In [1]:
# CONFIG
# Put the parent directory on the import path so the project packages imported
# by later cells (e.g. `lib`) can be resolved from the notebook directory.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate '..' entries to sys.path.
import sys
if '..' not in sys.path:
    sys.path.append('..')

#### **Load the model**

In [2]:
# LOAD THE MODEL
# Look the estimator class up by name in the project's mapping of classifier
# classes and instantiate it with bootstrap disabled.
# NOTE(review): the mapping is spelled `SKLEARN_CLSSIFIERS` here but
# `SKLEARN_CLASSIFIERS` in the usage notes at the top of this notebook;
# confirm which spelling lib/models/sklearn.py actually exports.
from lib.models.sklearn import SKLEARN_CLSSIFIERS
rfc = SKLEARN_CLSSIFIERS['RandomForestClassifier'](bootstrap=False)

#### **Load hyperparameters**

In [3]:
%load_ext custom_magics

The custom_magics module is not an IPython extension.


In [4]:
#%load_variables lib/hp/sklearn.py RFC_SPACE
# Hyperparameter search space for RandomForestClassifier.
# Keys carry the `classifier__` prefix so they can be handed straight to the
# pipeline's set_params(); tuples are numeric ranges, lists are choices.
RFC_SPACE = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': (10, 100),
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__min_samples_leaf': (1, 2),
    'classifier__min_samples_split': (2, 20),
    'classifier__n_estimators': (20, 200),
}


#### **Load the script**

In [6]:
# %load ../scripts/binary_classification.py
# file: binary_classification.py
#
# Tunes a binary-classification pipeline with Optuna, then refits the pipeline
# with the best hyperparameters and logs its parameters, metrics, confusion
# matrix and ROC curve to MLflow.

# FIND REGULAR IMPORTS IN laboratory/config.py
# (the star import is expected to provide pd, np, mlflow, optuna, partial,
# handle_warnings and get_environment, all of which are used below)
from laboratory.config import *
from laboratory.mlflow import get_or_create_experiment, set_mlflow_tracking_uri_from_env

handle_warnings()
env_vars = get_environment()
set_mlflow_tracking_uri_from_env(env_vars)

# Custom modules
import laboratory.dataset as dataset
import laboratory.pipeline as pipeline
import laboratory.tuning as tuning
from laboratory.artifacts import log_confusion_matrix, log_roc_curve
from lib.hp.sklearn import RFC_SPACE
from lib.models.sklearn import RandomForestClassifier


#################### SETUP ####################
EXPERIMENT_NAME = 'optuna_experiment14'
RUN_NAME = 'hyperparameter_optimization'
DATASET_PATH = '../../../0_DATASETS/creditcard.csv'
# NOTE(review): CLASSIFIER is never referenced below; the pipeline is built by
# pipeline.get_binary_rfc_pipeline(), which does not receive it. Confirm
# whether it should be wired into the pipeline or dropped.
CLASSIFIER = RandomForestClassifier(bootstrap=False)
SPACE = RFC_SPACE
TARGET_NAME = 'Class'
SAVE_MODEL = False
N_TRIALS = 10  # number of Optuna trials to run

#################### MAIN ####################

if __name__ == '__main__':
    df = pd.read_csv(DATASET_PATH)
    # presumably shrinks the label-0 (majority) class; confirm the exact
    # meaning of zero_label_redux in laboratory.dataset
    df_redux = dataset.data_split_redux(df, TARGET_NAME, zero_label_redux=0.995)

    X_train, X_test, y_train, y_test = dataset.train_test_split(
        df_redux.drop(columns=TARGET_NAME), df_redux[TARGET_NAME], test_size=0.2, random_state=42
    )

    # Numeric columns by dtype; every remaining column is treated as categorical.
    num_features = X_train.select_dtypes([np.number]).columns.tolist()
    cat_features = X_train.columns.difference(num_features).tolist()

    # Named `model_pipeline` so the imported `pipeline` module is not shadowed.
    model_pipeline = pipeline.get_binary_rfc_pipeline(num_features, cat_features)

    # Activate the experiment explicitly so this run (and any run started by
    # the tuning helpers) is recorded under EXPERIMENT_NAME rather than under
    # whatever experiment happens to be active.
    experiment_id = get_or_create_experiment(EXPERIMENT_NAME)
    experiment = mlflow.set_experiment(experiment_id=experiment_id)
    print(f"Experiment: {experiment.name}, ID: {experiment.experiment_id}") # DEBUGGING

    with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME):
        # Create an Optuna study
        study = optuna.create_study(direction='maximize')

        # Optimize the objective function
        # NOTE(review): the test split is passed to the objective and is also
        # used for the final metrics below. If the objective scores on it, the
        # reported test metrics are optimistically biased; consider a separate
        # validation split.
        study.optimize(
            partial(
                tuning.objective_function,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                pipeline=model_pipeline,
                # Use the SETUP constant so changing SPACE actually takes effect.
                param_space=SPACE,
            ),
            n_trials=N_TRIALS,
            callbacks=[tuning.champion_callback]
        )

        best_params = study.best_params
        print("BEST PARAMS FROM main(): ", best_params)

        # Refit on the training split with the best hyperparameters found.
        model_pipeline.set_params(**best_params)
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        y_pred_proba = model_pipeline.predict_proba(X_test)

        metrics = tuning.get_classification_metrics(y_test, y_pred, y_pred_proba, prefix='best_model_test')
        log_confusion_matrix(y_test, y_pred)
        log_roc_curve(y_test, y_pred_proba)

        mlflow.log_params(best_params)
        mlflow.log_metrics(metrics)
        if SAVE_MODEL:
            mlflow.sklearn.log_model(model_pipeline, "best_model")



 

/home/mln/git_explore/perso/mlflow_laboratory/notebooks
['custom_magics.py', '__pycache__', 'template copy 2.ipynb', 'template copy 3.ipynb']
MINIO_ENDPOINT_URL: set
MINIO_ACCESS_KEY: set
MINIO_SECRET_KEY: set
MLFLOW_TRACKING_URI: http://localhost:5000
mlflow tracking URI has been set to  http://localhost:5000


[I 2024-03-26 13:34:20,250] A new study created in memory with name: no-name-b07b28b7-bce8-4d87-9c2a-55ac50ac6abc
[I 2024-03-26 13:34:20,401] Trial 0 finished with value: 0.8955223880597015 and parameters: {'classifier__n_estimators': 40, 'classifier__max_depth': 67, 'classifier__min_samples_split': 19, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'log2', 'classifier__criterion': 'entropy'}. Best is trial 0 with value: 0.8955223880597015.


Experiment: optuna_experiment14, ID: 3
Initial trial 0 achieved value: 0.8955223880597015


[I 2024-03-26 13:34:20,752] Trial 1 finished with value: 0.9064039408866995 and parameters: {'classifier__n_estimators': 21, 'classifier__max_depth': 10, 'classifier__min_samples_split': 19, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__criterion': 'gini'}. Best is trial 1 with value: 0.9064039408866995.


Trial 1 achieved value: 0.9064039408866995 with  1.2005% improvement


[I 2024-03-26 13:34:22,523] Trial 2 finished with value: 0.916256157635468 and parameters: {'classifier__n_estimators': 107, 'classifier__max_depth': 43, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__criterion': 'gini'}. Best is trial 2 with value: 0.916256157635468.


Trial 2 achieved value: 0.916256157635468 with  1.0753% improvement


[I 2024-03-26 13:34:22,794] Trial 3 finished with value: 0.9 and parameters: {'classifier__n_estimators': 84, 'classifier__max_depth': 38, 'classifier__min_samples_split': 13, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'log2', 'classifier__criterion': 'gini'}. Best is trial 2 with value: 0.916256157635468.
[I 2024-03-26 13:34:25,248] Trial 4 finished with value: 0.9108910891089109 and parameters: {'classifier__n_estimators': 159, 'classifier__max_depth': 82, 'classifier__min_samples_split': 8, 'classifier__min_samples_leaf': 2, 'classifier__max_features': None, 'classifier__criterion': 'gini'}. Best is trial 2 with value: 0.916256157635468.
[I 2024-03-26 13:34:25,644] Trial 5 finished with value: 0.9 and parameters: {'classifier__n_estimators': 121, 'classifier__max_depth': 68, 'classifier__min_samples_split': 18, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__criterion': 'entropy'}. Best is trial 2 with value: 0.916256157635468.

BEST PARAMS FROM main():  {'classifier__n_estimators': 107, 'classifier__max_depth': 43, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_features': None, 'classifier__criterion': 'gini'}
