> ### **MLflow-laboratory**
> 
> 
> #### **a. Choose a model**
> All scikit-learn models are available with one single import at `project_root/lib/models/sklearn.py`
> ```py
> # Import a model with:
> from lib.models.sklearn import RandomForestClassifier
> rfc = RandomForestClassifier()
>
> # Or import all classes in a dict at once with:
> from lib.models.sklearn import SKLEARN_CLASSIFIERS
> # ... and instantiate them with:
> rfc = SKLEARN_CLASSIFIERS['RandomForestClassifier'](bootstrap=True)
> svc = SKLEARN_CLASSIFIERS['SVC']()
> ```
> #### **b. Select hyperparameters**
> Automatic hyperparameter tuning is performed with the Optuna library.
> More information is available at [https://optuna.readthedocs.io/](https://optuna.readthedocs.io/en/stable/tutorial/index.html).  
> 
> **WIP** A hyperparameter bank is being built; see `project_root/lib/hp/...`
> The goal is to store, for each hyperparameter, its name and either a range (if it is numerical) or a list of choices.  
> Here's an example:
> ```py
> # Hyperparameters for RandomForestClassifier
> RFC_SPACE = {
>     'classifier__n_estimators': (20, 200),
>     'classifier__max_depth': (10, 100),
>     'classifier__min_samples_split': (2, 20),
>     'classifier__min_samples_leaf': (1, 2),
>     'classifier__max_features': ['sqrt', 'log2', None],
>     'classifier__criterion': ['gini', 'entropy'],
> }
> ```
> The hyperparameter naming convention is `XXXC_SPACE` (e.g. `KNNC_SPACE`) for classification and `XXXR_SPACE` (e.g. `RFR_SPACE`) for regression.  
> 
>  Load hyperparameters:
> ```bash
> # Enable custom magics
> %load_ext custom_magics
> # load a set of hyperparameters
> %load_variables ../lib/hp/sklearn.py RFC_SPACE
> ```
> #### **c. Retrieve scripts**
> 
> Paste and execute one of these commands in a code cell to retrieve an MLflow script:
> 
> **Classification**  
> - Binary: `%load ../scripts/binary_classification.py`
> 
> **Regression**  
> - #TODO



#### **Base config**

In [1]:
# CONFIG
# Add the parent directory to the module search path. The cells below import
# the `lib` and `laboratory` packages, which presumably live one level above
# this notebook's folder -- TODO confirm against the repository layout.
import sys
sys.path.append('..')

#### **Fill dataset info**

In [2]:
# Path of the CSV file; the script below reads it with pd.read_csv and uses
# its file name as the default experiment name.
DATASET_PATH = 'test_datasets/test_bin_class_loans.csv'
# Name of the target column; it is separated from the features and also used
# to stratify the train/test split.
TARGET_NAME = 'Loan_Status'
# Column label(s) passed to DataFrame.drop(columns=...) before splitting;
# None means no column is dropped.
FEATURES_TO_DROP = None

#### **Load the model**

In [3]:
# LOAD THE MODEL
# Look the estimator class up by name in the classifier dict and instantiate
# it; `clf` becomes the final 'classifier' step of the pipeline built below.
# NOTE(review): the intro section of this notebook spells this dict
# `SKLEARN_CLASSIFIERS`; confirm which spelling lib/models/sklearn.py exports.
from lib.models.sklearn import SKLEARN_CLSSIFIERS
clf = SKLEARN_CLSSIFIERS['RandomForestClassifier'](bootstrap=False)

#### **Load hyperparameters**

In [4]:
%load_ext custom_magics

The custom_magics module is not an IPython extension.


In [5]:
#%load_variables lib/hp/sklearn.py RFC_SPACE
# Search space for the RandomForestClassifier step of the pipeline.
# Keys use the `<step>__<param>` form expected by Pipeline.set_params.
# Tuples are (low, high) ranges; lists are categorical choices.
RFC_SPACE = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': (10, 100),
    'classifier__max_features': ['sqrt', 'log2', None],
    'classifier__min_samples_leaf': (1, 2),
    'classifier__min_samples_split': (2, 20),
    'classifier__n_estimators': (20, 200),
}


#### **Load the script**

In [6]:
# %load ../scripts/binary_classification.py
# file: binary_classification.py

# FIND REGULAR IMPORTS IN laboratory/config.py
# The star import is what brings the third-party names used further down
# (pd, np, mlflow, optuna, partial, train_test_split, Pipeline, ...) into
# scope -- presumably along with handle_warnings and get_environment.
from laboratory.config import *
from laboratory.mlflow import get_or_create_experiment, set_mlflow_tracking_uri_from_env

# Environment setup, run once at import time: configure warnings, read the
# environment, then point MLflow at the tracking server described by it.
# NOTE(review): exact behaviour of these helpers is defined in the
# `laboratory` package, which is not visible here.
handle_warnings()
env_vars = get_environment()
set_mlflow_tracking_uri_from_env(env_vars)

# Custom modules
import laboratory.dataset as dataset
# NOTE: this alias binds the name `sklearn` to the project module
# `laboratory.sklearn`, not to the scikit-learn package.
import laboratory.sklearn as sklearn
import laboratory.tuning as tuning
from laboratory.mlflow import get_run_name
from laboratory.artifacts import log_confusion_matrix, log_roc_curve
# Template placeholder: the classifier actually used is CLASSIFIER (set below).
from lib.models.sklearn import RandomForestClassifier # Placeholder

###############################################
#################### SETUP ####################
######## Fill experiment input here ########### 

# --- Dataset -----------------------------------------------------------------
# The self-assignments are template placeholders: in this notebook the values
# come from the "Fill dataset info" cell.
DATASET_PATH = DATASET_PATH  # Placeholder
TARGET_NAME = TARGET_NAME  # Placeholder
FEATURES_TO_DROP = FEATURES_TO_DROP  # Placeholder

DF = pd.read_csv(DATASET_PATH)

# Keyword arguments forwarded to train_test_split: 80/20 split, stratified on
# the target column, with a fixed seed.
DATASET_SPLIT_PARAMS = dict(
    test_size=0.2,
    stratify=DF[TARGET_NAME],
    random_state=42,
)

# --- Model and search space --------------------------------------------------
CLASSIFIER = clf  # Placeholder
SPACE = RFC_SPACE  # Placeholder
SAVE_MODEL = False

# --- Optuna ------------------------------------------------------------------
OPTUNA_STUDY_TRIALS = 20
OPTUNA_METRIC_TO_MAXIMIZE = 'f1_score'

# --- MLflow naming -----------------------------------------------------------
# The experiment is named after the dataset file (last path component).
EXPERIMENT_NAME = DATASET_PATH.rsplit('/', 1)[-1]
RUN_NAME = get_run_name(run_name=None)


###############################################
#################### MAIN #####################

if __name__ == '__main__':
    # Work on a copy so the module-level DF is left untouched between runs.
    df = DF.copy()

    if FEATURES_TO_DROP is not None:
        df = df.drop(columns=FEATURES_TO_DROP)

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=TARGET_NAME), df[TARGET_NAME], **DATASET_SPLIT_PARAMS
    )

    # Numeric columns are imputed and scaled; every other column is treated
    # as categorical.
    num_features = X_train.select_dtypes([np.number]).columns.tolist()
    cat_features = X_train.columns.difference(num_features).tolist()

    PREPROCESSING_PIPELINE = ColumnTransformer(
        transformers=[
            ('numerical', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())
            ]), num_features),
            ('categorical', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # A category that only occurs in the test split would make the
                # default encoder raise at predict time; encode it as all
                # zeros instead.
                ('onehot', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_features)
        ]
    )

    # The step name 'classifier' must match the `classifier__` prefix used by
    # the keys of the hyperparameter space.
    CLASSIFICATION_PIPELINE = Pipeline(steps=[
        ('preprocessing', PREPROCESSING_PIPELINE),
        ('classifier', CLASSIFIER)
    ])

    experiment_id = get_or_create_experiment(EXPERIMENT_NAME)

    # Attach the run to the experiment resolved above. Previously
    # `experiment_id` was computed but never used, so the run landed in
    # whichever experiment happened to be active.
    with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME):
        # Create an Optuna study
        study = optuna.create_study(direction='maximize')

        # Optimize the objective function
        study.optimize(
            partial(
                tuning.objective_function,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                pipeline=CLASSIFICATION_PIPELINE,
                param_space=SPACE,
                metric_to_maximize=OPTUNA_METRIC_TO_MAXIMIZE
            ),
            n_trials=OPTUNA_STUDY_TRIALS,
            callbacks=[tuning.champion_callback]
        )

        best_params = study.best_params
        print("BEST PARAMS FROM main(): ", best_params)

        # Refit the pipeline on the training split with the best parameters
        # and evaluate it on the held-out split.
        CLASSIFICATION_PIPELINE.set_params(**best_params)
        CLASSIFICATION_PIPELINE.fit(X_train, y_train)
        y_pred = CLASSIFICATION_PIPELINE.predict(X_test)
        y_pred_proba = CLASSIFICATION_PIPELINE.predict_proba(X_test)

        metrics = tuning.get_classification_metrics(y_test, y_pred, y_pred_proba)
        log_confusion_matrix(y_test, y_pred)
        log_roc_curve(y_test, y_pred_proba)

        mlflow.log_params(best_params)
        mlflow.log_metrics(metrics)
        if SAVE_MODEL:
            # Log the fitted pipeline. The name `sklearn` in this script is
            # the project module `laboratory.sklearn`, not a model, so it
            # must not be passed here.
            mlflow.sklearn.log_model(CLASSIFICATION_PIPELINE, "best_model")

[I 2024-03-28 09:43:17,764] A new study created in memory with name: no-name-bedf1bf0-31a3-4ebc-a7dc-d2349606f719


/home/mln/GIT/mlflow_laboratory/notebooks
['custom_magics.py', 'backup', '__pycache__', 'TEMPLATE copy.ipynb', 'test_datasets', 'TEMPLATE.ipynb']
MINIO_ENDPOINT_URL: set
MINIO_ACCESS_KEY: set
MINIO_SECRET_KEY: set
MLFLOW_TRACKING_URI: http://localhost:5000
mlflow tracking URI has been set to  http://localhost:5000


[I 2024-03-28 09:43:17,887] Trial 0 finished with value: 0.8764044943820225 and parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 83, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 4, 'classifier__n_estimators': 107}. Best is trial 0 with value: 0.8764044943820225.
[I 2024-03-28 09:43:17,980] Trial 1 finished with value: 0.9060773480662984 and parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 40, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 14, 'classifier__n_estimators': 92}. Best is trial 1 with value: 0.9060773480662984.


Initial trial 0 achieved value: 0.8764044943820225
Trial 1 achieved value: 0.9060773480662984 with  3.2749% improvement


[I 2024-03-28 09:43:18,132] Trial 2 finished with value: 0.8852459016393442 and parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 56, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 18, 'classifier__n_estimators': 184}. Best is trial 1 with value: 0.9060773480662984.
[I 2024-03-28 09:43:18,396] Trial 3 finished with value: 0.7924528301886793 and parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 33, 'classifier__max_features': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 158}. Best is trial 1 with value: 0.9060773480662984.
[I 2024-03-28 09:43:18,540] Trial 4 finished with value: 0.8901098901098901 and parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 60, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 146}. Best is trial

Trial 13 achieved value: 0.907103825136612 with  0.1132% improvement


[I 2024-03-28 09:43:20,028] Trial 16 finished with value: 0.88268156424581 and parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 44, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 13, 'classifier__n_estimators': 89}. Best is trial 13 with value: 0.907103825136612.
[I 2024-03-28 09:43:20,163] Trial 17 finished with value: 0.9010989010989011 and parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 20, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 16, 'classifier__n_estimators': 133}. Best is trial 13 with value: 0.907103825136612.
[I 2024-03-28 09:43:20,282] Trial 18 finished with value: 0.8024691358024691 and parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 67, 'classifier__max_features': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 12, 'classifier__n_estimators': 52}. Best is trial

BEST PARAMS FROM main():  {'classifier__criterion': 'entropy', 'classifier__max_depth': 11, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 14, 'classifier__n_estimators': 82}
