# Import Libs

In [None]:
import pandas as pd
import numpy as np
import mlflow
import importlib
from import_data import extract_data
import preprocessors as pp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (train_test_split,
                                     cross_val_score)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (MinMaxScaler)
from sklearn.metrics import (classification_report,
                             confusion_matrix, 
                             precision_score,
                             recall_score,
                             f1_score,
                             auc, 
                             roc_auc_score, 
                             ConfusionMatrixDisplay, 
                             RocCurveDisplay,
                             make_scorer)
from sklearn.feature_selection import RFECV, SelectFromModel
from sklearn.decomposition import PCA

from feature_engine.imputation import (MeanMedianImputer,
                                       ArbitraryNumberImputer, 
                                       CategoricalImputer)
from feature_engine.selection import (DropFeatures, DropDuplicateFeatures, DropConstantFeatures, RecursiveFeatureAddition, RecursiveFeatureElimination)
from feature_engine.encoding import (OneHotEncoder, OrdinalEncoder)
from feature_engine.creation import RelativeFeatures
from feature_engine.discretisation import EqualWidthDiscretiser, EqualFrequencyDiscretiser
from feature_engine.outliers import Winsorizer
from feature_engine.transformation import YeoJohnsonTransformer

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
import shap

import optuna

import mlflow
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids, NearMiss

# Reload the local `preprocessors` module (imported above as `pp`) so that edits
# made to it are picked up without restarting the kernel.
importlib.reload(pp)

Initiate mlflow tracking 

In [None]:
# Point MLflow at the local tracking directory and select the experiment that
# all runs in this notebook are logged under.
mlflow.set_tracking_uri("/home/user/Projects/mlflow/mlruns")
mlflow.set_experiment('experiment_name')
# Last expression of the cell: echoes the active tracking URI as a sanity check.
mlflow.get_tracking_uri()

Import the dataset

In [None]:
# Base name (without extension) of the feature file stored under data/.
filename = 'mrt_project_features'
# Load the whole feature table; no dtype or date parsing is requested here,
# which is why date-like columns are detected by name further below.
df = pd.read_csv(f'data/{filename}.csv')

# EDA before transform

Check data by months

In [None]:
# Number of non-null CustomerId values per Load_month, newest month first.
df.groupby(by='Load_month')['CustomerId'].count().sort_index(ascending=False)

Check column dtypes

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Bucket the column names by dtype so later checks can target one kind at a time:
# object (text), float64/int64 numbers, booleans and parsed datetimes.
obj_cols_lst = list(df.select_dtypes(include='O').columns)
num_cols_lst = list(df.select_dtypes(include=['float64', 'int64']).columns)
bool_cols_lst = list(df.select_dtypes(include=['bool']).columns)
dt_cols_lst = list(df.select_dtypes(include=['datetime64[ns]']).columns)

In [None]:
# Find date columns that were imported as object (text) columns.
# A column qualifies when its lower-cased name contains any of these tokens.
dt_lst = ['date', 'month', 'today']

# `any` records each column once even when its name contains several tokens
# (e.g. both "date" and "month"). The previous nested loop appended such a
# column once per matching token, so later checks processed it repeatedly.
dt_add_lst = [
    col for col in obj_cols_lst
    if any(dt in col.lower() for dt in dt_lst)
]

# `[] + ...` builds a new list, so `dt_cols_lst` does not alias `dt_add_lst`;
# it replaces the dtype-based `dt_cols_lst` built earlier.
dt_cols_lst = [] + dt_add_lst

Check data leakage by dates

In [None]:
# Check whether any collected date feature is later than the load month
# (this section looks for data leakage through dates).

# Parse the reference column once: it is the same for every candidate column.
load_month_dates = pd.to_datetime(df['Load_month']).dt.date

check_dt_lst = []
for col in dt_cols_lst:
    # Compare at day resolution, so times of day are ignored.
    col_dates = pd.to_datetime(df[col]).dt.date
    if (col_dates > load_month_dates).any():
        check_dt_lst.append(col)

In [None]:
# For every flagged column, print how many rows lie before and after the load month.
# NOTE: this comparison uses full timestamps, whereas the flagging step above
# compares calendar dates only.

# Parse the reference column once instead of twice per flagged column.
load_month_ts = pd.to_datetime(df['Load_month'])

for col in check_dt_lst:
    col_ts = pd.to_datetime(df[col])
    print(col)
    # Summing a boolean mask counts the matching rows.
    print('less than load date:', int((col_ts < load_month_ts).sum()))
    print('more than load date:', int((col_ts > load_month_ts).sum()))
    print('')

Check NaN

In [None]:
# Share of missing values per numeric column, highest first.
df[num_cols_lst].isnull().mean().sort_values(ascending=False)

# Split Data

In [None]:
# Target columns that must never be used as features.
# NOTE(review): 'target2' appears twice - possibly a third target was meant; confirm.
drop_target_lst = ['target1','target2','target2']

target = 'target1'
test_size1 = 0.1   # share of all rows held out of training (the "dev" part)
test_size2 = 0.3   # share of the dev part that becomes the validation set

# Split 1: train vs. dev. Targets and identifier columns are removed from the features.
X_train, X_dev, y_train, y_dev = train_test_split(
    df.drop(columns=[*drop_target_lst, 'CustomerId', 'AccountId']),
    df[target],
    random_state=32,
    test_size=test_size1,
)

# Split 2: the dev part is divided into test (1 - test_size2) and validation (test_size2).
X_test, X_val, y_test, y_val = train_test_split(
    X_dev, y_dev, random_state=32, test_size=test_size2
)

# The intermediate objects are no longer needed; release the references.
del X_dev, y_dev, df

# Drop features

Drop Constant features

In [None]:
# feature-engine's DropConstantFeatures is used to find constant / quasi-constant
# features with a threshold (tol) of 0.9.
# The transformer is only fitted: the names of the flagged columns are collected,
# nothing is removed from X_train in this cell.

drop_transformer = DropConstantFeatures(tol=0.9, missing_values='ignore')
# NaN is replaced by 0 on a temporary copy, so missing values count as the value 0 here.
drop_transformer.fit(X_train.fillna(0))
drop_cols_c = drop_transformer.features_to_drop_

Drop Duplicate features

In [None]:
# feature-engine's DropDuplicateFeatures is used to find duplicated features
# among the columns that are not already flagged as constant.

duplicate_transformer = DropDuplicateFeatures()
# `DataFrame.drop(..., inplace=True)` returns None, so the transformer used to be
# fitted on None. Dropping on a copy passes a real frame to `fit` and leaves
# X_train with the same columns as X_test / X_val.
duplicate_transformer.fit(X_train.drop(columns=drop_cols_c))
drop_cols_d = duplicate_transformer.features_to_drop_

# Config

In [None]:
# Most of the columns are hidden due to confidentiality concerns.
# Empty lists are shown to help understand which preprocessing steps are used;
# each list is consumed by the pipeline step noted next to it.

obj_to_dt_cols = []      # passed to pp.TextToDate

lower_obj_cols = []      # passed to pp.LowerText

left2_obj_cols = []      # passed to pp.Left2Text

onehot_obj_cols = []     # passed to OneHotEncoder

ordinal_obj_cols = []    # passed to OrdinalEncoder


# column -> {raw value -> number}; passed to pp.Mapper
mappings = {
    'Column1': {
        'attribute1':1,
        'attribute2' : 2
    },
}

# Columns divided by a single reference column (RelativeFeatures, func=['div']).
rel_to_amount = []       # divided by 'Amount_snaps'
rel_to_princbal = []     # divided by 'PrincipalBalance_snaps'
rel_to_disbterm = []     # divided by 'Disbursed_term_snaps'
rel_to_pmt = []          # NOTE(review): not referenced by the pipeline in this notebook

# Pairs of lists for RelativeFeatures: `<name>_lst` are the variables and
# `<name>_lst_all` the reference columns they are divided by.
Avg_mon_Transfer_Count_lst = []
Avg_mon_Transfer_Count_lst_all = []


Avg_mon_Transfer_Amount_lst = []
Avg_mon_Transfer_Amount_lst_all = []

request_count_lst = []
request_count_lst_all = []

offered_rate_lst = []
offered_rate_lst_all = []

disbursed_rate_lst = []
disbursed_rate_lst_all = []

# Reference date column; it is dropped by the 'drop_calc_date' pipeline step.
to_date = 'Load_month'

# Same list object as obj_to_dt_cols (an alias, not a copy); passed to pp.DateToDate.
date_cols = obj_to_dt_cols

# Pipeline Architecture

In [None]:
# Switches describing the current experiment. `model_name` selects the estimator
# built below; the remaining values only feed the MLflow run name / description.
model_name = 'lgb'
version = 'v6'
train_period = 'last12M'
sel_used = 'Yes'
smote_used = 'No'
hps_tuned = 'Yes'

# These parameters are saved as MLflow run details.

run_name = f'{target}_{model_name}_{version}_{train_period}_sel{sel_used}'
# Free-text description stored with the run (passed to mlflow.start_run in the RUN section).
run_descr = f"""Reitaration of features,
Target used in this run: {target} 
Model used in this run: {model_name},
version of the project: {version},
period used: {train_period},
test+dev size used: {test_size1},
feature selections was used: {sel_used},
HP were optimized: {hps_tuned},
Smote used: {smote_used}
Removed features - in_form_inq_workplace_wingslast,'in_form_actual_city_wings','in_form_actual_region_wings', 'fn_region_onboarding_wings'
Dropped constant features, 
"""
 
# Exact hyperparameter values are removed because of confidentiality:
# `x` / `X` below are placeholders and must be replaced by real values before running.

if model_name == 'lgb':
    # NOTE: several keys are LightGBM aliases of each other
    # (bagging_fraction/subsample, feature_fraction/colsample_bytree,
    # min_data_in_leaf/min_child_samples); only one value per pair takes effect.
    train_params = {
        'bagging_fraction': x,
        'bagging_freq': x,
        'colsample_bytree': x,
        'feature_fraction': x,
        'lambda_l1': x,
        'lambda_l2': x,
        'learning_rate': x,
        'max_depth': X,
        'min_child_samples': X,
        'min_data_in_leaf': X,
        'min_gain_to_split': x,
        'n_estimators': x,
        'num_leaves': x,
        'subsample': x
        }
    model = lgb.LGBMClassifier(**train_params)

elif model_name == 'catb':
    train_params = {'iterations': x,
                    'learning_rate': x,
                    'depth': x,
                    'l2_leaf_reg': x,
                    'bagging_temperature': x,
                    'random_strength': x,
                    'loss_function': x,
                    'eval_metric': x,
                    'random_seed': x}
    model = CatBoostClassifier(**train_params)

elif model_name == 'xgb':
    train_params = {
        'n_estimators': x,
        'learning_rate': x,
        'max_depth': x,
        'random_state': x
    }

    model = xgb.XGBClassifier(**train_params)

else:
    # Fail here with a clear message instead of leaving `model` undefined (or
    # stale from an earlier run) and failing later when it is added to the pipeline.
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected 'lgb', 'catb' or 'xgb'"
    )

# The sklearn pipeline is defined here.
# Steps that print statuses (pp.PrintStatus) are added to help with debugging during
# training; these steps are removed in inference.
# Mostly feature-engine's built-in transformers are used, plus custom transformers
# (the ones starting with `pp.`), whose exact behaviour lives in the preprocessors module.
# The order of the steps matters: later cells look steps up by name
# ('date_to_date', 'drop_calc_date') and by position from the end.

pipeline = Pipeline([
    ('pipeline_start', pp.PrintStatus('pipeline_initiation')),
    # --- text / date handling (custom transformers) ---
    ('text_to_date_calc', pp.TextToDate([to_date])),
    ('text_to_date_calc_status', pp.PrintStatus('text_to_date_calc')),
    ('text_to_date', pp.TextToDate(obj_to_dt_cols)),
    ('text_to_date_status', pp.PrintStatus('text_to_date')),
    # --- categorical cleaning and encoding ---
    ('impute_txt', CategoricalImputer(fill_value='NA')),
    ('impute_txt_status', pp.PrintStatus('impute_txt')),  
    ('lower_text', pp.LowerText(lower_obj_cols)),
    ('lower_text_status', pp.PrintStatus('lower_text')),
    ('left_2_text', pp.Left2Text(left2_obj_cols)),
    ('left_2_text_status', pp.PrintStatus('left_2_text')),
    ('get_dummies', OneHotEncoder(variables=onehot_obj_cols)),
    ('get_dummies_status', pp.PrintStatus('get_dummies')),
    ('ordinal_encoding', OrdinalEncoder(variables=ordinal_obj_cols, unseen='encode')),
    ('ordinal_encoding_status', pp.PrintStatus('ordinal_encoding')),
    ('mappers', pp.Mapper(variables=[key for key in mappings], mappings=mappings)),
    ('mappers_status', pp.PrintStatus('mappers')),
    # --- dates relative to the reference date, then numeric imputation ---
    ('date_to_date', pp.DateToDate(variables=date_cols, mapping=to_date)),
    ('date_to_date_status', pp.PrintStatus('date_to_date')),
    ('zero_imputer', ArbitraryNumberImputer(arbitrary_number=0)),
    ('zero_imputer_status', pp.PrintStatus('zero_imputer')),
    # the reference date column is not needed after this point
    ('drop_calc_date', DropFeatures(features_to_drop=[to_date])),
    ('drop_calc_date_status', pp.PrintStatus('drop_calc_date')),   
    # --- ratio features: variables divided by a reference column ---
    ('div_by_princbal', RelativeFeatures(variables=rel_to_princbal, reference=['PrincipalBalance_snaps'], func=['div'], drop_original=False)),   
    ('div_by_princbal_status', pp.PrintStatus('div_by_princbal')),  
    ('div_by_amount', RelativeFeatures(variables=rel_to_amount, reference=['Amount_snaps'], func=['div'], drop_original=False)),   
    ('div_by_amount_status', pp.PrintStatus('div_by_amount')),
    ('div_by_disbterm', RelativeFeatures(variables=rel_to_disbterm, reference=['Disbursed_term_snaps'], func=['div'], drop_original=False)),   
    ('div_by_disbterm_status', pp.PrintStatus('div_by_disbterm')), 
    # the following ratio steps drop the original columns (drop_original=True)
    ('div_by_transfercount', RelativeFeatures(variables=Avg_mon_Transfer_Count_lst, reference=Avg_mon_Transfer_Count_lst_all, func=['div'], drop_original=True, fill_value=0)),
    ('div_by_transfercount_status', pp.PrintStatus('div_by_transfercount')), 
    ('div_by_transferamount', RelativeFeatures(variables=Avg_mon_Transfer_Amount_lst, reference=Avg_mon_Transfer_Amount_lst_all, func=['div'], drop_original=True, fill_value=0)),
    ('div_by_transferamount_status', pp.PrintStatus('div_by_transferamount')), 
    ('div_by_request_count', RelativeFeatures(variables=request_count_lst, reference=request_count_lst_all, func=['div'], drop_original=True, fill_value=0)),
    ('div_by_request_count_status', pp.PrintStatus('div_by_request_count')), 
    ('div_by_offered_rate', RelativeFeatures(variables=offered_rate_lst, reference=offered_rate_lst_all, func=['div'], drop_original=True, fill_value=0)),
    ('div_by_offered_rate_status', pp.PrintStatus('div_by_offered_rate')), 
    ('div_by_disbursed_rate', RelativeFeatures(variables=disbursed_rate_lst, reference=disbursed_rate_lst_all, func=['div'], drop_original=True, fill_value=0)),
    ('div_by_disbursed_rate_status', pp.PrintStatus('div_by_disbursed_rate')), 
    # --- numeric conditioning: outlier capping, power transform, binning, scaling ---
    ('outlier_cap', Winsorizer(capping_method='gaussian')),
    ('outlier_cap_status', pp.PrintStatus('outlier_cap')), 
    ('yeojo_transform', YeoJohnsonTransformer() ),    
    ('yeojo_transform_status', pp.PrintStatus('yeojo_transform')), 
    ('discretiser', EqualFrequencyDiscretiser()),
    ('discretiser_status', pp.PrintStatus('discretiser')),  
    ('scaler', MinMaxScaler()),
    ('preprocessing', pp.PrintStatus('preprocessing'))
])

# The estimator selected above becomes the last step, named 'model'.
pipeline.steps.append(('model', model))

# EDA After transform

In [None]:
# Transform the training data only up to (and including) the 'date_to_date' step,
# so the EDA below works on the partially processed frame.
# NOTE: slicing a Pipeline reuses the same step objects, so this also fits those steps.
step_index = [name for name, _ in pipeline.steps].index('date_to_date')
eda_steps = pipeline[:step_index + 1]
X_train_for_eda = eda_steps.fit_transform(X_train, y_train)

Check Correlation with target variable

In [None]:
# Check for columns correlated with the target variable.
# Columns whose correlation is undefined (NaN) are shown as 0; sorted ascending.
X_train_for_eda.corrwith(y_train).fillna(0).sort_values()

Data Distribution

In [None]:
# Check the data distribution up to a chosen point of the pipeline.
import math  # needed for math.ceil below; it is not imported at the top of the notebook

# `X_train_for_eda_outlier_yj` was not defined anywhere in the notebook, so this cell
# raised NameError. Judging by its name it is presumably the training data after
# outlier capping and the Yeo-Johnson transform, so it is rebuilt here the same way
# `X_train_for_eda` is built above. NOTE(review): confirm this is the intended stage.
yj_step_index = [name for name, _ in pipeline.steps].index('yeojo_transform')
X_train_for_eda_outlier_yj = pipeline[:yj_step_index + 1].fit_transform(X_train, y_train)

data = X_train_for_eda_outlier_yj
columns_for_hist = data.columns.tolist()

# Grid with three histograms per row and as many rows as needed.
ncols = 3
nrows = math.ceil(len(columns_for_hist) / ncols)

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=[ncols * 5, nrows * 4])
for col, ax in zip(columns_for_hist, axes.flatten()):
    print(col)
    sns.histplot(data=data, x=col, ax=ax)

# RUN

In [None]:
# MLflow tracking is used for experiment tracking.
# The fitted pipeline, the classification reports, feature importances and metrics
# are logged, so past pipeline runs can be inspected through the MLflow UI.

with mlflow.start_run(run_name=run_name, description=run_descr) as run:
    if model_name == 'lgb':
        mlflow.lightgbm.autolog()

    ## fit and log model
    pipeline.fit(X_train, y_train)
    mlflow.sklearn.log_model(pipeline, "pipeline")

    ## predict labels (and positive-class probabilities for ROC AUC)
    y_pred = pipeline.predict(X_test)
    y_score = pipeline.predict_proba(X_test)[:, 1]
    y_pred_train = pipeline.predict(X_train)

    ## classification report for test set
    report = classification_report(y_test, y_pred)
    mlflow.log_text(report, 'test_classification_report.txt')
    print('Test Classification Report')
    print(report)

    ## classification report for train set
    report = classification_report(y_train, y_pred_train)
    mlflow.log_text(report, 'train_classification_report.txt')
    print('Train Classification Report')
    print(report)

    ## test precision / recall / f1 for label 1
    test_precision_label1 = precision_score(y_test, y_pred, pos_label=1)
    test_recall_label1 = recall_score(y_test, y_pred, pos_label=1)
    test_f1_label1 = f1_score(y_test, y_pred, pos_label=1)

    ## confusion matrix for test set
    cm = ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)
    mlflow.log_figure(cm.figure_, 'test_confusion_matrix.png')

    ## log feature importance
    # Feature names come from the 'discretiser' step, the last step that still
    # exposes column names. It is looked up by name rather than by position from
    # the end, so adding or removing a status step cannot silently shift it.
    feature_names = pipeline.named_steps['discretiser'].get_feature_names_out()
    feat_imp_df = pd.DataFrame(
        zip(feature_names, pipeline[-1].feature_importances_),
        columns=['Feature', 'Value'],
    ).sort_values('Value', ascending=False)
    fig, ax1 = plt.subplots(figsize=(8,15))
    sns.barplot(feat_imp_df.head(100), x='Value', y='Feature', ax=ax1)
    plt.title('Top 100 Features by Importance')
    plt.tight_layout()
    mlflow.log_figure(fig, 'Feature_Importances.png')

    ## roc auc score for test set
    ras = roc_auc_score(y_test, y_score)

    ## log all metrics
    metrics = {'test_roc_auc_score': ras,
               'test_precision_label1': test_precision_label1,
               'test_recall_label1': test_recall_label1,
               'test_f1_label1': test_f1_label1}

    mlflow.log_metrics(metrics)

    ## log metrics by months
    X_test_pred = X_test.copy()
    X_test_pred['actual_label'] = y_test
    X_test_pred['predicted_label'] = y_pred

    # One row per load month. The mask is named `month_mask` so the builtin
    # `filter` is no longer overwritten for the rest of the notebook.
    month_rows = []
    for mon in X_test_pred['Load_month'].unique():
        month_mask = X_test_pred['Load_month'] == mon
        y_test_mon = X_test_pred.loc[month_mask, 'actual_label']
        y_pred_mon = X_test_pred.loc[month_mask, 'predicted_label']

        month_rows.append({
            'month': mon,
            'total_customers': X_test_pred.loc[month_mask, 'Load_month'].count(),
            'y_actual_count': y_test_mon.sum(),
            'y_pred_count': y_pred_mon.sum(),
            'precision_label1': precision_score(y_test_mon, y_pred_mon, pos_label=1),
            'recall_label1': recall_score(y_test_mon, y_pred_mon, pos_label=1),
            'f1_label1': f1_score(y_test_mon, y_pred_mon, pos_label=1),
        })

    # Explicit column list keeps the frame well-formed even without any month.
    test_pred_mon_df = pd.DataFrame(
        month_rows,
        columns=['month', 'total_customers', 'y_actual_count', 'y_pred_count',
                 'precision_label1', 'recall_label1', 'f1_label1'],
    )

    del X_test_pred

    # Counts as bars on the left axis, label-1 metrics as lines on the right axis.
    fig, ax1 = plt.subplots(figsize=(12,8))

    test_pred_mon_df_melt = pd.melt(test_pred_mon_df, id_vars='month').sort_values('month')
    count_rows = test_pred_mon_df_melt['variable'].isin(['total_customers', 'y_actual_count', 'y_pred_count'])
    score_rows = test_pred_mon_df_melt['variable'].isin(['precision_label1', 'recall_label1', 'f1_label1'])
    sns.barplot(data=test_pred_mon_df_melt.loc[count_rows],
                x='month', y='value', hue='variable', ax=ax1)
    ax2 = ax1.twinx()
    sns.lineplot(data=test_pred_mon_df_melt.loc[score_rows],
                 x='month', y='value', hue='variable', marker='o', ax=ax2)

    mlflow.log_figure(fig, 'metrics_by_months.png')

# The `with` block already ended the run; this call is kept as a harmless safeguard.
mlflow.end_run()

# SHAP

SHAP technique for feature importance

In [None]:
# Push the training data through every fitted step except the final model,
# then explain the fitted model on that transformed data.
X_train_transformed = pipeline[:-1].transform(X_train)

tree_explainer = shap.TreeExplainer(pipeline.named_steps['model'])
shap_values = tree_explainer.shap_values(X_train_transformed)
shap.summary_plot(shap_values, X_train_transformed)

# Stages for tweaking previous steps

Some stages repeat steps from above, sometimes several times.
Not everything is rerun from this stage; what is rerun depends on the changes made in the steps above and on their results.

# Feature Selection

sklearn SelectFromModel

In [None]:
# The feature selection technique SelectFromModel of sklearn is used here.
# This is a faster, less greedy approach,
# but some other techniques have better results.

# The estimator is the whole pipeline, which has no `feature_importances_` of its
# own, so SelectFromModel must be told where to read them: from the final step,
# which is named 'model'. Without `importance_getter` the selector raises when the
# importances are requested.
sfm_selector = SelectFromModel(
    estimator=pipeline,
    threshold='median',
    importance_getter='named_steps.model.feature_importances_',
)
sfm_selector.fit(X_train, y_train)

In [None]:
# The support mask has one entry per feature seen by the model, i.e. per column
# leaving the preprocessing steps. Names are therefore read from the 'discretiser'
# step (the same source the feature-importance plot uses) instead of
# 'drop_calc_date', after which the ratio steps still add and drop columns.
data_columns = pipeline.named_steps['discretiser'].get_feature_names_out()
sel_features_bool = sfm_selector.get_support()

# zip() would silently truncate on a length mismatch and mislabel the features.
if len(data_columns) != len(sel_features_bool):
    raise ValueError(
        f'{len(data_columns)} feature names but {len(sel_features_bool)} support flags'
    )

sel_features = [col for col, include in zip(data_columns, sel_features_bool) if include]

Feature engine RFA

In [None]:
# The main disadvantage of the method above is that it removes features based on
# their importance. That is why feature-engine's Recursive Feature Addition /
# Elimination is used: it can be aimed at the target metric, and features are
# added or removed depending on how much they move that metric.

# First, a custom metric is defined that suits the task at hand.
def custom_score(estimator, X, y):
    """Score a fitted estimator by the F1 of label 1.

    Follows the scorer signature ``(estimator, X, y)``. Besides returning the
    score it prints the score and the number of columns in ``X``, which makes
    the progress of the feature search visible.
    """
    predicted = estimator.predict(X)
    label1_f1 = f1_score(y, predicted, pos_label=1)
    print('label 1 f1 score:', label1_f1)
    print(X.shape[1])
    return label1_f1

In [None]:
# Recursive feature addition with cross-validation.
# The threshold is set just above zero, to keep almost every feature that has a
# positive effect on the target metric.

# The model from the pipeline above is reused as the estimator.
model = pipeline.named_steps['model']

# Both methods - addition and elimination - were tried.
# RFE had a slightly better effect on the target metric, but RFA is chosen for its speed.
rfa = RecursiveFeatureAddition(
    estimator=model,
    scoring=custom_score,
    cv=2,
    threshold=1e-10,
)
rfa.fit_transform(X_train_transformed, y_train)

# Elimination variant, kept for reference:
# rfe = RecursiveFeatureElimination(estimator=model, cv=2, scoring=custom_score, threshold=0.00001)
# rfe.fit_transform(X_train_transformed, y_train)

In [None]:
# Names of the features RFA kept (everything it saw minus what it decided to drop).
rfa_sel_cols_lst = [
    name for name in rfa.feature_names_in_
    if name not in rfa.features_to_drop_
]
# Strip the first character of each name and read the rest as an integer position;
# this assumes names of the form "x<position>" - TODO confirm against rfa.feature_names_in_.
rfa_sel_cols = [int(name[1:]) for name in rfa_sel_cols_lst]

The features that were not selected are then dropped at the data collection step, where possible

# Resampling

In [None]:
# Run the data through every step before the model once, so the transformation
# steps do not have to be repeated in the following cells.
preprocessor = pipeline[:-1]

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_val_transformed = preprocessor.transform(X_val)

Resampling of the data

In [None]:
# Several resampling techniques were tried; the one that gave better results and
# ran faster is kept. Only the columns selected by RFA are resampled.

smote = SMOTE(sampling_strategy=0.5, k_neighbors=20)
X_transformed_resampled, y_resampled = smote.fit_resample(
    X_train_transformed[:, rfa_sel_cols], y_train
)
# Alternatives that were tried:
# X_transformed_resampled, y_resampled = SMOTEENN().fit_resample(X_train_transformed[:,rfa_sel_cols], y_train)
# X_transformed_resampled, y_resampled = ClusterCentroids(random_state=0).fit_resample(X_train_transformed, y_train)
# X_transformed_resampled, y_resampled = NearMiss(version=3).fit_resample(X_train_transformed, y_train)

Modeling with resampled data

In [None]:
from sklearn.base import clone  # local import: only this cell needs it

# Fit an unfitted copy of the pipeline's model on the resampled data. Previously
# the fit was commented out, so the reports below described a model that had never
# seen the resampled data. A clone is used so the fitted pipeline stays intact.
resampled_model = clone(pipeline[-1])
resampled_model.fit(X_transformed_resampled, y_resampled)

# The resampled training data holds only the RFA-selected columns, so the
# validation data must be restricted to the same columns before predicting.
y_pred = resampled_model.predict(X_val_transformed[:, rfa_sel_cols])
y_pred_train = resampled_model.predict(X_transformed_resampled)

## classification report for the validation set (printed under the original 'Test' heading)
report = classification_report(y_val, y_pred)
print('Test Classification Report')
print(report)

## classification report for the resampled train set
report = classification_report(y_resampled, y_pred_train)
print('Train Classification Report')
print(report)

# Hyperparameter Tuning

Optuna Lightgbm Tuning

In [None]:
# Code for hyperparameter tuning.
# Several other methods - grid search, random search - were used but are not shown here.
# Optuna is used because it tunes the parameters by optimizing the target metric.
# All trials are also tracked with MLflow.


def _label1_scores(y_true, y_hat, prefix):
    """Return precision, recall and F1 of label 1, keyed as '<prefix>_<metric>_score'."""
    return {
        f'{prefix}_precision_score': precision_score(y_true, y_hat, pos_label=1),
        f'{prefix}_recall_score': recall_score(y_true, y_hat, pos_label=1),
        f'{prefix}_f1_score': f1_score(y_true, y_hat, pos_label=1),
    }


def objective(trial):
    """Train one LightGBM candidate and return its validation F1 for label 1.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    RFA-selected columns of the transformed training data, logged to a nested
    MLflow run together with its train / validation metrics, and scored on the
    validation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        F1 score of label 1 on the validation set (the value Optuna maximizes).
    """
    # NOTE: several keys are LightGBM aliases of each other (subsample /
    # bagging_fraction, colsample_bytree / feature_fraction, min_data_in_leaf /
    # min_child_samples), so only one value of each pair takes effect. They are
    # kept because the sampled names end up in `study.best_trial.params`.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "max_depth": trial.suggest_int("max_depth", -1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "min_gain_to_split": trial.suggest_float('min_gain_to_split', 0.01, 0.1),
        # suggest_loguniform / suggest_uniform are deprecated in Optuna;
        # suggest_float (with log=True where needed) samples the same ranges.
        "lambda_l1": trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }  # this closing brace was missing, which made the whole cell a syntax error

    with mlflow.start_run(nested=True) as run:
        model = lgb.LGBMClassifier(**params, random_state=0)

        ## Option 1: original data
        # `verbose` is no longer accepted by LGBMClassifier.fit in LightGBM 4;
        # output is already silenced through "verbosity": -1 in the parameters.
        X_fit = X_train_transformed[:, rfa_sel_cols]
        y_fit = y_train
        ## Option 2: oversampled data
        # X_fit, y_fit = X_transformed_resampled, y_resampled
        model.fit(X_fit, y_fit)
        mlflow.lightgbm.log_model(lgb_model=model, artifact_path='model.pkl')

        val_predictions = model.predict(X_val_transformed[:, rfa_sel_cols])
        train_predictions = model.predict(X_fit)

        # Validation metrics first, then train metrics.
        metrics = {
            **_label1_scores(y_val, val_predictions, 'val'),
            **_label1_scores(y_fit, train_predictions, 'train'),
        }

        mlflow.log_metrics(metrics)

        return metrics['val_f1_score']

In [None]:
# All trials can be followed in the Optuna dashboard in real time, because the
# study is stored in a local SQLite database.
# NOTE(review): the study name carries a hard-coded trial suffix; it presumably has
# to be changed by hand for every new study - confirm.

study = optuna.create_study(
    study_name=f'{run_name}_trial18',
    storage="sqlite:///db.sqlite3",
    direction='maximize',
)

# Parent MLflow run; every trial opens its own nested run inside `objective`.
with mlflow.start_run(run_name=f'{run_name}_hp_optuna') as run:
    study.optimize(objective, n_trials=100)

    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)

In [None]:
# Hyperparameters sampled by the best trial of the study, shown as the cell output.
study.best_trial.params