In [1]:
import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType

import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import catboost as cb

In [2]:
import sys
import os.path
# Put the parent of the current working directory on the import path (once)
# so that the `data_config_search` package imported below can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from data_config_search.data_transform_utils import fill_missing_values, assign_attr_types, add_NA_flags, one_hot_encode
from data_config_search.data_transform_utils import add_logic_features, add_poly, add_spline, add_loading_features, add_group_features
from data_config_search.data_transform_utils import scale, feature_selection

## Get run parameters

In [39]:
client = MlflowClient(tracking_uri="sqlite:///../../mlflow_data/mlflow.db")

# Run whose data configuration is replicated below.
# Set to None to instead pick the best-scoring active run of experiment '2'.
PINNED_RUN_ID = 'b9a1484b322e48cf8fcbc070346b41db'

if PINNED_RUN_ID is not None:
    # Explicit run id: no search query is issued at all.
    run = client.get_run(PINNED_RUN_ID)
else:
    candidate_runs = client.search_runs(
        experiment_ids=['2'],
        filter_string="metrics.auroc > 59.0",
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.auroc DESC"],
    )
    # Fail with a readable message instead of an IndexError on an empty result.
    if not candidate_runs:
        raise LookupError('No active run in experiment 2 with metrics.auroc > 59.0')
    run = candidate_runs[0]

In [40]:
def str_to_bool(x:str):
    """Convert the literal strings 'True' / 'False' to the matching bool.

    Args:
        x: Text to convert; only the exact spellings 'False' and 'True'
            are accepted.

    Returns:
        False for 'False', True for 'True'.

    Raises:
        ValueError: If `x` is anything other than those two literals.
    """
    for literal, value in (('False', False), ('True', True)):
        if x == literal:
            return value
    raise ValueError('Only "True" and "False" allowed')

In [41]:
# Read all parameters
# The values in run.data.params are parsed back from strings here (int / str_to_bool)
# before they are handed to the transformation helpers.
run_params = run.data.params

# 'seed' is a stringified list such as "[1, 2, 3]": strip the brackets, drop commas, split.
seeds = [int(x) for x in run_params['seed'][1:-1].replace(',', '').split()]

# Fix: 'by_product_code' used to be passed as the raw string, so 'False' was truthy.
# NOTE(review): assumes fill_missing_values expects a real bool here, like the
# 'use' flags of the other helpers - confirm against data_transform_utils.
imputer_params = {
    'method': run_params['imputer_method'],
    'by_product_code': str_to_bool(run_params['imputer_by_prod_code']),
}
attr2_type = run_params['attr2_type']
attr3_type = run_params['attr3_type']
group_dict = {'use': str_to_bool(run_params['group_features_used'])}
poly_dict = {
    'use': str_to_bool(run_params['poly_features_used']),
    'degree': int(run_params['poly_degree']),
}
spline_dict = {
    'use': str_to_bool(run_params['spline_params_used']),
    'n_knots': int(run_params['spline_n_knots']),
    'degree': int(run_params['spline_degree']),
}
logic_dict = {'use': str_to_bool(run_params['logic_features_used'])}
loading_dict = {'use': str_to_bool(run_params['loading_features_used'])}
scaling_dict = {
    'use': str_to_bool(run_params['scaling_used']),
    'method': run_params['scaling_method'],
}
# Not read from the run: the constant-feature filter is fixed in this notebook.
fselection_steps = {'Constant_Features': {'frac_constant_values': 0.99}}
stepwise = str_to_bool(run_params['fs_stepwise_used'])
# Kept as the logged string; it is only printed next to the rerun's feature count.
n_features_found = run_params['n_features_used']

## Replicate dataset from run

In [42]:
train = pd.read_csv('../../data/train.csv', index_col='id')

# Transformation steps in the exact order they must be applied; each entry is
# (helper, keyword arguments) and receives the frame produced by the previous step.
pipeline_steps = [
    (add_NA_flags, {'cols': ['measurement_3', 'measurement_5']}),
    (fill_missing_values, {'params': imputer_params, 'extra_params': {'seed':seeds[0], 'n_knn':3}}),
    (assign_attr_types, {'attr_types': {'attribute_2':attr2_type, 'attribute_3':attr3_type}}),
    (add_group_features, {'group_dict': group_dict}),
    (one_hot_encode, {}),
    (add_poly, {'poly_dict': poly_dict}),
    (add_spline, {'spline_dict': spline_dict}),
    (add_logic_features, {'logic_dict': logic_dict}),
    (add_loading_features, {'loading_dict': loading_dict}),
    (scale, {'scale_dict': scaling_dict}),
    (feature_selection, {'y': train['failure'], 'steps': fselection_steps, 'stepwise': stepwise, 'seed': seeds[0]}),
]

# Start from the features only (target removed) and run every step in turn.
data = train.drop(columns=['failure'])
for step_fn, step_kwargs in pipeline_steps:
    data = data.pipe(step_fn, **step_kwargs)

# Sanity check: the rerun should end up with the feature count logged by the original run.
print(f'N features from data_config_search: {n_features_found}')
print(f'N features from rerun: {data.shape[1]}')

## Train baseline model as benchmark

In [46]:
model_seeds = [4,66,128]
auroc_dict = {'auroc_fold_A': 0, 'auroc_fold_B': 0, 'auroc_fold_C': 0, 'auroc_fold_D': 0, 'auroc_fold_E': 0}
accuracy_list = []

# The splitter and its inputs do not depend on the model seed, so build them once.
# Use 5-fold split, grouped by product code
kfold = GroupKFold(n_splits=5)
X = data
y = train['failure']
groups = train['product_code']

# NOTE(review): per the scikit-learn docs, LogisticRegression.random_state only affects
# the 'sag', 'saga' and 'liblinear' solvers; with the default solver the three seeds
# presumably give identical fits - confirm whether the seed loop is still needed.
for model_seed in model_seeds:
    for train_idx, val_idx in kfold.split(X, y, groups):

        # split() yields positional indices, so select rows with .iloc; .loc would only
        # be correct if the 'id' index happened to be exactly 0..n-1.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # The first product code found in the validation rows names the fold;
        # the column itself is not used as a model feature.
        val_prod_code = X_val['product_code'].unique()[0]
        X_train = X_train.drop(columns=['product_code'])
        X_val = X_val.drop(columns=['product_code'])

        model_ = LogisticRegression(random_state=model_seed, max_iter=200)
        model_.fit(X_train, y_train)
        y_pred = model_.predict_proba(X_val)[:,1]

        # AUROC is summed per fold here and averaged over the seeds below.
        auroc_dict[f'auroc_fold_{val_prod_code}'] += roc_auc_score(y_val, y_pred)
        accuracy_list.append(np.mean((y_pred >= 0.5).astype('int') == y_val))

# Average over seeds, then report everything in percent.
auroc_dict = {k:v/len(model_seeds) for k,v in auroc_dict.items()}
acc_ = np.mean(accuracy_list)*100
auroc_ = np.mean(list(auroc_dict.values()))*100
auroc_dict_ = {k:v*100 for k,v in auroc_dict.items()}

print(f'Accuracy: {acc_}')
print(f'AUROC: {auroc_}')
print(f'AUROC dict: {auroc_dict_}')

Accuracy: 78.72083259424164
AUROC: 59.27206188927491
AUROC dict: {'auroc_fold_A': 59.478778768544394, 'auroc_fold_B': 59.42314555002655, 'auroc_fold_C': 58.79238579595665, 'auroc_fold_D': 60.05598021582733, 'auroc_fold_E': 58.61001911601959}


## Tune model

In [47]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

import warnings
from sklearn.exceptions import ConvergenceWarning
# Ignore scikit-learn's ConvergenceWarning from here on.
# NOTE(review): presumably added to keep the output of the tuning cells readable when a
# LogisticRegression fit stops at max_iter - confirm the non-converged fits are acceptable.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [48]:
# Point the MLflow fluent API at the local SQLite store and select the
# experiment under which the tuning runs below are started.
TRACKING_URI = "sqlite:///../../mlflow_data/mlflow.db"
EXPERIMENT_NAME = "Kaggle-TPSAUG2022-model-config"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/3', experiment_id='3', lifecycle_stage='active', name='Kaggle-TPSAUG2022-model-config', tags={}>

In [49]:
def objective(params):
    """Hyperopt objective: grouped-CV AUROC of a logistic regression, logged to MLflow.

    For every seed in the notebook-level `model_seeds`, a LogisticRegression built
    from `params` is fitted on each GroupKFold split of the notebook-level `data`
    (grouped by `train['product_code']`) and scored with ROC AUC on the held-out
    rows. Scores are averaged over seeds per fold and then over folds.

    Args:
        params: Keyword arguments forwarded to `LogisticRegression` (one sample
            from the hyperopt search space).

    Returns:
        dict with 'loss' (the negative mean AUROC in percent, so that minimising
        the loss maximises AUROC) and 'status' (STATUS_OK).

    Side effects:
        Starts one MLflow run and logs the tag 'iteration', the id of the source
        run as param 'run_id', `params`, the mean AUROC and the per-fold AUROCs.
    """
    auroc_dict = {'auroc_fold_A': 0, 'auroc_fold_B': 0, 'auroc_fold_C': 0, 'auroc_fold_D': 0, 'auroc_fold_E': 0}

    # The splitter and its inputs do not depend on the model seed, so build them once.
    kfold = GroupKFold(n_splits=5)
    X = data
    y = train['failure']
    groups = train['product_code']

    with mlflow.start_run():
        for model_seed in model_seeds:
            for train_idx, val_idx in kfold.split(X, y, groups):

                # split() yields positional indices, so select rows with .iloc; .loc would
                # only be correct if the frame's index happened to be exactly 0..n-1.
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                # The first product code found in the validation rows names the fold;
                # the column itself is not used as a model feature.
                val_prod_code = X_val['product_code'].unique()[0]
                X_train = X_train.drop(columns=['product_code'])
                X_val = X_val.drop(columns=['product_code'])

                model_ = LogisticRegression(**params, random_state=model_seed)
                model_.fit(X_train, y_train)
                y_pred = model_.predict_proba(X_val)[:,1]

                # Summed per fold here, averaged over the seeds below.
                auroc_dict[f'auroc_fold_{val_prod_code}'] += roc_auc_score(y_val, y_pred)

        # Average over seeds, then express everything in percent.
        auroc_dict = {k:v/len(model_seeds) for k,v in auroc_dict.items()}
        auroc_ = np.mean(list(auroc_dict.values()))*100
        auroc_dict_ = {k:v*100 for k,v in auroc_dict.items()}

        mlflow.set_tag('iteration', '5')
        # Link this tuning run back to the run whose data configuration was replicated.
        mlflow.log_param('run_id', run.info.run_id)
        mlflow.log_params(params)
        mlflow.log_metric('auroc', auroc_)
        mlflow.log_metrics(auroc_dict_)

    return {'loss': -auroc_, 'status': STATUS_OK}

In [50]:
search_space = {
    # quniform samples floats; scope.int casts max_iter to int before objective() sees it.
    'max_iter': scope.int(hp.quniform('max_iter', 200, 500, 1)),
    'C': hp.quniform('C', 0.01, 2, 0.01)
}

# Make the search reproducible: when no rstate is passed, hyperopt's fmin seeds its
# random generator from this environment variable.
os.environ['HYPEROPT_FMIN_SEED'] = '42'

# Keep a handle on the trial history so individual evaluations can be inspected afterwards.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials
)

100%|██████████| 200/200 [01:05<00:00,  3.07trial/s, best loss: -59.27941022249696]


In [51]:
# Best point returned by fmin, keyed by search-space label. The recorded output shows
# max_iter as a float (427.0), i.e. without the scope.int cast from the search space;
# cast it with int() before passing these values to LogisticRegression.
best_result

{'C': 0.02, 'max_iter': 427.0}