In [1]:
import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType

import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
from data_config_search.data_transform_utils import fill_missing_values, assign_attr_types, add_NA_flags, one_hot_encode
from data_config_search.data_transform_utils import add_logic_features, add_poly, add_spline, add_loading_features, add_group_features
from data_config_search.data_transform_utils import scale, feature_selection

## Get run parameters

In [43]:
# Connect to the SQLite-backed MLflow tracking store kept next to this notebook.
client = MlflowClient(tracking_uri="sqlite:///../mlflow_data/mlflow.db")

# Look up the best-scoring active model run (highest logged `auroc`) among the
# runs whose `run_id` param points at the chosen data-config run.
model_runs = client.search_runs(
    experiment_ids=['3'],  # documented type is a list of experiment ids
    filter_string="metrics.auroc > 59.0 and parameters.run_id = 'b9a1484b322e48cf8fcbc070346b41db'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.auroc DESC"]
)

# Fail with an explicit message instead of an anonymous "list index out of
# range" when the filter matches nothing (wrong experiment, threshold, run id).
if not model_runs:
    raise IndexError(
        'No active MLflow run in experiment 3 matches the auroc / run_id filter'
    )
model_run = model_runs[0]

# To pin a specific model run instead of searching, fetch it directly with
# client.get_run(<model_run_id>).

# The model run's `run_id` param is used as the id of the data run to load.
data_run = client.get_run(model_run.data.params['run_id'])

In [44]:
def str_to_bool(x: str) -> bool:
    """Convert the exact strings 'True' / 'False' to the matching bool.

    Args:
        x: Value to convert; only 'False' and 'True' are accepted.

    Returns:
        False for 'False', True for 'True'.

    Raises:
        ValueError: If ``x`` is anything else (the comparison is exact, so
            different casing or surrounding whitespace is rejected).
    """
    if x == 'False':
        return False
    if x == 'True':
        return True
    raise ValueError('Only "True" and "False" allowed')

In [45]:
# Read data parameters
# The values read from `data_run.data.params` are converted below with
# int() / float() / str_to_bool() before being handed to the pipeline helpers.
data_params = data_run.data.params

# The seed list is stored as a bracketed string such as "[1, 2, 3]".
# Commas are replaced by spaces (not removed) so that a value logged without
# whitespace, e.g. "[1,2,3]", is not glued together into the single seed 123.
seeds = [int(x) for x in data_params['seed'][1:-1].replace(',', ' ').split()]

# NOTE(review): 'by_product_code' is forwarded as the raw logged string, unlike
# the other flags below which go through str_to_bool. If the imputer helper
# tests it for truthiness, the string 'False' would count as True - confirm
# against fill_missing_values.
imputer_params = {
    'method': data_params['imputer_method'],
    'by_product_code': data_params['imputer_by_prod_code'],
}
attr2_type = data_params['attr2_type']
attr3_type = data_params['attr3_type']
group_dict = {'use': str_to_bool(data_params['group_features_used'])}
poly_dict = {
    'use': str_to_bool(data_params['poly_features_used']),
    'degree': int(data_params['poly_degree']),
}
spline_dict = {
    'use': str_to_bool(data_params['spline_params_used']),
    'n_knots': int(data_params['spline_n_knots']),
    'degree': int(data_params['spline_degree']),
}
logic_dict = {'use': str_to_bool(data_params['logic_features_used'])}
loading_dict = {'use': str_to_bool(data_params['loading_features_used'])}
scaling_dict = {
    'use': str_to_bool(data_params['scaling_used']),
    'method': data_params['scaling_method'],
}
# Hard-coded here rather than read from the run.
# NOTE(review): confirm this matches the settings used by data_config_search.
fselection_steps = {'Constant_Features': {'frac_constant_values': 0.99}}
stepwise = str_to_bool(data_params['fs_stepwise_used'])
# Kept as logged (a string).
n_features_found = data_params['n_features_used']

# Read model parameters
C = float(model_run.data.params['C'])
max_iter = int(model_run.data.params['max_iter'])

## Create datasets

In [46]:
# Load the raw train / test files, using their `id` column as the index.
train = pd.read_csv('../data/train.csv', index_col='id')
test = pd.read_csv('../data/test.csv', index_col='id')
len_train = len(train)  # number of training rows, i.e. where train ends in the stacked frame

# Stack train (without the target column) on top of test and run the combined
# frame through the transformation steps, configured by the values read from
# the MLflow data run.
data = (
    pd.concat([train.drop(columns=['failure']), test], axis=0)
    .pipe(add_NA_flags, cols=['measurement_3', 'measurement_5'])
    .pipe(
        fill_missing_values,
        params=imputer_params,
        extra_params={'seed': seeds[0], 'n_knn': 3},
    )
    .pipe(
        assign_attr_types,
        attr_types={'attribute_2': attr2_type, 'attribute_3': attr3_type},
    )
    .pipe(add_group_features, group_dict=group_dict)
    .pipe(one_hot_encode)
    .pipe(add_poly, poly_dict=poly_dict)
    .pipe(add_spline, spline_dict=spline_dict)
    .pipe(add_logic_features, logic_dict=logic_dict)
    .pipe(add_loading_features, loading_dict=loading_dict)
    .pipe(scale, scale_dict=scaling_dict)
    .pipe(
        feature_selection,
        y=train['failure'],
        steps=fselection_steps,
        stepwise=stepwise,
        seed=seeds[0],
    )
)

In [48]:
# Visual sanity check: the column count of the re-created frame should equal
# the feature count logged by the original data_config_search run.
print(
    f'N features from data_config_search: {n_features_found}',
    f'N features from rerun: {data.shape[1]}',
    sep='\n',
)

N features from data_config_search: 10
N features from rerun: 10


Split the transformed data back into train and test sets and check that the split looks correct.

In [49]:
# `data` was stacked train-first, so the first `len_train` rows are the
# training rows (re-joined with their target) and the remainder is the test set.
train_ = pd.concat([data.iloc[:len_train], train['failure']], axis=1)
test_ = data.iloc[len_train:]

# Show which product codes occur in each of the raw splits.
for split in (train, test):
    print(split['product_code'].unique())

['A' 'B' 'C' 'D' 'E']
['F' 'G' 'H' 'I']


## Train model

In [50]:
# Target vector for fitting.
y_train = train_['failure']

# Feature matrices: everything except the product code (and, for train, the target).
X_train = train_.drop(columns=['product_code', 'failure'])
X_test = test_.drop(columns=['product_code'])

In [52]:
# Fit the final logistic regression using the `max_iter` and `C` values
# defined earlier in the notebook.
model = LogisticRegression(max_iter=max_iter, C=C)
model.fit(X_train, y_train)

# In-sample ROC AUC. This is a score (higher is better), not an error, and it
# is computed on the same rows the model was fitted on, so it is optimistic.
y_pred_train = model.predict_proba(X_train)[:, 1]
print(f'Train AUROC: {roc_auc_score(y_train, y_pred_train)}')

Train error :0.5936324926123512


## Submit predictions

In [53]:
# Score the test rows and keep the second predict_proba column (index 1).
# Presumably that is the probability of failure == 1 - verify against model.classes_.
test_proba = model.predict_proba(X_test)
y_pred = test_proba[:, 1]

In [54]:
# Two-column submission table: the test-row ids next to their predicted
# probabilities, written without the positional index.
pred_df = pd.DataFrame({
    'id': X_test.index.to_numpy(),
    'failure': y_pred,
})
pred_df.to_csv('../submissions/submission_SPLINE.csv', index=False)