In [1]:
import pandas as pd
from piex import explorer

In [2]:
piex = explorer.PipelineExplorer('ml-pipelines-2018')

In [3]:
df = piex.get_pipelines(data_modality='single_table', task_type='classification')

In [156]:
def get_best(k):
    """Create a selector that keeps the ``k`` lowest-``rank`` rows of a frame.

    Args:
        k: Maximum number of rows the returned selector keeps.

    Returns:
        A one-argument function that sorts the DataFrame it is given by its
        ``rank`` column (ascending) and returns the first ``k`` rows.
    """
    def select_top(group):
        ordered = group.sort_values('rank')
        return ordered.iloc[:k]

    return select_top
    
# Keep, for every dataset, the 10 rows with the lowest `rank` value.
bdf = df.groupby('dataset').apply(get_best(10))

In [157]:
bdf.shape

(2340, 11)

In [158]:
bdf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,data_modality,dataset,metric,name,rank,score,task_type,template,test_id,pipeline
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1491_one_hundred_plants_dataset_TRAIN,1815222,febd5b88-435f-440f-bb44-d7c114a3ce07,single_table,1491_one_hundred_plants_dataset_TRAIN,f1Macro,dfs/categorical_encoder/imputer/standard_scale...,0.22091,0.77909,classification,5bd106fb49e71569e8bf8071,20181025043231171172,dfs/categorical_encoder/imputer/standard_scale...
1491_one_hundred_plants_dataset_TRAIN,1510781,f9aa8d58-3307-4f54-8569-871d4a02c9c6,single_table,1491_one_hundred_plants_dataset_TRAIN,f1Macro,categorical_encoder/imputer/standard_scaler/ra...,0.258131,0.741869,classification,5bd106fb49e71569e8bf806f,20181025042919337776,categorical_encoder/imputer/standard_scaler/ra...
1491_one_hundred_plants_dataset_TRAIN,1684953,3f8be58d-aef5-4d9d-9dc3-9a4e3906eb1b,single_table,1491_one_hundred_plants_dataset_TRAIN,f1Macro,categorical_encoder/imputer/standard_scaler/ra...,0.263567,0.736433,classification,5bd106fb49e71569e8bf806f,20181025042919337776,categorical_encoder/imputer/standard_scaler/ra...
1491_one_hundred_plants_dataset_TRAIN,1673272,50148c3a-3eb4-4846-8e20-83dd9ef2e776,single_table,1491_one_hundred_plants_dataset_TRAIN,f1Macro,categorical_encoder/imputer/standard_scaler/ra...,0.268612,0.731388,classification,5bd106fb49e71569e8bf806f,20181025042919337776,categorical_encoder/imputer/standard_scaler/ra...
1491_one_hundred_plants_dataset_TRAIN,1794742,f576467c-8f08-4cbc-ac3e-a0e609c09b5a,single_table,1491_one_hundred_plants_dataset_TRAIN,f1Macro,dfs/categorical_encoder/imputer/standard_scale...,0.290822,0.709178,classification,5bd106fb49e71569e8bf8071,20181025043231171172,dfs/categorical_encoder/imputer/standard_scale...


In [160]:
from tqdm import tqdm

tqdm.pandas(desc="Progress")

In [None]:
# Fetch the stored pipeline document for every selected `_id`: one
# `piex.get_json('pipelines', ...)` call per row, reported through the tqdm
# progress bar registered with `tqdm.pandas`.
# NOTE(review): the recorded run shows ~2 it/s over 2340 rows, so this cell is
# slow; consider caching the result before re-running the notebook.
best_pipelines = bdf['_id'].progress_apply(lambda i: piex.get_json('pipelines', i))

Progress:  33%|███▎      | 778/2340 [06:27<12:56,  2.01it/s]

In [162]:
len(best_pipelines)

2340

In [23]:
best_pipeline = best_pipelines.iloc[0]

In [163]:
# Turn the fetched pipeline documents into a DataFrame, one row per document.
# Only `.values` is used, so the index of `best_pipelines` is not carried over.
bpdf = pd.DataFrame(list(best_pipelines.values))

In [63]:
def extract_hyperparameters(pdf):
    """Flatten the nested hyperparameters of one pipeline row.

    Args:
        pdf: Row (``pd.Series``) with a ``hyperparameters`` entry mapping
            primitive name -> {hyperparameter name -> value}, and a
            ``template`` entry. The row itself is left untouched.

    Returns:
        pd.Series with one ``<primitive>#<hyperparameter>`` entry per value,
        plus ``hyperparameter_names`` (list of those flattened keys) and
        ``template`` (copied from the row).
    """
    flattened = {
        primitive + '#' + key: value
        for primitive, settings in pdf['hyperparameters'].items()
        for key, value in settings.items()
    }

    # Capture the key list before the two bookkeeping entries are added.
    flattened['hyperparameter_names'] = list(flattened)
    flattened['template'] = pdf['template']

    return pd.Series(flattened)

In [64]:
hpdf = bpdf.apply(extract_hyperparameters, axis=1)

In [114]:
hpdf.head().T

Unnamed: 0,0,1,2,3,4
featuretools.dfs#1#encode,True,True,,,True
featuretools.dfs#1#max_depth,1,1,,,2
featuretools.dfs#1#remove_low_information,True,True,,,False
hyperparameter_names,"[featuretools.dfs#1#encode, featuretools.dfs#1...","[featuretools.dfs#1#encode, featuretools.dfs#1...",[mlprimitives.feature_extraction.CategoricalEn...,[mlprimitives.feature_extraction.CategoricalEn...,"[featuretools.dfs#1#encode, featuretools.dfs#1..."
mlprimitives.feature_extraction.CategoricalEncoder#1#copy,True,True,True,True,True
mlprimitives.feature_extraction.CategoricalEncoder#1#features,auto,auto,auto,auto,auto
mlprimitives.feature_extraction.CategoricalEncoder#1#max_labels,83,48,26,24,65
sklearn.ensemble.RandomForestClassifier#1#class_weight,balanced,,,balanced,balanced
sklearn.ensemble.RandomForestClassifier#1#criterion,entropy,,,entropy,entropy
sklearn.ensemble.RandomForestClassifier#1#max_depth,14,,,23,26


In [122]:
def build_template(hpdf, bpdf, template_id):
    """Derive a template spec from the pipelines that share ``template_id``.

    Hyperparameters that take a single value across the selected rows are
    recorded as fixed ``init_params``; the others become tunable entries
    whose defaults and ranges/values come from the observed data.

    Args:
        hpdf: DataFrame of flattened hyperparameters with a ``template``
            column and one ``<primitive>#<name>`` column per hyperparameter
            (the shape produced by ``extract_hyperparameters``).
        bpdf: DataFrame of pipeline documents. The first row whose
            ``template`` equals ``template_id`` supplies ``init_params``,
            ``tunable_hyperparameters``, ``loader``, ``name``,
            ``input_names``, ``output_names`` and ``primitives``.
        template_id: Value of the ``template`` column to select.

    Returns:
        dict with the keys ``metadata``, ``init_params``, ``input_name``,
        ``output_name``, ``primitives`` and ``tunable_hyperparameters``.
        ``metadata`` and ``init_params`` are copies, so the dicts stored in
        ``bpdf`` are not modified.

    Raises:
        IndexError: if no row of ``bpdf`` matches ``template_id``.
        KeyError: if ``hpdf`` lacks a column for a tunable hyperparameter.
    """
    hpdf = hpdf[hpdf.template == template_id]
    pipeline = bpdf[bpdf['template'] == template_id].iloc[0]

    # Copy per primitive so the constants added below never leak back into
    # the dicts held by ``bpdf``.
    init_parameters = {
        primitive: dict(params)
        for primitive, params in pipeline['init_params'].items()
    }
    hyperparameters = dict()
    for primitive, parameter in pipeline['tunable_hyperparameters'].items():
        primitive_hyperparameters = dict()
        hyperparameters[primitive] = primitive_hyperparameters

        for name, spec in parameter.items():
            key = primitive + '#' + name
            param_type = spec['type']
            values = hpdf[key]

            if len(values.unique()) == 1:
                # Constant across every observed pipeline: pin it as an init
                # param. ``setdefault`` makes sure the value is kept even if
                # the primitive had no init params to begin with.
                primitive_init_parameters = init_parameters.setdefault(primitive, dict())
                primitive_init_parameters[name] = values.mode()[0]

            elif param_type in ('bool', 'str'):
                hyperparameter = {
                    'type': param_type,
                    'default': values.mode()[0]
                }
                if param_type != 'bool':
                    hyperparameter['values'] = list(values.unique())

                primitive_hyperparameters[name] = hyperparameter

            elif param_type == 'float':
                # Widen the observed interval by one standard deviation, but
                # never beyond the range declared in the original spec.
                std = values.std()
                primitive_hyperparameters[name] = {
                    'type': param_type,
                    'range': [
                        max(values.min() - std, spec['range'][0]),
                        min(values.max() + std, spec['range'][1])
                    ],
                    'default': values.mean()
                }

            elif param_type == 'int':
                primitive_hyperparameters[name] = {
                    'type': param_type,
                    'range': [values.min(), values.max()],
                    'default': int(values.mean())
                }

    # Copy before adding ``name`` so the ``loader`` dict in ``bpdf`` is untouched.
    metadata = dict(pipeline['loader'])
    metadata['name'] = pipeline['name']
    return {
        'metadata': metadata,
        'init_params': init_parameters,
        'input_name': pipeline['input_names'],
        'output_name': pipeline['output_names'],
        'primitives': pipeline['primitives'],
        'tunable_hyperparameters': hyperparameters
    }

template = build_template(hpdf, bpdf, '5bd106fb49e71569e8bf8071')

In [123]:
def within_ranges(pipeline, template):
    """Tell whether a pipeline's hyperparameters fit inside a template.

    For every primitive listed in ``template['tunable_hyperparameters']``,
    each hyperparameter of the pipeline is checked as follows:

    * tunable ``str``: the value must be one of the template ``values``;
    * tunable ``int``/``float``: the value must lie inside the inclusive
      template ``range``;
    * other tunable types: not checked;
    * not tunable but present in the template ``init_params``: the value
      must equal the init param;
    * anything else: not checked.

    Args:
        pipeline: Mapping with a ``hyperparameters`` entry
            (primitive -> {name -> value}).
        template: Mapping with ``init_params`` and
            ``tunable_hyperparameters`` entries.

    Returns:
        bool: False at the first violation found, True otherwise.
    """
    observed = pipeline['hyperparameters']
    fixed_by_primitive = template['init_params']

    for primitive, specs in template['tunable_hyperparameters'].items():
        fixed = fixed_by_primitive.get(primitive, dict())

        for name, value in observed[primitive].items():
            spec = specs.get(name)

            if not spec:
                # Not tunable: only constrained when the template pins it.
                if name in fixed and fixed[name] != value:
                    return False
                continue

            kind = spec['type']
            if kind == 'str':
                if value not in spec['values']:
                    return False
            elif kind in ('int', 'float'):
                bounds = spec['range']
                if not (bounds[0] <= value <= bounds[1]):
                    return False

    return True

# Sanity check: a pipeline must fall inside the template built from data that
# includes it. `best_pipelines` carries the group-by index of `bdf` rather
# than a positional one, so take the first entry explicitly with `.iloc`
# instead of relying on `[0]` falling back to a positional lookup.
pipeline = best_pipelines.iloc[0]
template = build_template(hpdf, bpdf, pipeline['template'])
within_ranges(pipeline, template)

True

In [164]:
def split_datasets(df, random_state=None):
    """Randomly split a frame in two so that no dataset spans both parts.

    The distinct values of ``df.dataset`` are shuffled and cut in the middle;
    each row goes to the part that owns its dataset. With an odd number of
    datasets the second part receives the extra one.

    Args:
        df: DataFrame with a ``dataset`` column.
        random_state: Optional seed (or random state) forwarded to
            ``Series.sample``. Pass a fixed value to make the split
            reproducible; the default ``None`` keeps the unseeded behavior.

    Returns:
        tuple: ``(first_half, second_half)`` DataFrames holding the rows of
        ``df`` in their original order.
    """
    datasets = pd.Series(df.dataset.unique())
    shuffled = datasets.sample(n=len(datasets), random_state=random_state)
    half = len(shuffled) // 2

    # ``.iloc`` makes it explicit that the cut is positional: the shuffled
    # Series still carries its original integer labels.
    first_half = df[df.dataset.isin(shuffled.iloc[:half])]
    second_half = df[df.dataset.isin(shuffled.iloc[half:])]

    return first_half, second_half

bpdf_1, bpdf_2 = split_datasets(bpdf)

In [165]:
bpdf_1.shape, bpdf_2.shape

((1170, 17), (1170, 17))

In [166]:
hpdf_1 = bpdf_1.apply(extract_hyperparameters, axis=1)

In [167]:
hpdf_2 = bpdf_2.apply(extract_hyperparameters, axis=1)

In [168]:
hpdf_1.shape, hpdf_2.shape

((1170, 28), (1170, 28))

In [169]:
template_ids = hpdf_1.template.unique()

In [170]:
# Build one template per template id found in the first half of the split
# (`hpdf_1` / `bpdf_1`), keyed by that id.
templates = dict()
for template_id in template_ids:
    templates[template_id] = build_template(hpdf_1, bpdf_1, template_id)

In [171]:
len(template_ids)

4

In [193]:
import numpy as np

def pipelines_within_ranges(pipelines):
    """Check every row of ``pipelines`` against its own template.

    Each row is converted to a dict and passed to ``within_ranges`` together
    with the entry of the module-level ``templates`` dict selected by the
    row's ``template`` value.

    Args:
        pipelines: DataFrame of pipeline documents.

    Returns:
        numpy array with one ``within_ranges`` result per row, in row order.
    """
    records = pipelines.to_dict(orient='records')
    flags = [
        within_ranges(record, templates[record['template']])
        for record in records
    ]
    return np.array(flags)

def within_ranges_ratio(pipelines):
    """Return the mean of the per-row ``pipelines_within_ranges`` flags."""
    flags = pipelines_within_ranges(pipelines)
    return flags.mean()

def any_within_ranges(pipelines):
    """Return whether at least one ``pipelines_within_ranges`` flag is true."""
    flags = pipelines_within_ranges(pipelines)
    return flags.any()

def all_within_ranges(pipelines):
    """Return whether every ``pipelines_within_ranges`` flag is true."""
    flags = pipelines_within_ranges(pipelines)
    return flags.all()

def best_within_ranges(pipelines):
    """Check the lowest-``rank`` row of ``pipelines`` against its template.

    The frame is sorted by ``rank`` and its first row is passed to
    ``within_ranges`` with the matching entry of the module-level
    ``templates`` dict.

    Returns:
        The ``within_ranges`` result for that row.
    """
    ranked = pipelines.sort_values('rank')
    top = ranked.iloc[0]
    top_template = templates[top['template']]
    return within_ranges(top, top_template)

In [194]:
# Per-dataset summaries of how the pipelines in `bpdf_2` fit the templates:
# each statistic applies one of the `*_within_ranges*` helpers to every
# `dataset` group.
within_stats = {
    stat_name: bpdf_2.groupby('dataset').apply(stat_function)
    for stat_name, stat_function in (
        ('any_within', any_within_ranges),
        ('all_within', all_within_ranges),
        ('within_ratio', within_ranges_ratio),
        ('best_within', best_within_ranges),
    )
}

In [197]:
wdf = pd.DataFrame(within_stats)

In [200]:
wdf.head()

Unnamed: 0_level_0,any_within,all_within,within_ratio,best_within
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
185_baseball_dataset_TRAIN,True,True,1.0,True
27_ws_dataset_TRAIN,True,True,1.0,True
313_spectrometer_dataset_TRAIN,True,True,1.0,True
38_sick_dataset_TRAIN,True,True,1.0,True
57_hd_dataset_TRAIN,True,True,1.0,True


In [199]:
wdf.mean()

any_within      1.000000
all_within      0.905983
within_ratio    0.988889
best_within     0.974359
dtype: float64