In [1]:
# hide
# default_exp tests.core.test_compose
from nbdev.showdoc import *
from dsblocks.utils.nbdev_utils import nbdev_setup, TestRunner

# Notebook bootstrap; both names are imported from dsblocks.utils.nbdev_utils above.
# NOTE(review): what nbdev_setup configures is not visible in this notebook - confirm in nbdev_utils.
nbdev_setup ()
# Runner used by the `tst.run (..., tag='dummy')` cells of this notebook;
# 'dummy' is the only target it is created with.
tst = TestRunner (targets=['dummy'])

# Test compose

In [2]:
# export
import pytest
import os
import joblib
from IPython.display import display
import pandas as pd
import numpy as np
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.utils import Bunch
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import KFold

from dsblocks.core.compose import *
from dsblocks.core.components import Component, PandasComponent, PickleSaverComponent
from dsblocks.core.utils import PickleIO
from dsblocks.utils.utils import remove_previous_results
from dsblocks.core.data_conversion import DataConverter, PandasConverter

import dsblocks.config.bt_defaults as dflt
from dsblocks.utils.utils import check_last_part

In [3]:
#export
@pytest.fixture (name='column_transformer_data')
def column_transformer_data_fixture():
    """Pytest fixture registered as `column_transformer_data`.

    Returns whatever `column_transformer_data ()` produces; that function is
    not defined in this notebook (presumably it comes from the star import of
    `dsblocks.core.compose` - confirm).
    """
    data = column_transformer_data()
    return data

@pytest.fixture (name='multi_split_data')
def multi_split_data_fixture():
    """Pytest fixture registered as `multi_split_data`.

    Returns whatever `multi_split_data ()` produces; that function is not
    defined in this notebook (presumably it comes from the star import of
    `dsblocks.core.compose` - confirm).
    """
    data = multi_split_data()
    return data

## Parallel

### `find_last_fitted_model`

#### Using Sequential

##### First test / example

In [4]:
# export
from dsblocks.utils.dummies import make_pipe_fit1

def test_pipeline_find_last_fitted_model_seq_others ():
    """Check that a sequential pipeline resumes from partially fitted components.

    A reference pipeline is fitted end-to-end on `X`. Then, in three scenarios,
    a second pipeline built with `path_results` has only its first components
    fitted by hand (A; A and B; A, B and C). In every scenario
    `find_last_fitted_model ()` must report that not all components are fitted,
    after which `raise_error` is enabled on some of the hand-fitted components
    and `fit_apply (None)` is called on the whole pipeline:

    - case 1 (A only): the call is expected to raise `RuntimeError`.
    - case 2 (A, B): the result must equal the reference result.
    - case 3 (A, B, C): same, after replacing the estimators of B and C with
      empty `Bunch` objects.

    The results directory is removed between scenarios and, via `finally`, also
    when a scenario fails, so a broken run leaves nothing behind on disk.
    """
    # Directory name matches the test name so it cannot be confused with
    # the results of another test.
    path_results = 'test_pipeline_find_last_fitted_model_seq_others'
    remove_previous_results (path_results=path_results)

    try:
        # reference run: every component fitted in one go
        pipe1 = make_pipe_fit1 ()
        X = np.array([1,2,3]).reshape(-1,1)
        r1 = pipe1.fit_apply (X)

        # case 1: component A
        pipe2 = make_pipe_fit1 (path_results=path_results, verbose=2)
        pipe2.A.fit_apply (X)
        assert not pipe2.find_last_fitted_model ()
        # NOTE(review): presumably raise_error makes the component fail if it
        # is executed again - confirm in dsblocks.utils.dummies.
        pipe2.A.raise_error = True
        with pytest.raises (RuntimeError):
            pipe2.fit_apply (None)
        remove_previous_results (path_results=path_results)

        # case 2: component B
        pipe2 = make_pipe_fit1 (path_results=path_results, verbose=2)
        r = pipe2.A.fit_apply (X)
        pipe2.B.fit_apply (r)
        assert not pipe2.find_last_fitted_model ()
        pipe2.A.raise_error = True
        r2 = pipe2.fit_apply (None)
        assert (r1==r2).all()
        remove_previous_results (path_results=path_results)

        # case 3: component C
        pipe2 = make_pipe_fit1 (path_results=path_results, verbose=2)
        r = pipe2.A.fit_apply (X)
        r = pipe2.B.fit_apply (r)
        pipe2.C.fit_apply (r)
        assert not pipe2.find_last_fitted_model ()
        pipe2.A.raise_error = True
        pipe2.B.raise_error = True
        # Drop the in-memory estimators of B and C before running the pipeline again.
        pipe2.B.estimator = Bunch ()
        pipe2.C.estimator = Bunch ()
        r2 = pipe2.fit_apply (None)
        assert (r1==r2).all()
    finally:
        # Runs on success and on failure alike.
        remove_previous_results (path_results=path_results)

In [5]:
# Execute the test through the notebook's TestRunner under the 'dummy' tag.
tst.run (test_pipeline_find_last_fitted_model_seq_others, tag='dummy')

##### Second test / example

#### Using Parallel

In [6]:
# export
from dsblocks.utils.dummies import make_pipe_fit2

def _resolve_component (pipe, dotted_name):
    """Follow a dotted attribute path such as 'obj.B3a', starting from `pipe`."""
    target = pipe
    for attribute in dotted_name.split ('.'):
        target = getattr (target, attribute)
    return target


def _fit_apply_chain (pipe, X, dotted_names):
    """Call `fit_apply` on each named component in turn, feeding each output to the next."""
    result = X
    for dotted_name in dotted_names:
        result = _resolve_component (pipe, dotted_name).fit_apply (result)
    return result


def _check_resume_from_partial_fit (path_results, X, expected, *, trunk, branches=(),
                                    raise_error=(), recreate=(), all_fitted=False,
                                    **pipe_kwargs):
    """Fit part of a fresh `make_pipe_fit2` pipeline by hand, then run it as a whole.

    Parameters
    ----------
    path_results : str
        Results directory given to the pipeline; removed when the check ends,
        whether it passed or failed.
    X : array
        Input of the first `trunk` component.
    expected : array
        Result that `pipe.fit_apply (None)` must reproduce element-wise.
    trunk : sequence of str
        Dotted component paths fitted one after another, starting from `X`.
    branches : sequence of sequences of str
        Chains of dotted component paths; each chain is fitted starting from
        the output of `trunk`.
    raise_error : sequence of str
        Components on which `raise_error = True` is set, in the given order.
    recreate : sequence of str
        Components on which `create_estimator ()` is called, in the given order.
    all_fitted : bool
        Expected truth value of `pipe.find_last_fitted_model ()`.
    **pipe_kwargs
        Extra keyword arguments forwarded to `make_pipe_fit2`.

    Returns
    -------
    The pipeline that was built, so the caller can keep using its logger.
    """
    pipe = make_pipe_fit2 (path_results=path_results, verbose=2, root=True, **pipe_kwargs)
    try:
        trunk_output = _fit_apply_chain (pipe, X, trunk)
        for branch in branches:
            _fit_apply_chain (pipe, trunk_output, branch)

        assert bool (pipe.find_last_fitted_model ()) == all_fitted

        # NOTE(review): presumably raise_error makes a component fail if it is
        # executed again - confirm in dsblocks.utils.dummies.
        for dotted_name in raise_error:
            _resolve_component (pipe, dotted_name).raise_error = True
        for dotted_name in recreate:
            _resolve_component (pipe, dotted_name).create_estimator ()

        pipe.logger.info (f'\n{"-"*100}\n')
        result = pipe.fit_apply (None)
        assert (expected==result).all()
    finally:
        # Runs on success and on failure alike.
        remove_previous_results (path_results=path_results)
    return pipe


def test_pipeline_find_last_fitted_model_parallel_2 ():
    """Check that a pipeline with parallel branches resumes from partially fitted components.

    A reference pipeline is fitted end-to-end on `X`. Five scenarios then
    fit a growing part of a second pipeline by hand, enable `raise_error`
    and call `create_estimator ()` on selected components, and require that
    `fit_apply (None)` on the whole pipeline reproduces the reference result:

    1. A0, A1 and every B component fitted one by one.
    2. as 1, but the B4 components are fitted through `obj.pipeline_1_1`.
    3. A0, A1 and `obj.parallel` fitted as a whole.
    4. pipeline built with `new_parallel=True`; A0, A1, `obj.new_parallel` and C fitted.
    5. as 4, plus D; the only scenario where `find_last_fitted_model ()` must be truthy.
    """
    path_results = 'test_pipeline_find_last_fitted_model_parallel_2'
    remove_previous_results (path_results=path_results)

    # reference run: every component fitted in one go
    pipe1 = make_pipe_fit2 ()
    X = np.array([1,2,3]).reshape(-1,1)
    r1 = pipe1.fit_apply (X)

    b3_chain = ['obj.B3a', 'obj.B3b', 'obj.B3c', 'obj.B3d']
    b4_chain = ['obj.B4a', 'obj.B4b', 'obj.B4c', 'obj.B4d', 'obj.B4e']

    # raise_error targets when the B components were fitted one by one
    some_raise = ['A0', 'A1',
                  'obj.B3a', 'obj.B3b', 'obj.B3c',
                  'obj.B4a', 'obj.B4b', 'obj.B4c', 'obj.B4d']
    # raise_error targets when the parallel block was fitted as a whole
    every_raise = ['A0', 'A1', 'obj.B1', 'obj.B2', *b3_chain, *b4_chain, 'obj.B5']
    # components whose estimator is re-created in every scenario
    recreate = ['A1', 'obj.B2', 'obj.B3a', 'obj.B3b',
                'obj.B4b', 'obj.B4c', 'obj.B4e', 'obj.B5']

    scenarios = [
        dict (trunk=['A0', 'A1'],
              branches=[['obj.B1'], ['obj.B2'], b3_chain, b4_chain, ['obj.B5']],
              raise_error=some_raise, recreate=recreate),
        dict (trunk=['A0', 'A1'],
              branches=[['obj.B1'], ['obj.B2'], b3_chain, ['obj.pipeline_1_1'], ['obj.B5']],
              raise_error=some_raise, recreate=recreate),
        dict (trunk=['A0', 'A1', 'obj.parallel'],
              raise_error=every_raise, recreate=recreate),
        dict (trunk=['A0', 'A1', 'obj.new_parallel', 'C'],
              raise_error=every_raise + ['new_parallel'], recreate=recreate,
              new_parallel=True),
        dict (trunk=['A0', 'A1', 'obj.new_parallel', 'C', 'D'],
              raise_error=every_raise + ['new_parallel', 'C'], recreate=recreate + ['D'],
              all_fitted=True, new_parallel=True),
    ]

    pipe2 = None
    for scenario in scenarios:
        if pipe2 is not None:
            # separator between scenarios, written with the previous pipeline's logger
            pipe2.logger.info (f'\n{"*"*100}\n{"*"*100}\n')
        pipe2 = _check_resume_from_partial_fit (path_results, X, r1, **scenario)

In [7]:
# Execute the test through the notebook's TestRunner under the 'dummy' tag.
tst.run (test_pipeline_find_last_fitted_model_parallel_2, tag='dummy')

## Data conversion

In [24]:
#export
from dsblocks.utils.dummies import (DataSource, SumXY, MaxOfPositiveWithSeparateLabels, Sum1direct,
                                       Multiply10direct, subtract_xy, MinOfPositiveWithoutSeparateLabels)
def test_data_conversion_sequential_parallel_column_transformer ():
    """Smoke-test a Sequential pipeline mixing Parallel, a column transformer and data converters.

    The pipeline is built, run once with `fit_apply ()` and its result is
    printed; no assertion is made on the values.
    """
    def attach_label (X):
        # X must unpack into exactly two items; the second one is stored as a
        # 'label' column on a copy of the first.
        frame, target = X
        frame = frame.copy()
        frame['label'] = target
        return frame

    # Converter that merges the separately passed label into the data
    # before fitting and before transforming.
    class MinDC (PandasConverter):
        def convert_before_fitting (self, *X):
            return (attach_label (X),)

        def convert_before_transforming (self, *X, **kwargs):
            return super().convert_before_transforming (attach_label (X))

    generic = 'GenericConverter'

    # Steps are listed (and therefore constructed) in execution order.
    steps = [
        DataSource (convert_after=lambda x: (x[0],x[1],np.array(x[2]))),
        SumXY (data_converter=generic),
        Component (apply=lambda X: X*2, data_converter=generic),
        MaxOfPositiveWithSeparateLabels (data_converter=generic),
        Parallel (Sum1direct (data_converter=generic),
                  Multiply10direct (data_converter=generic),
                  finalize_result=lambda x: (x[0][0], x[1][0]),
                  data_converter=generic),
        Component (apply=subtract_xy, data_converter=generic),
        MinOfPositiveWithoutSeparateLabels (data_converter=MinDC),
        make_column_transformer ((Multiply10direct (data_converter=generic), ['a','b']),
                                 (Sum1direct (data_converter=generic), ['c','d'])),
        Sum1direct (data_converter=generic),
    ]
    pipe = Sequential (*steps)

    result = pipe.fit_apply()

    print (result)

In [25]:
# Execute the test through the notebook's TestRunner under the 'dummy' tag.
tst.run (test_data_conversion_sequential_parallel_column_transformer, tag='dummy')

running test_data_conversion_sequential_parallel_column_transformer
        a       b      c       d
0  -80349 -211029 -32261 -104135
1 -116349 -247029 -52061 -123935
2 -152349 -283029 -71861 -143735
3 -188349 -319029 -91661 -163535
